LCOV - code coverage report
Current view: top level - ASM_AVX2 - jnt_convolve_2d_avx2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 1141 1672 68.2 %
Date: 2019-11-25 17:38:06 Functions: 37 55 67.3 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <immintrin.h>
      13             : #include "aom_dsp_rtcd.h"
      14             : #include "convolve.h"
      15             : #include "convolve_avx2.h"
      16             : #include "EbDefinitions.h"
      17             : #include "EbMemory_SSE4_1.h"
      18             : 
      19           0 : static INLINE __m128i jnt_2d_comp_avg_round_4_sse2(const __m128i src) {
      20           0 :     const __m128i round = _mm_set1_epi32(1 << (COMPOUND_ROUND1_BITS - 1));
      21           0 :     const __m128i dst = _mm_add_epi32(src, round);
      22           0 :     const __m128i d = _mm_srai_epi32(dst, COMPOUND_ROUND1_BITS);
      23           0 :     return _mm_packs_epi32(d, d);
      24             : }
      25             : 
      26           0 : static INLINE __m128i jnt_2d_comp_avg_round_half_pel_sse2(const __m128i src) {
      27           0 :     const __m128i round = _mm_set1_epi16(1);
      28           0 :     const __m128i dst = _mm_add_epi16(src, round);
      29           0 :     return _mm_srai_epi16(dst, 1);
      30             : }
      31             : 
      32           0 : static INLINE __m128i jnt_2d_comp_avg_round_4x2_sse2(const __m128i src[2]) {
      33           0 :     const __m128i round = _mm_set1_epi32(1 << (COMPOUND_ROUND1_BITS - 1));
      34           0 :     const __m128i dst0 = _mm_add_epi32(src[0], round);
      35           0 :     const __m128i dst1 = _mm_add_epi32(src[1], round);
      36           0 :     const __m128i d0 = _mm_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
      37           0 :     const __m128i d1 = _mm_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
      38           0 :     return _mm_packs_epi32(d0, d1);
      39             : }
      40             : 
      41     1276390 : static INLINE __m256i jnt_2d_comp_avg_round_8_avx2(const __m256i src) {
      42     1276390 :     const __m256i round = _mm256_set1_epi32(1 << (COMPOUND_ROUND1_BITS - 1));
      43     1276390 :     const __m256i dst = _mm256_add_epi32(src, round);
      44     1276390 :     const __m256i d = _mm256_srai_epi32(dst, COMPOUND_ROUND1_BITS);
      45     1276390 :     return _mm256_packs_epi32(d, d);
      46             : }
      47             : 
      48   464184000 : static INLINE __m256i jnt_2d_comp_avg_round_8x2_avx2(const __m256i src[2]) {
      49   464184000 :     const __m256i round = _mm256_set1_epi32(1 << (COMPOUND_ROUND1_BITS - 1));
      50   464184000 :     const __m256i dst0 = _mm256_add_epi32(src[0], round);
      51   928367000 :     const __m256i dst1 = _mm256_add_epi32(src[1], round);
      52   464184000 :     const __m256i d0 = _mm256_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
      53   464184000 :     const __m256i d1 = _mm256_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
      54   464184000 :     return _mm256_packs_epi32(d0, d1);
      55             : }
      56             : 
      57    65680700 : static INLINE __m256i jnt_2d_comp_avg_round_half_pel_avx2(const __m256i src) {
      58    65680700 :     const __m256i round = _mm256_set1_epi16(1);
      59    65680700 :     const __m256i dst = _mm256_add_epi16(src, round);
      60    65680700 :     return _mm256_srai_epi16(dst, 1);
      61             : }
      62             : 
      63           0 : static INLINE void jnt_2d_comp_avg_round_store_2x2_sse2(
      64             :     const __m128i res, const __m128i factor, const __m128i offset,
      65             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
      66             :     const int32_t dst8_stride) {
      67           0 :     const __m128i r = jnt_2d_comp_avg_round_4_sse2(res);
      68             :     __m128i d;
      69             : 
      70           0 :     d = load_u16_2x2_sse4_1(dst, dst_stride);
      71           0 :     d = _mm_unpacklo_epi16(d, r);
      72           0 :     d = _mm_madd_epi16(d, factor);
      73           0 :     d = _mm_add_epi32(d, offset);
      74           0 :     d = _mm_srai_epi32(d, 8);
      75           0 :     d = _mm_packs_epi32(d, d);
      76           0 :     pack_store_2x2_sse2(d, dst8, dst8_stride);
      77           0 : }
      78             : 
      79           0 : static INLINE void jnt_2d_comp_avg_round_store_half_pel_2x2_sse2(
      80             :     const __m128i res, const __m128i factor, const __m128i offset,
      81             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
      82             :     const int32_t dst8_stride) {
      83           0 :     const __m128i r = jnt_2d_comp_avg_round_half_pel_sse2(res);
      84             :     __m128i d;
      85             : 
      86           0 :     d = load_u16_2x2_sse4_1(dst, dst_stride);
      87           0 :     d = _mm_unpacklo_epi16(d, r);
      88           0 :     d = _mm_madd_epi16(d, factor);
      89           0 :     d = _mm_add_epi32(d, offset);
      90           0 :     d = _mm_srai_epi32(d, 8);
      91           0 :     d = _mm_packs_epi32(d, d);
      92           0 :     pack_store_2x2_sse2(d, dst8, dst8_stride);
      93           0 : }
      94             : 
      95           0 : static INLINE void jnt_2d_comp_avg_round_store_4x2_sse2(
      96             :     const __m128i res[2], const __m128i factor, const __m128i offset,
      97             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
      98             :     const int32_t dst8_stride) {
      99           0 :     const __m128i r = jnt_2d_comp_avg_round_4x2_sse2(res);
     100           0 :     const __m128i d = load_u16_4x2_sse2(dst, dst_stride);
     101             :     __m128i dd[2];
     102             : 
     103           0 :     dd[0] = _mm_unpacklo_epi16(d, r);
     104           0 :     dd[1] = _mm_unpackhi_epi16(d, r);
     105           0 :     dd[0] = _mm_madd_epi16(dd[0], factor);
     106           0 :     dd[1] = _mm_madd_epi16(dd[1], factor);
     107           0 :     dd[0] = _mm_add_epi32(dd[0], offset);
     108           0 :     dd[1] = _mm_add_epi32(dd[1], offset);
     109           0 :     dd[0] = _mm_srai_epi32(dd[0], 8);
     110           0 :     dd[1] = _mm_srai_epi32(dd[1], 8);
     111           0 :     dd[0] = _mm_packs_epi32(dd[0], dd[1]);
     112           0 :     pack_store_4x2_sse2(dd[0], dst8, dst8_stride);
     113           0 : }
     114             : 
     115           0 : static INLINE void jnt_2d_comp_avg_round_store_half_pel_4x2_sse2(
     116             :     const __m128i res, const __m128i factor, const __m128i offset,
     117             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
     118             :     const int32_t dst8_stride) {
     119           0 :     const __m128i r = jnt_2d_comp_avg_round_half_pel_sse2(res);
     120           0 :     const __m128i d = load_u16_4x2_sse2(dst, dst_stride);
     121             :     __m128i dd[2];
     122             : 
     123           0 :     dd[0] = _mm_unpacklo_epi16(d, r);
     124           0 :     dd[1] = _mm_unpackhi_epi16(d, r);
     125           0 :     dd[0] = _mm_madd_epi16(dd[0], factor);
     126           0 :     dd[1] = _mm_madd_epi16(dd[1], factor);
     127           0 :     dd[0] = _mm_add_epi32(dd[0], offset);
     128           0 :     dd[1] = _mm_add_epi32(dd[1], offset);
     129           0 :     dd[0] = _mm_srai_epi32(dd[0], 8);
     130           0 :     dd[1] = _mm_srai_epi32(dd[1], 8);
     131           0 :     dd[0] = _mm_packs_epi32(dd[0], dd[1]);
     132           0 :     pack_store_4x2_sse2(dd[0], dst8, dst8_stride);
     133           0 : }
     134             : 
     135     1276390 : static INLINE __m256i jnt_2d_comp_avg_round_pack_8_avx2(const __m256i res,
     136             :     const __m256i factor,
     137             :     const __m256i offset,
     138             :     const __m256i dst) {
     139     1276390 :     const __m256i r = jnt_2d_comp_avg_round_8_avx2(res);
     140             :     __m256i d[2];
     141             : 
     142     1276380 :     d[0] = _mm256_unpacklo_epi16(dst, r);
     143     1276380 :     d[0] = _mm256_madd_epi16(d[0], factor);
     144     1276380 :     d[0] = _mm256_add_epi32(d[0], offset);
     145     1276380 :     d[0] = _mm256_srai_epi32(d[0], 8);
     146     2552770 :     return _mm256_packs_epi32(d[0], d[0]);
     147             : }
     148             : 
     149   464298000 : static INLINE __m256i jnt_2d_comp_avg_round_pack_16_avx2(const __m256i res[2],
     150             :     const __m256i factor,
     151             :     const __m256i offset,
     152             :     const __m256i dst) {
     153   464298000 :     const __m256i r = jnt_2d_comp_avg_round_8x2_avx2(res);
     154             :     __m256i d[2];
     155             : 
     156   464075000 :     d[0] = _mm256_unpacklo_epi16(dst, r);
     157   464075000 :     d[1] = _mm256_unpackhi_epi16(dst, r);
     158   464075000 :     d[0] = _mm256_madd_epi16(d[0], factor);
     159   464075000 :     d[1] = _mm256_madd_epi16(d[1], factor);
     160   464075000 :     d[0] = _mm256_add_epi32(d[0], offset);
     161   464075000 :     d[1] = _mm256_add_epi32(d[1], offset);
     162   464075000 :     d[0] = _mm256_srai_epi32(d[0], 8);
     163   464075000 :     d[1] = _mm256_srai_epi32(d[1], 8);
     164   928150000 :     return _mm256_packs_epi32(d[0], d[1]);
     165             : }
     166             : 
     167    65691100 : static INLINE __m256i jnt_2d_comp_avg_round_pack_half_pel_avx2(
     168             :     const __m256i res, const __m256i factor, const __m256i offset,
     169             :     const __m256i dst) {
     170    65691100 :     const __m256i r = jnt_2d_comp_avg_round_half_pel_avx2(res);
     171             :     __m256i d[2];
     172             : 
     173    65704700 :     d[0] = _mm256_unpacklo_epi16(dst, r);
     174    65704700 :     d[1] = _mm256_unpackhi_epi16(dst, r);
     175    65704700 :     d[0] = _mm256_madd_epi16(d[0], factor);
     176    65704700 :     d[1] = _mm256_madd_epi16(d[1], factor);
     177    65704700 :     d[0] = _mm256_add_epi32(d[0], offset);
     178    65704700 :     d[1] = _mm256_add_epi32(d[1], offset);
     179    65704700 :     d[0] = _mm256_srai_epi32(d[0], 8);
     180    65704700 :     d[1] = _mm256_srai_epi32(d[1], 8);
     181   131409000 :     return _mm256_packs_epi32(d[0], d[1]);
     182             : }
     183             : 
     184     1276390 : static INLINE void jnt_2d_comp_avg_round_store_4x2_avx2(
     185             :     const __m256i res, const __m256i factor, const __m256i offset,
     186             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
     187             :     const int32_t dst8_stride) {
     188             :     __m128i d_128[2];
     189             :     __m256i d;
     190             : 
     191     1276390 :     d_128[0] = _mm_loadl_epi64((__m128i *)(dst));
     192     1276390 :     d_128[1] = _mm_loadl_epi64((__m128i *)(dst + dst_stride));
     193     1276390 :     d = _mm256_setr_m128i(d_128[0], d_128[1]);
     194     1276390 :     d = loadu_u16_8x2_avx2(dst, dst_stride);
     195     1276390 :     d = jnt_2d_comp_avg_round_pack_8_avx2(res, factor, offset, d);
     196     1276390 :     d = _mm256_packus_epi16(d, d);
     197     1276390 :     const __m128i d0 = _mm256_castsi256_si128(d);
     198     1276390 :     const __m128i d1 = _mm256_extracti128_si256(d, 1);
     199     1276390 :     _mm_storel_epi64((__m128i *)dst8, d0);
     200     1276390 :     _mm_storel_epi64((__m128i *)(dst8 + dst8_stride), d1);
     201     1276390 : }
     202             : 
     203    65555300 : static INLINE void jnt_2d_comp_avg_round_store_8x2_avx2(
     204             :     const __m256i res[2], const __m256i factor, const __m256i offset,
     205             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
     206             :     const int32_t dst8_stride) {
     207    65555300 :     const __m256i d = loadu_u16_8x2_avx2(dst, dst_stride);
     208             :     const __m256i dd =
     209    65580700 :         jnt_2d_comp_avg_round_pack_16_avx2(res, factor, offset, d);
     210    65660300 :     pack_store_8x2_avx2(dd, dst8, dst8_stride);
     211    65626700 : }
     212             : 
     213     8005510 : static INLINE void jnt_2d_comp_avg_round_store_half_pel_8x2_avx2(
     214             :     const __m256i res, const __m256i factor, const __m256i offset,
     215             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
     216             :     const int32_t dst8_stride) {
     217     8005510 :     const __m256i d = loadu_u16_8x2_avx2(dst, dst_stride);
     218             :     const __m256i dd =
     219     8005540 :         jnt_2d_comp_avg_round_pack_half_pel_avx2(res, factor, offset, d);
     220     8006830 :     pack_store_8x2_avx2(dd, dst8, dst8_stride);
     221     8006380 : }
     222             : 
     223    59794100 : static INLINE void jnt_2d_comp_avg_round_store_16x2_avx2(
     224             :     const __m256i res[4], const __m256i factor, const __m256i offset,
     225             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
     226             :     const int32_t dst8_stride) {
     227             :     __m256i d[2];
     228             : 
     229    59794100 :     d[0] = _mm256_loadu_si256((__m256i *)dst);
     230    59794100 :     d[1] = _mm256_loadu_si256((__m256i *)(dst + dst_stride));
     231    59794100 :     d[0] = jnt_2d_comp_avg_round_pack_16_avx2(res + 0, factor, offset, d[0]);
     232    59884900 :     d[1] = jnt_2d_comp_avg_round_pack_16_avx2(res + 2, factor, offset, d[1]);
     233    59869000 :     xy_y_pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
     234    59862700 : }
     235             : 
     236     8432120 : static INLINE void jnt_2d_comp_avg_round_store_half_pel_16x2_avx2(
     237             :     const __m256i res[2], const __m256i factor, const __m256i offset,
     238             :     const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
     239             :     const int32_t dst8_stride) {
     240             :     __m256i d[2];
     241             : 
     242     8432120 :     d[0] = _mm256_loadu_si256((__m256i *)dst);
     243     8432120 :     d[1] = _mm256_loadu_si256((__m256i *)(dst + dst_stride));
     244     8432980 :     d[0] =
     245     8432120 :         jnt_2d_comp_avg_round_pack_half_pel_avx2(res[0], factor, offset, d[0]);
     246     8433000 :     d[1] =
     247     8432980 :         jnt_2d_comp_avg_round_pack_half_pel_avx2(res[1], factor, offset, d[1]);
     248     8433000 :     xy_y_pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
     249     8432240 : }
     250             : 
     251             : SIMD_INLINE void jnt_2d_comp_avg_round_store_32_avx2(
     252             :     const __m256i r0[2], const __m256i r1[2], const __m256i factor,
     253             :     const __m256i offset, const ConvBufType *const dst, uint8_t *const dst8) {
     254             :     __m256i d[2];
     255             : 
     256    28275000 :     d[0] = loadu_u16_8x2_avx2(dst, 16);
     257   141809000 :     d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
     258   141824000 :     d[0] = jnt_2d_comp_avg_round_pack_16_avx2(r0, factor, offset, d[0]);
     259   141868000 :     d[1] = jnt_2d_comp_avg_round_pack_16_avx2(r1, factor, offset, d[1]);
     260   141861000 :     convolve_store_32_avx2(d[0], d[1], dst8);
     261   141854000 : }
     262             : 
     263             : SIMD_INLINE void jnt_2d_comp_avg_round_store_half_pel_32_avx2(
     264             :     const __m256i res[2], const __m256i factor, const __m256i offset,
     265             :     const ConvBufType *const dst, uint8_t *const dst8) {
     266             :     __m256i d[2];
     267             : 
     268     7802800 :     d[0] = loadu_u16_8x2_avx2(dst, 16);
     269    20456100 :     d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
     270    20457200 :     d[0] =
     271    20455800 :         jnt_2d_comp_avg_round_pack_half_pel_avx2(res[0], factor, offset, d[0]);
     272    20457200 :     d[1] =
     273    20457200 :         jnt_2d_comp_avg_round_pack_half_pel_avx2(res[1], factor, offset, d[1]);
     274    20457200 :     convolve_store_32_avx2(d[0], d[1], dst8);
     275    20456700 : }
     276             : 
     277           0 : static INLINE __m128i jnt_2d_round_4_sse2(const __m128i src,
     278             :     const __m128i offset) {
     279           0 :     const __m128i dst = _mm_add_epi32(src, offset);
     280           0 :     const __m128i d = _mm_srai_epi32(dst, COMPOUND_ROUND1_BITS);
     281           0 :     return _mm_packs_epi32(d, d);
     282             : }
     283             : 
     284           0 : static INLINE __m128i jnt_2d_round_half_pel_sse2(const __m128i src,
     285             :     const __m128i offset) {
     286           0 :     const __m128i dst = _mm_add_epi16(src, offset);
     287           0 :     return _mm_srai_epi16(dst, 1);
     288             : }
     289             : 
     290           0 : static INLINE __m128i jnt_2d_round_4x2_sse2(const __m128i src[2],
     291             :     const __m128i offset) {
     292           0 :     const __m128i dst0 = _mm_add_epi32(src[0], offset);
     293           0 :     const __m128i dst1 = _mm_add_epi32(src[1], offset);
     294           0 :     const __m128i d0 = _mm_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
     295           0 :     const __m128i d1 = _mm_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
     296           0 :     return _mm_packs_epi32(d0, d1);
     297             : }
     298             : 
     299     6497180 : static INLINE __m256i jnt_2d_round_4x2_avx2(const __m256i src,
     300             :     const __m256i offset) {
     301     6497180 :     const __m256i dst = _mm256_add_epi32(src, offset);
     302     6497180 :     const __m256i d = _mm256_srai_epi32(dst, COMPOUND_ROUND1_BITS);
     303     6497180 :     return _mm256_packs_epi32(d, d);
     304             : }
     305             : 
     306  2420930000 : static INLINE __m256i jnt_2d_round_16_avx2(const __m256i src[2],
     307             :     const __m256i offset) {
     308  2420930000 :     const __m256i dst0 = _mm256_add_epi32(src[0], offset);
     309  4841870000 :     const __m256i dst1 = _mm256_add_epi32(src[1], offset);
     310  2420930000 :     const __m256i d0 = _mm256_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
     311  2420930000 :     const __m256i d1 = _mm256_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
     312  2420930000 :     return _mm256_packs_epi32(d0, d1);
     313             : }
     314             : 
     315   373219000 : static INLINE __m256i jnt_2d_round_half_pel_avx2(const __m256i src,
     316             :     const __m256i offset) {
     317   373219000 :     const __m256i dst0 = _mm256_add_epi16(src, offset);
     318   373219000 :     return _mm256_srai_epi16(dst0, 1);
     319             : }
     320             : 
     321           0 : static INLINE void jnt_2d_avg_round_store_2x2_sse2(
     322             :     const __m128i res, const __m128i offset, const ConvBufType *const dst,
     323             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
     324           0 :     const __m128i r = jnt_2d_round_4_sse2(res, offset);
     325             :     __m128i d;
     326             : 
     327           0 :     d = load_u16_2x2_sse4_1(dst, dst_stride);
     328           0 :     d = jnt_avg_4x2_sse2(r, d);
     329           0 :     pack_store_2x2_sse2(d, dst8, dst8_stride);
     330           0 : }
     331             : 
     332           0 : static INLINE void jnt_2d_avg_round_store_half_pel_2x2_sse2(
     333             :     const __m128i res, const __m128i offset, const ConvBufType *const dst,
     334             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
     335           0 :     const __m128i r = jnt_2d_round_half_pel_sse2(res, offset);
     336             :     __m128i d;
     337             : 
     338           0 :     d = load_u16_2x2_sse4_1(dst, dst_stride);
     339           0 :     d = jnt_avg_4x2_sse2(r, d);
     340           0 :     pack_store_2x2_sse2(d, dst8, dst8_stride);
     341           0 : }
     342             : 
     343           0 : static INLINE void jnt_2d_avg_round_store_4x2_sse2(
     344             :     const __m128i res[2], const __m128i offset, const ConvBufType *const dst,
     345             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
     346           0 :     const __m128i r = jnt_2d_round_4x2_sse2(res, offset);
     347             :     __m128i d;
     348             : 
     349           0 :     d = load_u16_4x2_sse2(dst, dst_stride);
     350           0 :     d = jnt_avg_4x2_sse2(r, d);
     351           0 :     pack_store_4x2_sse2(d, dst8, dst8_stride);
     352           0 : }
     353             : 
     354           0 : static INLINE void jnt_2d_avg_round_store_half_pel_4x2_sse2(
     355             :     const __m128i res, const __m128i offset, const ConvBufType *const dst,
     356             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
     357           0 :     const __m128i r = jnt_2d_round_half_pel_sse2(res, offset);
     358             :     __m128i d;
     359             : 
     360           0 :     d = load_u16_4x2_sse2(dst, dst_stride);
     361           0 :     d = jnt_avg_4x2_sse2(r, d);
     362           0 :     pack_store_4x2_sse2(d, dst8, dst8_stride);
     363           0 : }
     364             : 
     365     2199240 : static INLINE void jnt_2d_avg_round_store_4x2_avx2(
     366             :     const __m256i res, const __m256i offset, const ConvBufType *const dst,
     367             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
     368     2199240 :     const __m256i r = jnt_2d_round_4x2_avx2(res, offset);
     369             :     __m128i d_128[2];
     370             :     __m256i d;
     371             : 
     372     2199250 :     d_128[0] = _mm_loadl_epi64((__m128i *)(dst));
     373     2199250 :     d_128[1] = _mm_loadl_epi64((__m128i *)(dst + dst_stride));
     374     2199250 :     d = _mm256_setr_m128i(d_128[0], d_128[1]);
     375     2199250 :     d = jnt_avg_16_avx2(r, d);
     376     2199260 :     d = _mm256_packus_epi16(d, d);
     377     2199260 :     const __m128i d0 = _mm256_castsi256_si128(d);
     378     2199260 :     const __m128i d1 = _mm256_extracti128_si256(d, 1);
     379     2199260 :     _mm_storel_epi64((__m128i *)dst8, d0);
     380     2199260 :     _mm_storel_epi64((__m128i *)(dst8 + dst8_stride), d1);
     381     2199260 : }
     382             : 
     383    75330400 : static INLINE void jnt_2d_avg_round_store_8x2_avx2(
     384             :     const __m256i res[2], const __m256i offset, const ConvBufType *const dst,
     385             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
     386    75330400 :     const __m256i r = jnt_2d_round_16_avx2(res, offset);
     387             :     __m256i d;
     388             : 
     389    75355400 :     d = loadu_u16_8x2_avx2(dst, dst_stride);
     390    75357100 :     d = jnt_avg_16_avx2(r, d);
     391    75330800 :     pack_store_8x2_avx2(d, dst8, dst8_stride);
     392    75330600 : }
     393             : 
     394     8006060 : static INLINE void jnt_2d_avg_round_store_half_pel_8x2_avx2(
     395             :     const __m256i res, const __m256i offset, const ConvBufType *const dst,
     396             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
     397     8006060 :     const __m256i r = jnt_2d_round_half_pel_avx2(res, offset);
     398             :     __m256i d;
     399             : 
     400     8006040 :     d = loadu_u16_8x2_avx2(dst, dst_stride);
     401     8005850 :     d = jnt_avg_16_avx2(r, d);
     402     8005830 :     pack_store_8x2_avx2(d, dst8, dst8_stride);
     403     8005940 : }
     404             : 
     405    72250300 : static INLINE void jnt_2d_avg_round_store_16x2_avx2(
     406             :     const __m256i res[4], const __m256i offset, const ConvBufType *const dst,
     407             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
     408             :     __m256i r[2], d[2];
     409             : 
     410    72250300 :     r[0] = jnt_2d_round_16_avx2(res + 0, offset);
     411    72302900 :     r[1] = jnt_2d_round_16_avx2(res + 2, offset);
     412    72276600 :     d[0] = _mm256_loadu_si256((__m256i *)dst);
     413    72276600 :     d[1] = _mm256_loadu_si256((__m256i *)(dst + dst_stride));
     414    72276600 :     d[0] = jnt_avg_16_avx2(r[0], d[0]);
     415    72272600 :     d[1] = jnt_avg_16_avx2(r[1], d[1]);
     416    72260800 :     xy_y_pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
     417    72312200 : }
     418             : 
     419     8432720 : static INLINE void jnt_2d_avg_round_store_half_pel_16x2_avx2(
     420             :     const __m256i res[2], const __m256i offset, const ConvBufType *const dst,
     421             :     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
     422             :     __m256i r[2], d[2];
     423             : 
     424     8432720 :     r[0] = jnt_2d_round_half_pel_avx2(res[0], offset);
     425     8432770 :     r[1] = jnt_2d_round_half_pel_avx2(res[1], offset);
     426     8432610 :     d[0] = _mm256_loadu_si256((__m256i *)dst);
     427     8432610 :     d[1] = _mm256_loadu_si256((__m256i *)(dst + dst_stride));
     428     8432610 :     d[0] = jnt_avg_16_avx2(r[0], d[0]);
     429     8432620 :     d[1] = jnt_avg_16_avx2(r[1], d[1]);
     430     8432440 :     xy_y_pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
     431     8432760 : }
     432             : 
     433             : SIMD_INLINE void jnt_2d_avg_round_store_32_avx2(const __m256i r0[2],
     434             :     const __m256i r1[2],
     435             :     const __m256i offset,
     436             :     const ConvBufType *const dst,
     437             :     uint8_t *const dst8) {
     438             :     __m256i r[2], d[2];
     439             : 
     440    45772000 :     r[0] = jnt_2d_round_16_avx2(r0, offset);
     441   176810000 :     r[1] = jnt_2d_round_16_avx2(r1, offset);
     442   176835000 :     d[0] = loadu_u16_8x2_avx2(dst, 16);
     443   176856000 :     d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
     444   176848000 :     d[0] = jnt_avg_16_avx2(r[0], d[0]);
     445   176837000 :     d[1] = jnt_avg_16_avx2(r[1], d[1]);
     446   176823000 :     convolve_store_32_avx2(d[0], d[1], dst8);
     447   176818000 : }
     448             : 
     449    20445700 : static INLINE void jnt_2d_avg_round_store_half_pel_32_avx2(
     450             :     const __m256i res[2], const __m256i offset, const ConvBufType *const dst,
     451             :     uint8_t *const dst8) {
     452             :     __m256i r[2], d[2];
     453             : 
     454    20445700 :     r[0] = jnt_2d_round_half_pel_avx2(res[0], offset);
     455    20445700 :     r[1] = jnt_2d_round_half_pel_avx2(res[1], offset);
     456    20443600 :     d[0] = loadu_u16_8x2_avx2(dst, 16);
     457    20445100 :     d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
     458    20445100 :     d[0] = jnt_avg_16_avx2(r[0], d[0]);
     459    20444700 :     d[1] = jnt_avg_16_avx2(r[1], d[1]);
     460    20444100 :     convolve_store_32_avx2(d[0], d[1], dst8);
     461    20444200 : }
     462             : 
     463           0 : static INLINE void jnt_2d_no_avg_round_store_2x2_sse2(
     464             :     const __m128i res, const __m128i offset, ConvBufType *const dst,
     465             :     const int32_t dst_stride) {
     466           0 :     const __m128i d = jnt_2d_round_4_sse2(res, offset);
     467             :     store_u16_2x2_sse2(d, dst, dst_stride);
     468           0 : }
     469             : 
     470           0 : static INLINE void jnt_2d_no_avg_round_store_half_pel_2x2_sse2(
     471             :     const __m128i res, const __m128i offset, ConvBufType *const dst,
     472             :     const int32_t dst_stride) {
     473           0 :     const __m128i d = jnt_2d_round_half_pel_sse2(res, offset);
     474             :     store_u16_2x2_sse2(d, dst, dst_stride);
     475           0 : }
     476             : 
     477           0 : static INLINE void jnt_2d_no_avg_round_store_4x2_sse2(
     478             :     const __m128i res[2], const __m128i offset, ConvBufType *const dst,
     479             :     const int32_t dst_stride) {
     480           0 :     const __m128i d = jnt_2d_round_4x2_sse2(res, offset);
     481             :     store_u16_4x2_sse2(d, dst, dst_stride);
     482           0 : }
     483             : 
     484     4298480 : static INLINE void jnt_2d_no_avg_round_store_4x2_avx2(
     485             :     const __m256i res, const __m256i offset, ConvBufType *const dst,
     486             :     const int32_t dst_stride) {
     487     4298480 :     const __m256i d = jnt_2d_round_4x2_avx2(res, offset);
     488     4298450 :     const __m128i d0 = _mm256_castsi256_si128(d);
     489     4298450 :     const __m128i d1 = _mm256_extracti128_si256(d, 1);
     490     4298450 :     _mm_storel_epi64((__m128i *)dst, d0);
     491     4298450 :     _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
     492     4298450 : }
     493             : 
     494           0 : static INLINE void jnt_2d_no_avg_round_store_half_pel_4x2_sse2(
     495             :     const __m128i res, const __m128i offset, ConvBufType *const dst,
     496             :     const int32_t dst_stride) {
     497           0 :     const __m128i d = jnt_2d_round_half_pel_sse2(res, offset);
     498             :     store_u16_4x2_sse2(d, dst, dst_stride);
     499           0 : }
     500             : 
     501   278780000 : static INLINE void jnt_2d_no_avg_round_store_8x2_avx2(
     502             :     const __m256i res[2], const __m256i offset, ConvBufType *const dst,
     503             :     const int32_t dst_stride) {
     504   278780000 :     const __m256i d = jnt_2d_round_16_avx2(res, offset);
     505   279259000 :     storeu_u16_8x2_avx2(d, dst, dst_stride);
     506   279488000 : }
     507             : 
     /*
      * Half-pel variant of the 8x2 no-average store: a single input vector `res`
      * is rounded with `offset` by jnt_2d_round_half_pel_avx2() and written to
      * `dst` / `dst_stride` by storeu_u16_8x2_avx2().
      */
     508    40594300 : static INLINE void jnt_2d_no_avg_round_store_half_pel_8x2_avx2(
     509             :     const __m256i res, const __m256i offset, ConvBufType *const dst,
     510             :     const int32_t dst_stride) {
     511    40594300 :     const __m256i d = jnt_2d_round_half_pel_avx2(res, offset);
     512    40592500 :     storeu_u16_8x2_avx2(d, dst, dst_stride);
     513    40597600 : }
     514             : 
     /*
      * 16x2 no-average store. `res` holds four vectors: res[0..1] are rounded
      * with `offset` into the vector stored at `dst`, res[2..3] into the vector
      * stored at `dst + dst_stride`. Both stores are unaligned 256-bit stores,
      * so `dst` needs no particular alignment here.
      */
     515   266936000 : static INLINE void jnt_2d_no_avg_round_store_16x2_avx2(
     516             :     const __m256i res[4], const __m256i offset, ConvBufType *const dst,
     517             :     const int32_t dst_stride) {
     518   266936000 :     const __m256i d0 = jnt_2d_round_16_avx2(res + 0, offset);
     519   267823000 :     const __m256i d1 = jnt_2d_round_16_avx2(res + 2, offset);
     520             :     _mm256_storeu_si256((__m256i *)dst, d0);
     521   268116000 :     _mm256_storeu_si256((__m256i *)(dst + dst_stride), d1);
     522   268116000 : }
     523             : 
     /*
      * Half-pel variant of the 16x2 no-average store. res[0] and res[1] are each
      * rounded with `offset` by jnt_2d_round_half_pel_avx2(); the first result
      * is stored at `dst`, the second at `dst + dst_stride`, both with
      * unaligned 256-bit stores.
      */
     524    43083800 : static INLINE void jnt_2d_no_avg_round_store_half_pel_16x2_avx2(
     525             :     const __m256i res[2], const __m256i offset, ConvBufType *const dst,
     526             :     const int32_t dst_stride) {
     527    43083800 :     const __m256i d0 = jnt_2d_round_half_pel_avx2(res[0], offset);
     528    43081900 :     const __m256i d1 = jnt_2d_round_half_pel_avx2(res[1], offset);
     529             :     _mm256_storeu_si256((__m256i *)dst, d0);
     530    43080500 :     _mm256_storeu_si256((__m256i *)(dst + dst_stride), d1);
     531    43080500 : }
     532             : 
     /*
      * 32-wide no-average store. `r0` and `r1` (two vectors each) are rounded
      * with `offset` by jnt_2d_round_16_avx2() and the two results are passed to
      * jnt_no_avg_store_16x2_avx2() with a fixed stride argument of 16.
      * NOTE(review): the stride of 16 presumably places the second 16-element
      * half directly after the first, forming one 32-element row at `dst` --
      * confirm against jnt_no_avg_store_16x2_avx2().
      */
     533   581160000 : static INLINE void jnt_2d_no_avg_round_store_32_avx2(const __m256i r0[2],
     534             :     const __m256i r1[2],
     535             :     const __m256i offset,
     536             :     ConvBufType *const dst) {
     537   581160000 :     const __m256i d0 = jnt_2d_round_16_avx2(r0, offset);
     538   585857000 :     const __m256i d1 = jnt_2d_round_16_avx2(r1, offset);
     539   585480000 :     jnt_no_avg_store_16x2_avx2(d0, d1, dst, 16);
     540   585649000 : }
     541             : 
     /*
      * Half-pel variant of the 32-wide no-average store. res[0] and res[1] are
      * each rounded with `offset` by jnt_2d_round_half_pel_avx2() and passed to
      * jnt_no_avg_store_16x2_avx2() with a fixed stride argument of 16.
      * NOTE(review): as in the full-pel variant, the stride of 16 presumably
      * makes the two halves one contiguous 32-element row -- confirm.
      */
     542    91258500 : static INLINE void jnt_2d_no_avg_round_store_half_pel_32_avx2(
     543             :     const __m256i res[2], const __m256i offset, ConvBufType *const dst) {
     544    91258500 :     const __m256i d0 = jnt_2d_round_half_pel_avx2(res[0], offset);
     545    91261800 :     const __m256i d1 = jnt_2d_round_half_pel_avx2(res[1], offset);
     546    91227100 :     jnt_no_avg_store_16x2_avx2(d0, d1, dst, 16);
     547    91245500 : }
     548             : 
     /*
      * Horizontal pass of the 2D convolve for a 2-tap filter. Reads `h` rows of
      * `w` pixels from `src` and writes intermediate results to `im_block`.
      *
      *   src, src_stride  - source pixels and distance between source rows;
      *                      reading starts at `src` itself (no left offset).
      *   w                - block width; handled values are 2, 4, 8, 16, 32, 64
      *                      and 128 (the last branch of each chain is guarded
      *                      only by an assert).
      *   h                - number of rows. Widths up to 16 consume two rows per
      *                      iteration and loop until the counter hits zero, so
      *                      `h` must be a positive even number there; wider
      *                      blocks consume one row per iteration.
      *   filter_params_x,
      *   subpel_x_q4      - forwarded unchanged to the coefficient preparation
      *                      helpers.
      *   im_block         - output; the write pointer advances by `w` int16_t
      *                      per row, i.e. rows are packed with stride `w`.
      */
     549    92521900 : static void jnt_convolve_2d_hor_2tap_avx2(
     550             :     const uint8_t *src, const int32_t src_stride, const int32_t w,
     551             :     const int32_t h, const InterpFilterParams *filter_params_x,
     552             :     const int32_t subpel_x_q4, int16_t *const im_block) {
     553    92521900 :     const uint8_t *src_ptr = src;
     554    92521900 :     int32_t y = h;
     555    92521900 :     int16_t *im = im_block;
     556             :     __m128i coeffs_128[4];
     557             :     __m256i coeffs_256[4];
     558             : 
                           /* Narrow blocks (w <= 8) use the 128-bit coefficient set. */
     559    92521900 :     if (w <= 8) {
     560    40526200 :         prepare_half_coeffs_2tap_ssse3(
     561             :             filter_params_x, subpel_x_q4, coeffs_128);
     562             : 
     563    40532600 :         if (w == 2) {
     564             :             do {
     565             :                 const __m128i r =
     566           0 :                     x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
     567           0 :                 xy_x_round_store_2x2_sse2(r, im);
     568           0 :                 src_ptr += 2 * src_stride;
     569           0 :                 im += 2 * 2;
     570           0 :                 y -= 2;
     571           0 :             } while (y);
     572             :         }
     573    40535200 :         else if (w == 4) {
     574             :             do {
     575             :                 const __m128i r =
     576           0 :                     x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
     577           0 :                 xy_x_round_store_4x2_sse2(r, im);
     578           0 :                 src_ptr += 2 * src_stride;
     579           0 :                 im += 2 * 4;
     580           0 :                 y -= 2;
     581           0 :             } while (y);
     582             :         }
     583             :         else {
     584    40539300 :             assert(w == 8);
     585             : 
     586             :             do {
     587             :                 __m128i r[2];
     588             : 
     589   331476000 :                 x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, r);
     590   330888000 :                 xy_x_round_store_8x2_sse2(r, im);
     591   330635000 :                 src_ptr += 2 * src_stride;
     592   330635000 :                 im += 2 * 8;
     593   330635000 :                 y -= 2;
     594   330635000 :             } while (y);
     595             :         }
     596             :     }
     597             :     else {
                               /* Wide blocks (w >= 16) use the 256-bit coefficient set. */
     598    51995700 :         prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
     599             : 
     600    52341500 :         if (w == 16) {
     601             :             do {
     602             :                 __m256i r[2];
     603             : 
     604   325434000 :                 x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
     605   323488000 :                 xy_x_round_store_32_avx2(r, im);
     606   325072000 :                 src_ptr += 2 * src_stride;
     607   325072000 :                 im += 2 * 16;
     608   325072000 :                 y -= 2;
     609   325072000 :             } while (y);
     610             :         }
                               /* From w == 32 on, each row is processed on its own in chunks
                                * of 32 pixels. */
     611    21555300 :         else if (w == 32) {
     612             :             do {
     613   388049000 :                 xy_x_2tap_32_avx2(src_ptr, coeffs_256, im);
     614   387730000 :                 src_ptr += src_stride;
     615   387730000 :                 im += 32;
     616   387730000 :             } while (--y);
     617             :         }
     618     4744960 :         else if (w == 64) {
     619             :             do {
     620   154700000 :                 xy_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, im + 0 * 32);
     621   154650000 :                 xy_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, im + 1 * 32);
     622   154645000 :                 src_ptr += src_stride;
     623   154645000 :                 im += 64;
     624   154645000 :             } while (--y);
     625             :         }
     626             :         else {
     627           0 :             assert(w == 128);
     628             : 
     629             :             do {
     630           0 :                 xy_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, im + 0 * 32);
     631           0 :                 xy_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, im + 1 * 32);
     632           0 :                 xy_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, im + 2 * 32);
     633           0 :                 xy_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, im + 3 * 32);
     634           0 :                 src_ptr += src_stride;
     635           0 :                 im += 128;
     636           0 :             } while (--y);
     637             :         }
     638             :     }
     639    91304300 : }
     640             : 
     /*
      * Horizontal pass of the 2D convolve for a 4-tap filter. Only widths 2 and
      * 4 are handled (the second branch is guarded by an assert).
      *
      *   src, src_stride  - source pixels and distance between source rows;
      *                      reading starts one pixel to the left of `src`.
      *   w                - block width, 2 or 4.
      *   h                - number of rows; two rows are consumed per iteration
      *                      and the loop runs until the counter hits zero, so
      *                      `h` must be a positive even number.
      *   filter_params_x,
      *   subpel_x_q4      - forwarded unchanged to
      *                      prepare_half_coeffs_4tap_ssse3().
      *   im_block         - output; the write pointer advances by `w` int16_t
      *                      per row, i.e. rows are packed with stride `w`.
      */
     641     1842500 : static void jnt_convolve_2d_hor_4tap_avx2(
     642             :     const uint8_t *src, const int32_t src_stride, const int32_t w,
     643             :     const int32_t h, const InterpFilterParams *filter_params_x,
     644             :     const int32_t subpel_x_q4, int16_t *const im_block) {
     645     1842500 :     const uint8_t *src_ptr = src - 1;
     646     1842500 :     int32_t y = h;
     647     1842500 :     int16_t *im = im_block;
     648             :     __m128i coeffs_128[4];
     649             : 
     650     1842500 :     prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
     651             : 
     652     1842530 :     if (w == 2) {
     653             :         do {
     654             :             const __m128i r =
     655           0 :                 x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
     656           0 :             xy_x_round_store_2x2_sse2(r, im);
     657           0 :             src_ptr += 2 * src_stride;
     658           0 :             im += 2 * 2;
     659           0 :             y -= 2;
     660           0 :         } while (y);
     661             :     }
     662             :     else {
     663     1842530 :         assert(w == 4);
     664             : 
     665             :         do {
     666             :             const __m128i r =
     667    12752200 :                 x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
     668    12751600 :             xy_x_round_store_4x2_sse2(r, im);
     669    12750700 :             src_ptr += 2 * src_stride;
     670    12750700 :             im += 2 * 4;
     671    12750700 :             y -= 2;
     672    12750700 :         } while (y);
     673             :     }
     674     1841070 : }
     675             : 
     /*
      * Horizontal pass of the 2D convolve for a 6-tap filter. Handles widths 8,
      * 16, 32, 64 and 128 (the last branch is guarded only by an assert).
      *
      *   src, src_stride  - source pixels and distance between source rows;
      *                      reading starts two pixels to the left of `src`.
      *   w                - block width, one of the values listed above.
      *   h                - number of rows. Widths 8 and 16 consume two rows per
      *                      iteration and loop until the counter hits zero, so
      *                      `h` must be a positive even number there; wider
      *                      blocks consume one row per iteration.
      *   filter_params_x,
      *   subpel_x_q4      - forwarded unchanged to
      *                      prepare_half_coeffs_6tap_avx2().
      *   im_block         - output; the write pointer advances by `w` int16_t
      *                      per row, i.e. rows are packed with stride `w`.
      */
     676    38411000 : static void jnt_convolve_2d_hor_6tap_avx2(
     677             :     const uint8_t *src, const int32_t src_stride, const int32_t w,
     678             :     const int32_t h, const InterpFilterParams *filter_params_x,
     679             :     const int32_t subpel_x_q4, int16_t *const im_block) {
     680    38411000 :     const uint8_t *src_ptr = src - 2;
     681    38411000 :     int32_t y = h;
     682    38411000 :     int16_t *im = im_block;
     683             :     __m256i coeffs_256[4], filt_256[4];
     684             : 
                           /* Only three of the four filt_256 slots are filled for 6 taps;
                            * filt_256[3] stays uninitialized. The loads are aligned loads, so
                            * the filt*_global_avx2 tables must be 32-byte aligned. */
     685    38411000 :     filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
     686    38411000 :     filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
     687    38411000 :     filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
     688             : 
     689    38411000 :     prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
     690             : 
     691    38440200 :     if (w == 8) {
     692             :         do {
     693   182864000 :             const __m256i res = x_convolve_6tap_8x2_avx2(
     694             :                 src_ptr, src_stride, coeffs_256, filt_256);
     695   182558000 :             xy_x_round_store_8x2_avx2(res, im);
     696   182732000 :             src_ptr += 2 * src_stride;
     697   182732000 :             im += 2 * 8;
     698   182732000 :             y -= 2;
     699   182732000 :         } while (y);
     700             :     }
     701    20671400 :     else if (w == 16) {
     702             :         do {
     703             :             __m256i r[2];
     704             : 
     705   150065000 :             x_convolve_6tap_16x2_avx2(
     706             :                 src_ptr, src_stride, coeffs_256, filt_256, r);
     707   149552000 :             xy_x_round_store_32_avx2(r, im);
     708   149999000 :             src_ptr += 2 * src_stride;
     709   149999000 :             im += 2 * 16;
     710   149999000 :             y -= 2;
     711   149999000 :         } while (y);
     712             :     }
                           /* From w == 32 on, each row is processed on its own in chunks of 32
                            * pixels. NOTE(review): the literal 16 passed to xy_x_6tap_32_avx2()
                            * is presumably the offset of the second 16-pixel half -- confirm
                            * against the helper. */
     713     8763400 :     else if (w == 32) {
     714             :         do {
     715   172998000 :             xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     716   172938000 :             src_ptr += src_stride;
     717   172938000 :             im += 32;
     718   172938000 :         } while (--y);
     719             :     }
     720     1915660 :     else if (w == 64) {
     721             :         do {
     722    64319500 :             xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     723    64328000 :             xy_x_6tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
     724    64314700 :             src_ptr += src_stride;
     725    64314700 :             im += 64;
     726    64314700 :         } while (--y);
     727             :     }
     728             :     else {
     729           0 :         assert(w == 128);
     730             : 
     731             :         do {
     732           0 :             xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     733           0 :             xy_x_6tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
     734           0 :             xy_x_6tap_32_avx2(src_ptr + 64, 16, coeffs_256, filt_256, im + 64);
     735           0 :             xy_x_6tap_32_avx2(src_ptr + 96, 16, coeffs_256, filt_256, im + 96);
     736           0 :             src_ptr += src_stride;
     737           0 :             im += 128;
     738           0 :         } while (--y);
     739             :     }
     740    38175800 : }
     741             : 
     /*
      * Horizontal pass of the 2D convolve for an 8-tap filter. Handles widths 8,
      * 16, 32, 64 and 128 (the last branch is guarded only by an assert).
      *
      *   src, src_stride  - source pixels and distance between source rows;
      *                      reading starts three pixels to the left of `src`.
      *   w                - block width, one of the values listed above.
      *   h                - number of rows. Widths 8 and 16 consume two rows per
      *                      iteration and loop until the counter hits zero, so
      *                      `h` must be a positive even number there; wider
      *                      blocks consume one row per iteration.
      *   filter_params_x,
      *   subpel_x_q4      - forwarded unchanged to
      *                      prepare_half_coeffs_8tap_avx2().
      *   im_block         - output; the write pointer advances by `w` int16_t
      *                      per row, i.e. rows are packed with stride `w`.
      */
     742    16457900 : static void jnt_convolve_2d_hor_8tap_avx2(
     743             :     const uint8_t *src, const int32_t src_stride, const int32_t w,
     744             :     const int32_t h, const InterpFilterParams *filter_params_x,
     745             :     const int32_t subpel_x_q4, int16_t *const im_block) {
     746    16457900 :     const uint8_t *src_ptr = src - 3;
     747    16457900 :     int32_t y = h;
     748    16457900 :     int16_t *im = im_block;
     749             :     __m256i coeffs_256[4], filt_256[4];
     750             : 
                           /* All four filt_256 slots are filled for 8 taps. The loads are
                            * aligned loads, so the filt*_global_avx2 tables must be 32-byte
                            * aligned. */
     751    16457900 :     filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
     752    16457900 :     filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
     753    16457900 :     filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
     754    16457900 :     filt_256[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
     755             : 
     756    16457900 :     prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
     757             : 
     758    16464100 :     if (w == 8) {
     759             :         do {
     760    84064200 :             const __m256i res = x_convolve_8tap_8x2_avx2(
     761             :                 src_ptr, src_stride, coeffs_256, filt_256);
     762    84007400 :             xy_x_round_store_8x2_avx2(res, im);
     763    84039500 :             src_ptr += 2 * src_stride;
     764    84039500 :             im += 2 * 8;
     765    84039500 :             y -= 2;
     766    84039500 :         } while (y);
     767             :     }
     768     9201710 :     else if (w == 16) {
     769             :         do {
     770             :             __m256i r[2];
     771             : 
     772             :             x_convolve_8tap_16x2_avx2(
     773             :                 src_ptr, src_stride, coeffs_256, filt_256, r);
     774    71086600 :             xy_x_round_store_32_avx2(r, im);
     775    71155700 :             src_ptr += 2 * src_stride;
     776    71155700 :             im += 2 * 16;
     777    71155700 :             y -= 2;
     778    71155700 :         } while (y);
     779             :     }
                           /* From w == 32 on, each row is processed on its own in chunks of 32
                            * pixels. NOTE(review): the literal 16 passed to xy_x_8tap_32_avx2()
                            * is presumably the offset of the second 16-pixel half -- confirm
                            * against the helper. */
     780     4162370 :     else if (w == 32) {
     781             :         do {
     782    89758000 :             xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     783    89743900 :             src_ptr += src_stride;
     784    89743900 :             im += 32;
     785    89743900 :         } while (--y);
     786             :     }
     787      972550 :     else if (w == 64) {
     788             :         do {
     789    35313800 :             xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     790    35311700 :             xy_x_8tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
     791    35311700 :             src_ptr += src_stride;
     792    35311700 :             im += 64;
     793    35311700 :         } while (--y);
     794             :     }
     795             :     else {
     796           0 :         assert(w == 128);
     797             : 
     798             :         do {
     799           0 :             xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     800           0 :             xy_x_8tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
     801           0 :             xy_x_8tap_32_avx2(src_ptr + 64, 16, coeffs_256, filt_256, im + 64);
     802           0 :             xy_x_8tap_32_avx2(src_ptr + 96, 16, coeffs_256, filt_256, im + 96);
     803           0 :             src_ptr += src_stride;
     804           0 :             im += 128;
     805           0 :         } while (--y);
     806             :     }
     807    16424800 : }
     808             : 
     809    74381700 : static void jnt_convolve_2d_ver_2tap_avx2(
     810             :     const int16_t *const im_block, const int32_t w, const int32_t h,
     811             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
     812             :     const ConvolveParams *const conv_params, uint8_t *dst8,
     813             :     const int32_t dst8_stride) {
     814    74381700 :     const int32_t dst_stride = conv_params->dst_stride;
     815    74381700 :     const int32_t bd = 8;
     816    74381700 :     const int32_t round_0 = 3;
     817    74381700 :     const int16_t *im = im_block;
     818    74381700 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
     819    74381700 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;      // 19
     820    74381700 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;  // 4
     821    74381700 :     const int32_t round_offset = 1 << (offset_bits - round_1);
     822    74381700 :     const int32_t factor =
     823    74381700 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
     824    74381700 :     const int32_t offset_comp_avg =
     825    74381700 :         (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
     826    74381700 :         (round_offset << DIST_PRECISION_BITS) -
     827    74381700 :         (round_offset << (DIST_PRECISION_BITS - 1)) +
     828    74381700 :         (1 << (round_bits + DIST_PRECISION_BITS - 1));
     829    74381700 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
     830    74381700 :     const __m128i factor_128 = _mm_set1_epi32(factor);
     831    74381700 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
     832    74381700 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
     833    74381700 :     const int32_t offset_avg = (1 << (round_1 - 1)) +
     834    74381700 :         (1 << (round_bits + round_1)) -
     835    74381700 :         (1 << offset_bits) - (1 << (offset_bits - 1));
     836    74381700 :     const int32_t offset_no_avg =
     837    74381700 :         (1 << (round_1 - 1)) + (1 << offset_bits) + (1 << (offset_bits - 1));
     838    74381700 :     const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
     839    74381700 :     const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
     840    74381700 :     const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
     841    74381700 :     const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
     842    74381700 :     ConvBufType *dst = conv_params->dst;
     843    74381700 :     int32_t y = h;
     844             :     __m128i coeffs_128[4];
     845             :     __m256i coeffs_256[4];
     846             : 
     847    74381700 :     if (w <= 4) {
     848           0 :         prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
     849             : 
     850           0 :         if (w == 2) {
     851             :             __m128i s_32[2];
     852             : 
     853           0 :             s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
     854             : 
     855           0 :             if (conv_params->do_average) {
     856           0 :                 if (conv_params->use_jnt_comp_avg) {
     857             :                     do {
     858             :                         const __m128i res =
     859           0 :                             xy_y_convolve_2tap_2x2_sse2(im, s_32, coeffs_128);
     860           0 :                         jnt_2d_comp_avg_round_store_2x2_sse2(
     861             :                             res,
     862             :                             factor_128,
     863             :                             offset_comp_avg_128,
     864             :                             dst,
     865             :                             dst_stride,
     866             :                             dst8,
     867             :                             dst8_stride);
     868           0 :                         im += 2 * 2;
     869           0 :                         dst += 2 * dst_stride;
     870           0 :                         dst8 += 2 * dst8_stride;
     871           0 :                         y -= 2;
     872           0 :                     } while (y);
     873             :                 }
     874             :                 else {
     875             :                     do {
     876             :                         const __m128i res =
     877           0 :                             xy_y_convolve_2tap_2x2_sse2(im, s_32, coeffs_128);
     878           0 :                         jnt_2d_avg_round_store_2x2_sse2(res,
     879             :                             offset_avg_128,
     880             :                             dst,
     881             :                             dst_stride,
     882             :                             dst8,
     883             :                             dst8_stride);
     884           0 :                         im += 2 * 2;
     885           0 :                         dst += 2 * dst_stride;
     886           0 :                         dst8 += 2 * dst8_stride;
     887           0 :                         y -= 2;
     888           0 :                     } while (y);
     889             :                 }
     890             :             }
     891             :             else {
     892             :                 do {
     893             :                     const __m128i res =
     894           0 :                         xy_y_convolve_2tap_2x2_sse2(im, s_32, coeffs_128);
     895           0 :                     jnt_2d_no_avg_round_store_2x2_sse2(
     896             :                         res, offset_no_avg_128, dst, dst_stride);
     897           0 :                     im += 2 * 2;
     898           0 :                     dst += 2 * dst_stride;
     899           0 :                     y -= 2;
     900           0 :                 } while (y);
     901             :             }
     902             :         }
     903             :         else {
     904             :             __m128i s_64[2], r[2];
     905             : 
     906           0 :             assert(w == 4);
     907             : 
     908           0 :             s_64[0] = _mm_loadl_epi64((__m128i *)im);
     909             : 
     910           0 :             if (conv_params->do_average) {
     911           0 :                 if (conv_params->use_jnt_comp_avg) {
     912             :                     do {
     913           0 :                         xy_y_convolve_2tap_4x2_sse2(im, s_64, coeffs_128, r);
     914           0 :                         jnt_2d_comp_avg_round_store_4x2_sse2(
     915             :                             r,
     916             :                             factor_128,
     917             :                             offset_comp_avg_128,
     918             :                             dst,
     919             :                             dst_stride,
     920             :                             dst8,
     921             :                             dst8_stride);
     922           0 :                         im += 2 * 4;
     923           0 :                         dst += 2 * dst_stride;
     924           0 :                         dst8 += 2 * dst8_stride;
     925           0 :                         y -= 2;
     926           0 :                     } while (y);
     927             :                 }
     928             :                 else {
     929             :                     do {
     930           0 :                         xy_y_convolve_2tap_4x2_sse2(im, s_64, coeffs_128, r);
     931           0 :                         jnt_2d_avg_round_store_4x2_sse2(r,
     932             :                             offset_avg_128,
     933             :                             dst,
     934             :                             dst_stride,
     935             :                             dst8,
     936             :                             dst8_stride);
     937           0 :                         im += 2 * 4;
     938           0 :                         dst += 2 * dst_stride;
     939           0 :                         dst8 += 2 * dst8_stride;
     940           0 :                         y -= 2;
     941           0 :                     } while (y);
     942             :                 }
     943             :             }
     944             :             else {
     945             :                 do {
     946           0 :                     xy_y_convolve_2tap_4x2_sse2(im, s_64, coeffs_128, r);
     947           0 :                     jnt_2d_no_avg_round_store_4x2_sse2(
     948             :                         r, offset_no_avg_128, dst, dst_stride);
     949           0 :                     im += 2 * 4;
     950           0 :                     dst += 2 * dst_stride;
     951           0 :                     y -= 2;
     952           0 :                 } while (y);
     953             :             }
     954             :         }
     955             :     }
     956             :     else {
     957    74381700 :         prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
     958             : 
     959    74443600 :         if (w == 8) {
     960             :             __m128i s_128[2];
     961             :             __m256i r[2];
     962             : 
     963    32584400 :             s_128[0] = _mm_load_si128((__m128i *)im);
     964             : 
     965    32584400 :             if (conv_params->do_average) {
     966     8919240 :                 if (conv_params->use_jnt_comp_avg) {
     967             :                     do {
     968    33826600 :                         xy_y_convolve_2tap_8x2_avx2(im, s_128, coeffs_256, r);
     969    33799700 :                         jnt_2d_comp_avg_round_store_8x2_avx2(
     970             :                             r,
     971             :                             factor_256,
     972             :                             offset_comp_avg_256,
     973             :                             dst,
     974             :                             dst_stride,
     975             :                             dst8,
     976             :                             dst8_stride);
     977    33820200 :                         im += 2 * 8;
     978    33820200 :                         dst += 2 * dst_stride;
     979    33820200 :                         dst8 += 2 * dst8_stride;
     980    33820200 :                         y -= 2;
     981    33820200 :                     } while (y);
     982             :                 }
     983             :                 else {
     984             :                     do {
     985    33815000 :                         xy_y_convolve_2tap_8x2_avx2(im, s_128, coeffs_256, r);
     986    33803500 :                         jnt_2d_avg_round_store_8x2_avx2(r,
     987             :                             offset_avg_256,
     988             :                             dst,
     989             :                             dst_stride,
     990             :                             dst8,
     991             :                             dst8_stride);
     992    33804600 :                         im += 2 * 8;
     993    33804600 :                         dst += 2 * dst_stride;
     994    33804600 :                         dst8 += 2 * dst8_stride;
     995    33804600 :                         y -= 2;
     996    33804600 :                     } while (y);
     997             :                 }
     998             :             }
     999             :             else {
    1000             :                 do {
    1001   168593000 :                     xy_y_convolve_2tap_8x2_avx2(im, s_128, coeffs_256, r);
    1002   168291000 :                     jnt_2d_no_avg_round_store_8x2_avx2(
    1003             :                         r, offset_no_avg_256, dst, dst_stride);
    1004   168598000 :                     im += 2 * 8;
    1005   168598000 :                     dst += 2 * dst_stride;
    1006   168598000 :                     y -= 2;
    1007   168598000 :                 } while (y);
    1008             :             }
    1009             :         }
    1010    41859200 :         else if (w == 16) {
    1011             :             __m256i s_256[2], r[4];
    1012             : 
    1013    24531400 :             s_256[0] = _mm256_load_si256((__m256i *)im);
    1014             : 
    1015    24531400 :             if (conv_params->do_average) {
    1016     6635360 :                 if (conv_params->use_jnt_comp_avg) {
    1017             :                     do {
    1018    33634400 :                         xy_y_convolve_2tap_16x2_avx2(im, s_256, coeffs_256, r);
    1019    33584700 :                         jnt_2d_comp_avg_round_store_16x2_avx2(
    1020             :                             r,
    1021             :                             factor_256,
    1022             :                             offset_comp_avg_256,
    1023             :                             dst,
    1024             :                             dst_stride,
    1025             :                             dst8,
    1026             :                             dst8_stride);
    1027    33627500 :                         im += 2 * 16;
    1028    33627500 :                         dst += 2 * dst_stride;
    1029    33627500 :                         dst8 += 2 * dst8_stride;
    1030    33627500 :                         y -= 2;
    1031    33627500 :                     } while (y);
    1032             :                 }
    1033             :                 else {
    1034             :                     do {
    1035    33635800 :                         xy_y_convolve_2tap_16x2_avx2(im, s_256, coeffs_256, r);
    1036    33594600 :                         jnt_2d_avg_round_store_16x2_avx2(r,
    1037             :                             offset_avg_256,
    1038             :                             dst,
    1039             :                             dst_stride,
    1040             :                             dst8,
    1041             :                             dst8_stride);
    1042    33629900 :                         im += 2 * 16;
    1043    33629900 :                         dst += 2 * dst_stride;
    1044    33629900 :                         dst8 += 2 * dst8_stride;
    1045    33629900 :                         y -= 2;
    1046    33629900 :                     } while (y);
    1047             :                 }
    1048             :             }
    1049             :             else {
    1050             :                 do {
    1051   168345000 :                     xy_y_convolve_2tap_16x2_avx2(im, s_256, coeffs_256, r);
    1052   167555000 :                     jnt_2d_no_avg_round_store_16x2_avx2(
    1053             :                         r, offset_no_avg_256, dst, dst_stride);
    1054   168358000 :                     im += 2 * 16;
    1055   168358000 :                     dst += 2 * dst_stride;
    1056   168358000 :                     y -= 2;
    1057   168358000 :                 } while (y);
    1058             :             }
    1059             :         }
    1060    17327800 :         else if (w == 32) {
    1061             :             __m256i s_256[2][2];
    1062             : 
    1063    13570000 :             s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
    1064    13570000 :             s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
    1065             : 
    1066    13570000 :             if (conv_params->do_average) {
    1067     4051380 :                 if (conv_params->use_jnt_comp_avg) {
    1068             :                     do {
    1069             :                         __m256i r[4];
    1070             : 
    1071    22251500 :                         xy_y_convolve_2tap_32_avx2(
    1072             :                             im + 1 * 32, s_256[0], s_256[1], coeffs_256, r);
    1073    22228300 :                         jnt_2d_comp_avg_round_store_32_avx2(r + 0,
    1074             :                             r + 2,
    1075             :                             factor_256,
    1076             :                             offset_comp_avg_256,
    1077             :                             dst,
    1078             :                             dst8);
    1079             : 
    1080    22249500 :                         xy_y_convolve_2tap_32_avx2(
    1081             :                             im + 2 * 32, s_256[1], s_256[0], coeffs_256, r);
    1082    22228900 :                         jnt_2d_comp_avg_round_store_32_avx2(r + 0,
    1083             :                             r + 2,
    1084             :                             factor_256,
    1085             :                             offset_comp_avg_256,
    1086    22228900 :                             dst + dst_stride,
    1087             :                             dst8 + dst8_stride);
    1088             : 
    1089    22248300 :                         im += 2 * 32;
    1090    22248300 :                         dst += 2 * dst_stride;
    1091    22248300 :                         dst8 += 2 * dst8_stride;
    1092    22248300 :                         y -= 2;
    1093    22248300 :                     } while (y);
    1094             :                 }
    1095             :                 else {
    1096             :                     do {
    1097             :                         __m256i r[4];
    1098             : 
    1099    22249300 :                         xy_y_convolve_2tap_32_avx2(
    1100             :                             im + 1 * 32, s_256[0], s_256[1], coeffs_256, r);
    1101    22233300 :                         jnt_2d_avg_round_store_32_avx2(
    1102             :                             r + 0, r + 2, offset_avg_256, dst, dst8);
    1103             : 
    1104    22246300 :                         xy_y_convolve_2tap_32_avx2(
    1105             :                             im + 2 * 32, s_256[1], s_256[0], coeffs_256, r);
    1106    22233100 :                         jnt_2d_avg_round_store_32_avx2(r + 0,
    1107             :                             r + 2,
    1108             :                             offset_avg_256,
    1109    22233100 :                             dst + dst_stride,
    1110             :                             dst8 + dst8_stride);
    1111             : 
    1112    22244100 :                         im += 2 * 32;
    1113    22244100 :                         dst += 2 * dst_stride;
    1114    22244100 :                         dst8 += 2 * dst8_stride;
    1115    22244100 :                         y -= 2;
    1116    22244100 :                     } while (y);
    1117             :                 }
    1118             :             }
    1119             :             else {
    1120             :                 do {
    1121             :                     __m256i r[4];
    1122             : 
    1123   105580000 :                     xy_y_convolve_2tap_32_avx2(
    1124             :                         im + 1 * 32, s_256[0], s_256[1], coeffs_256, r);
    1125   105315000 :                     jnt_2d_no_avg_round_store_32_avx2(
    1126             :                         r + 0, r + 2, offset_no_avg_256, dst);
    1127             : 
    1128   105586000 :                     xy_y_convolve_2tap_32_avx2(
    1129             :                         im + 2 * 32, s_256[1], s_256[0], coeffs_256, r);
    1130   105340000 :                     jnt_2d_no_avg_round_store_32_avx2(
    1131   105340000 :                         r + 0, r + 2, offset_no_avg_256, dst + dst_stride);
    1132             : 
    1133   105586000 :                     im += 2 * 32;
    1134   105586000 :                     dst += 2 * dst_stride;
    1135   105586000 :                     y -= 2;
    1136   105586000 :                 } while (y);
    1137             :             }
    1138             :         }
    1139     3757820 :         else if (w == 64) {
    1140             :             __m256i s_256[2][4];
    1141             : 
    1142     3835020 :             s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
    1143     3835020 :             s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
    1144     3835020 :             s_256[0][2] = _mm256_load_si256((__m256i *)(im + 2 * 16));
    1145     3835020 :             s_256[0][3] = _mm256_load_si256((__m256i *)(im + 3 * 16));
    1146             : 
    1147     3835020 :             if (conv_params->do_average) {
    1148     1288640 :                 if (conv_params->use_jnt_comp_avg) {
    1149             :                     do {
    1150             :                         __m256i r[4];
    1151             : 
    1152    10199900 :                         xy_y_convolve_2tap_32_avx2(im + 2 * 32,
    1153             :                             s_256[0] + 0,
    1154             :                             s_256[1] + 0,
    1155             :                             coeffs_256,
    1156             :                             r);
    1157    10192600 :                         jnt_2d_comp_avg_round_store_32_avx2(r + 0,
    1158             :                             r + 2,
    1159             :                             factor_256,
    1160             :                             offset_comp_avg_256,
    1161             :                             dst,
    1162             :                             dst8);
    1163             : 
    1164    10199400 :                         xy_y_convolve_2tap_32_avx2(im + 3 * 32,
    1165             :                             s_256[0] + 2,
    1166             :                             s_256[1] + 2,
    1167             :                             coeffs_256,
    1168             :                             r);
    1169    10192100 :                         jnt_2d_comp_avg_round_store_32_avx2(r + 0,
    1170             :                             r + 2,
    1171             :                             factor_256,
    1172             :                             offset_comp_avg_256,
    1173    10192100 :                             dst + 32,
    1174             :                             dst8 + 32);
    1175    10199100 :                         im += 2 * 64;
    1176             : 
    1177    10199100 :                         xy_y_convolve_2tap_32_avx2(im + 0 * 32,
    1178             :                             s_256[1] + 0,
    1179             :                             s_256[0] + 0,
    1180             :                             coeffs_256,
    1181             :                             r);
    1182    10191400 :                         jnt_2d_comp_avg_round_store_32_avx2(r + 0,
    1183             :                             r + 2,
    1184             :                             factor_256,
    1185             :                             offset_comp_avg_256,
    1186    10191400 :                             dst + dst8_stride,
    1187             :                             dst8 + dst8_stride);
    1188             : 
    1189    10199000 :                         xy_y_convolve_2tap_32_avx2(im + 1 * 32,
    1190             :                             s_256[1] + 2,
    1191             :                             s_256[0] + 2,
    1192             :                             coeffs_256,
    1193             :                             r);
    1194    10191600 :                         jnt_2d_comp_avg_round_store_32_avx2(
    1195             :                             r + 0,
    1196             :                             r + 2,
    1197             :                             factor_256,
    1198             :                             offset_comp_avg_256,
    1199    10191600 :                             dst + dst8_stride + 32,
    1200    10191600 :                             dst8 + dst8_stride + 32);
    1201             : 
    1202    10199300 :                         dst += 2 * dst_stride;
    1203    10199300 :                         dst8 += 2 * dst8_stride;
    1204    10199300 :                         y -= 2;
    1205    10199300 :                     } while (y);
    1206             :                 }
    1207             :                 else {
    1208             :                     do {
    1209             :                         __m256i r[4];
    1210             : 
    1211    10198400 :                         xy_y_convolve_2tap_32_avx2(im + 2 * 32,
    1212             :                             s_256[0] + 0,
    1213             :                             s_256[1] + 0,
    1214             :                             coeffs_256,
    1215             :                             r);
    1216    10192700 :                         jnt_2d_avg_round_store_32_avx2(
    1217             :                             r + 0, r + 2, offset_avg_256, dst, dst8);
    1218             : 
    1219    10197600 :                         xy_y_convolve_2tap_32_avx2(im + 3 * 32,
    1220             :                             s_256[0] + 2,
    1221             :                             s_256[1] + 2,
    1222             :                             coeffs_256,
    1223             :                             r);
    1224    10192800 :                         jnt_2d_avg_round_store_32_avx2(
    1225    10192800 :                             r + 0, r + 2, offset_avg_256, dst + 32, dst8 + 32);
    1226    10197600 :                         im += 2 * 64;
    1227             : 
    1228    10197600 :                         xy_y_convolve_2tap_32_avx2(im + 0 * 32,
    1229             :                             s_256[1] + 0,
    1230             :                             s_256[0] + 0,
    1231             :                             coeffs_256,
    1232             :                             r);
    1233    10193300 :                         jnt_2d_avg_round_store_32_avx2(r + 0,
    1234             :                             r + 2,
    1235             :                             offset_avg_256,
    1236    10193300 :                             dst + dst_stride,
    1237             :                             dst8 + dst8_stride);
    1238             : 
    1239    10197600 :                         xy_y_convolve_2tap_32_avx2(im + 1 * 32,
    1240             :                             s_256[1] + 2,
    1241             :                             s_256[0] + 2,
    1242             :                             coeffs_256,
    1243             :                             r);
    1244    10192900 :                         jnt_2d_avg_round_store_32_avx2(r + 0,
    1245             :                             r + 2,
    1246             :                             offset_avg_256,
    1247    10192900 :                             dst + dst_stride + 32,
    1248    10192900 :                             dst8 + dst8_stride + 32);
    1249             : 
    1250    10197100 :                         dst += 2 * dst_stride;
    1251    10197100 :                         dst8 += 2 * dst8_stride;
    1252    10197100 :                         y -= 2;
    1253    10197100 :                     } while (y);
    1254             :                 }
    1255             :             }
    1256             :             else {
    1257             :                 do {
    1258             :                     __m256i r[4];
    1259             : 
    1260    40276100 :                     xy_y_convolve_2tap_32_avx2(
    1261             :                         im + 2 * 32, s_256[0] + 0, s_256[1] + 0, coeffs_256, r);
    1262    40215200 :                     jnt_2d_no_avg_round_store_32_avx2(
    1263             :                         r + 0, r + 2, offset_no_avg_256, dst);
    1264             : 
    1265    40280100 :                     xy_y_convolve_2tap_32_avx2(
    1266             :                         im + 3 * 32, s_256[0] + 2, s_256[1] + 2, coeffs_256, r);
    1267    40219800 :                     jnt_2d_no_avg_round_store_32_avx2(
    1268             :                         r + 0, r + 2, offset_no_avg_256, dst + 32);
    1269    40278900 :                     im += 2 * 64;
    1270             : 
    1271    40278900 :                     xy_y_convolve_2tap_32_avx2(
    1272             :                         im + 0 * 32, s_256[1] + 0, s_256[0] + 0, coeffs_256, r);
    1273    40217800 :                     jnt_2d_no_avg_round_store_32_avx2(
    1274    40217800 :                         r + 0, r + 2, offset_no_avg_256, dst + dst_stride);
    1275             : 
    1276    40277600 :                     xy_y_convolve_2tap_32_avx2(
    1277             :                         im + 1 * 32, s_256[1] + 2, s_256[0] + 2, coeffs_256, r);
    1278    40217000 :                     jnt_2d_no_avg_round_store_32_avx2(
    1279    40217000 :                         r + 0, r + 2, offset_no_avg_256, dst + dst_stride + 32);
    1280             : 
    1281    40278000 :                     dst += 2 * dst_stride;
    1282    40278000 :                     y -= 2;
    1283    40278000 :                 } while (y);
    1284             :             }
    1285             :         }
    1286             :         else {
    1287             :             __m256i s_256[2][8];
    1288             : 
    1289           0 :             assert(w == 128);
    1290             : 
    1291             :             load_16bit_8rows_avx2(im, 16, s_256[0]);
    1292             : 
    1293           0 :             if (conv_params->do_average) {
    1294           0 :                 if (conv_params->use_jnt_comp_avg) {
    1295             :                     do {
    1296             :                         __m256i r[4];
    1297             : 
    1298           0 :                         xy_y_convolve_2tap_32_avx2(im + 4 * 32,
    1299             :                             s_256[0] + 0,
    1300             :                             s_256[1] + 0,
    1301             :                             coeffs_256,
    1302             :                             r);
    1303           0 :                         jnt_2d_comp_avg_round_store_32_avx2(r + 0,
    1304             :                             r + 2,
    1305             :                             factor_256,
    1306             :                             offset_comp_avg_256,
    1307             :                             dst,
    1308             :                             dst8);
    1309             : 
    1310           0 :                         xy_y_convolve_2tap_32_avx2(im + 5 * 32,
    1311             :                             s_256[0] + 2,
    1312             :                             s_256[1] + 2,
    1313             :                             coeffs_256,
    1314             :                             r);
    1315           0 :                         jnt_2d_comp_avg_round_store_32_avx2(r + 0,
    1316             :                             r + 2,
    1317             :                             factor_256,
    1318             :                             offset_comp_avg_256,
    1319           0 :                             dst + 1 * 32,
    1320             :                             dst8 + 1 * 32);
    1321             : 
    1322           0 :                         xy_y_convolve_2tap_32_avx2(im + 6 * 32,
    1323             :                             s_256[0] + 4,
    1324             :                             s_256[1] + 4,
    1325             :                             coeffs_256,
    1326             :                             r);
    1327           0 :                         jnt_2d_comp_avg_round_store_32_avx2(r + 0,
    1328             :                             r + 2,
    1329             :                             factor_256,
    1330             :                             offset_comp_avg_256,
    1331           0 :                             dst + 2 * 32,
    1332             :                             dst8 + 2 * 32);
    1333             : 
    1334           0 :                         xy_y_convolve_2tap_32_avx2(im + 7 * 32,
    1335             :                             s_256[0] + 6,
    1336             :                             s_256[1] + 6,
    1337             :                             coeffs_256,
    1338             :                             r);
    1339           0 :                         jnt_2d_comp_avg_round_store_32_avx2(r + 0,
    1340             :                             r + 2,
    1341             :                             factor_256,
    1342             :                             offset_comp_avg_256,
    1343           0 :                             dst + 3 * 32,
    1344             :                             dst8 + 3 * 32);
    1345           0 :                         im += 2 * 128;
    1346             : 
    1347           0 :                         xy_y_convolve_2tap_32_avx2(im + 0 * 32,
    1348             :                             s_256[1] + 0,
    1349             :                             s_256[0] + 0,
    1350             :                             coeffs_256,
    1351             :                             r);
    1352           0 :                         jnt_2d_comp_avg_round_store_32_avx2(
    1353             :                             r + 0,
    1354             :                             r + 2,
    1355             :                             factor_256,
    1356             :                             offset_comp_avg_256,
    1357           0 :                             dst + dst8_stride + 0 * 32,
    1358             :                             dst8 + dst8_stride + 0 * 32);
    1359             : 
    1360           0 :                         xy_y_convolve_2tap_32_avx2(im + 1 * 32,
    1361             :                             s_256[1] + 2,
    1362             :                             s_256[0] + 2,
    1363             :                             coeffs_256,
    1364             :                             r);
    1365           0 :                         jnt_2d_comp_avg_round_store_32_avx2(
    1366             :                             r + 0,
    1367             :                             r + 2,
    1368             :                             factor_256,
    1369             :                             offset_comp_avg_256,
    1370           0 :                             dst + dst8_stride + 1 * 32,
    1371           0 :                             dst8 + dst8_stride + 1 * 32);
    1372             : 
    1373           0 :                         xy_y_convolve_2tap_32_avx2(im + 2 * 32,
    1374             :                             s_256[1] + 4,
    1375             :                             s_256[0] + 4,
    1376             :                             coeffs_256,
    1377             :                             r);
    1378           0 :                         jnt_2d_comp_avg_round_store_32_avx2(
    1379             :                             r + 0,
    1380             :                             r + 2,
    1381             :                             factor_256,
    1382             :                             offset_comp_avg_256,
    1383           0 :                             dst + dst8_stride + 2 * 32,
    1384           0 :                             dst8 + dst8_stride + 2 * 32);
    1385             : 
    1386           0 :                         xy_y_convolve_2tap_32_avx2(im + 3 * 32,
    1387             :                             s_256[1] + 6,
    1388             :                             s_256[0] + 6,
    1389             :                             coeffs_256,
    1390             :                             r);
    1391           0 :                         jnt_2d_comp_avg_round_store_32_avx2(
    1392             :                             r + 0,
    1393             :                             r + 2,
    1394             :                             factor_256,
    1395             :                             offset_comp_avg_256,
    1396           0 :                             dst + dst8_stride + 3 * 32,
    1397           0 :                             dst8 + dst8_stride + 3 * 32);
    1398             : 
    1399           0 :                         dst += 2 * dst_stride;
    1400           0 :                         dst8 += 2 * dst8_stride;
    1401           0 :                         y -= 2;
    1402           0 :                     } while (y);
    1403             :                 }
    1404             :                 else {
    1405             :                     do {
    1406             :                         __m256i r[4];
    1407             : 
    1408           0 :                         xy_y_convolve_2tap_32_avx2(im + 4 * 32,
    1409             :                             s_256[0] + 0,
    1410             :                             s_256[1] + 0,
    1411             :                             coeffs_256,
    1412             :                             r);
    1413           0 :                         jnt_2d_avg_round_store_32_avx2(r + 0,
    1414             :                             r + 2,
    1415             :                             offset_avg_256,
    1416             :                             dst + 0 * 32,
    1417             :                             dst8 + 0 * 32);
    1418             : 
    1419           0 :                         xy_y_convolve_2tap_32_avx2(im + 5 * 32,
    1420             :                             s_256[0] + 2,
    1421             :                             s_256[1] + 2,
    1422             :                             coeffs_256,
    1423             :                             r);
    1424           0 :                         jnt_2d_avg_round_store_32_avx2(r + 0,
    1425             :                             r + 2,
    1426             :                             offset_avg_256,
    1427           0 :                             dst + 1 * 32,
    1428             :                             dst8 + 1 * 32);
    1429             : 
    1430           0 :                         xy_y_convolve_2tap_32_avx2(im + 6 * 32,
    1431             :                             s_256[0] + 4,
    1432             :                             s_256[1] + 4,
    1433             :                             coeffs_256,
    1434             :                             r);
    1435           0 :                         jnt_2d_avg_round_store_32_avx2(r + 0,
    1436             :                             r + 2,
    1437             :                             offset_avg_256,
    1438           0 :                             dst + 2 * 32,
    1439             :                             dst8 + 2 * 32);
    1440             : 
    1441           0 :                         xy_y_convolve_2tap_32_avx2(im + 7 * 32,
    1442             :                             s_256[0] + 6,
    1443             :                             s_256[1] + 6,
    1444             :                             coeffs_256,
    1445             :                             r);
    1446           0 :                         jnt_2d_avg_round_store_32_avx2(r + 0,
    1447             :                             r + 2,
    1448             :                             offset_avg_256,
    1449           0 :                             dst + 3 * 32,
    1450             :                             dst8 + 3 * 32);
    1451           0 :                         im += 2 * 128;
    1452             : 
    1453           0 :                         xy_y_convolve_2tap_32_avx2(im + 0 * 32,
    1454             :                             s_256[1] + 0,
    1455             :                             s_256[0] + 0,
    1456             :                             coeffs_256,
    1457             :                             r);
    1458           0 :                         jnt_2d_avg_round_store_32_avx2(
    1459             :                             r + 0,
    1460             :                             r + 2,
    1461             :                             offset_avg_256,
    1462           0 :                             dst + dst_stride + 0 * 32,
    1463             :                             dst8 + dst8_stride + 0 * 32);
    1464             : 
    1465           0 :                         xy_y_convolve_2tap_32_avx2(im + 1 * 32,
    1466             :                             s_256[1] + 2,
    1467             :                             s_256[0] + 2,
    1468             :                             coeffs_256,
    1469             :                             r);
    1470           0 :                         jnt_2d_avg_round_store_32_avx2(
    1471             :                             r + 0,
    1472             :                             r + 2,
    1473             :                             offset_avg_256,
    1474           0 :                             dst + dst_stride + 1 * 32,
    1475           0 :                             dst8 + dst8_stride + 1 * 32);
    1476             : 
    1477           0 :                         xy_y_convolve_2tap_32_avx2(im + 2 * 32,
    1478             :                             s_256[1] + 4,
    1479             :                             s_256[0] + 4,
    1480             :                             coeffs_256,
    1481             :                             r);
    1482           0 :                         jnt_2d_avg_round_store_32_avx2(
    1483             :                             r + 0,
    1484             :                             r + 2,
    1485             :                             offset_avg_256,
    1486           0 :                             dst + dst_stride + 2 * 32,
    1487           0 :                             dst8 + dst8_stride + 2 * 32);
    1488             : 
    1489           0 :                         xy_y_convolve_2tap_32_avx2(im + 3 * 32,
    1490             :                             s_256[1] + 6,
    1491             :                             s_256[0] + 6,
    1492             :                             coeffs_256,
    1493             :                             r);
    1494           0 :                         jnt_2d_avg_round_store_32_avx2(
    1495             :                             r + 0,
    1496             :                             r + 2,
    1497             :                             offset_avg_256,
    1498           0 :                             dst + dst_stride + 3 * 32,
    1499           0 :                             dst8 + dst8_stride + 3 * 32);
    1500             : 
    1501           0 :                         dst += 2 * dst_stride;
    1502           0 :                         dst8 += 2 * dst8_stride;
    1503           0 :                         y -= 2;
    1504           0 :                     } while (y);
    1505             :                 }
    1506             :             }
    1507             :             else {
    1508             :                 do {
    1509             :                     __m256i r[4];
    1510             : 
    1511           0 :                     xy_y_convolve_2tap_32_avx2(
    1512             :                         im + 4 * 32, s_256[0] + 0, s_256[1] + 0, coeffs_256, r);
    1513           0 :                     jnt_2d_no_avg_round_store_32_avx2(
    1514             :                         r + 0, r + 2, offset_no_avg_256, dst + 0 * 32);
    1515             : 
    1516           0 :                     xy_y_convolve_2tap_32_avx2(
    1517             :                         im + 5 * 32, s_256[0] + 2, s_256[1] + 2, coeffs_256, r);
    1518           0 :                     jnt_2d_no_avg_round_store_32_avx2(
    1519             :                         r + 0, r + 2, offset_no_avg_256, dst + 1 * 32);
    1520             : 
    1521           0 :                     xy_y_convolve_2tap_32_avx2(
    1522             :                         im + 6 * 32, s_256[0] + 4, s_256[1] + 4, coeffs_256, r);
    1523           0 :                     jnt_2d_no_avg_round_store_32_avx2(
    1524             :                         r + 0, r + 2, offset_no_avg_256, dst + 2 * 32);
    1525             : 
    1526           0 :                     xy_y_convolve_2tap_32_avx2(
    1527             :                         im + 7 * 32, s_256[0] + 6, s_256[1] + 6, coeffs_256, r);
    1528           0 :                     jnt_2d_no_avg_round_store_32_avx2(
    1529             :                         r + 0, r + 2, offset_no_avg_256, dst + 3 * 32);
    1530           0 :                     im += 2 * 128;
    1531             : 
    1532           0 :                     xy_y_convolve_2tap_32_avx2(
    1533             :                         im + 0 * 32, s_256[1] + 0, s_256[0] + 0, coeffs_256, r);
    1534           0 :                     jnt_2d_no_avg_round_store_32_avx2(
    1535             :                         r + 0,
    1536             :                         r + 2,
    1537             :                         offset_no_avg_256,
    1538           0 :                         dst + dst_stride + 0 * 32);
    1539             : 
    1540           0 :                     xy_y_convolve_2tap_32_avx2(
    1541             :                         im + 1 * 32, s_256[1] + 2, s_256[0] + 2, coeffs_256, r);
    1542           0 :                     jnt_2d_no_avg_round_store_32_avx2(
    1543             :                         r + 0,
    1544             :                         r + 2,
    1545             :                         offset_no_avg_256,
    1546           0 :                         dst + dst_stride + 1 * 32);
    1547             : 
    1548           0 :                     xy_y_convolve_2tap_32_avx2(
    1549             :                         im + 2 * 32, s_256[1] + 4, s_256[0] + 4, coeffs_256, r);
    1550           0 :                     jnt_2d_no_avg_round_store_32_avx2(
    1551             :                         r + 0,
    1552             :                         r + 2,
    1553             :                         offset_no_avg_256,
    1554           0 :                         dst + dst_stride + 2 * 32);
    1555             : 
    1556           0 :                     xy_y_convolve_2tap_32_avx2(
    1557             :                         im + 3 * 32, s_256[1] + 6, s_256[0] + 6, coeffs_256, r);
    1558           0 :                     jnt_2d_no_avg_round_store_32_avx2(
    1559             :                         r + 0,
    1560             :                         r + 2,
    1561             :                         offset_no_avg_256,
    1562           0 :                         dst + dst_stride + 3 * 32);
    1563             : 
    1564           0 :                     dst += 2 * dst_stride;
    1565           0 :                     y -= 2;
    1566           0 :                 } while (y);
    1567             :             }
    1568             :         }
    1569             :     }
    1570    74507400 : }
    1571             : 
    1572    18493400 : static void jnt_convolve_2d_ver_2tap_half_avx2(
    1573             :     const int16_t *const im_block, const int32_t w, const int32_t h,
    1574             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    1575             :     const ConvolveParams *const conv_params, uint8_t *dst8,
    1576             :     const int32_t dst8_stride) {
    1577    18493400 :     const int32_t dst_stride = conv_params->dst_stride;
    1578    18493400 :     const int32_t bd = 8;
    1579    18493400 :     const int32_t round_0 = 3;
    1580    18493400 :     const int16_t *im = im_block;
    1581    18493400 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
    1582    18493400 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;      // 19
    1583    18493400 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;  // 4
    1584    18493400 :     const int32_t round_offset = 1 << (offset_bits - round_1);
    1585    18493400 :     const int32_t factor =
    1586    18493400 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
    1587    18493400 :     const int32_t offset_comp_avg =
    1588    18493400 :         (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
    1589    18493400 :         (round_offset << DIST_PRECISION_BITS) -
    1590    18493400 :         (round_offset << (DIST_PRECISION_BITS - 1)) +
    1591    18493400 :         (1 << (round_bits + DIST_PRECISION_BITS - 1));
    1592    18493400 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
    1593    18493400 :     const __m128i factor_128 = _mm_set1_epi32(factor);
    1594    18493400 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
    1595    18493400 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
    1596    18493400 :     const int32_t offset_avg =
    1597    18493400 :         (1 << (round_1 - COMPOUND_ROUND1_BITS)) +
    1598    18493400 :         (1 << (round_bits + round_1 - COMPOUND_ROUND1_BITS + 1)) -
    1599    18493400 :         (1 << (offset_bits - COMPOUND_ROUND1_BITS + 1)) -
    1600    18493400 :         (1 << (offset_bits - COMPOUND_ROUND1_BITS));
    1601    18493400 :     const int32_t offset_no_avg =
    1602    18493400 :         (1 << (round_1 - COMPOUND_ROUND1_BITS)) +
    1603    18493400 :         (1 << (offset_bits - COMPOUND_ROUND1_BITS + 1)) +
    1604    18493400 :         (1 << (offset_bits - COMPOUND_ROUND1_BITS));
    1605    18493400 :     const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
    1606    18493400 :     const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
    1607    18493400 :     const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
    1608    18493400 :     const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
    1609    18493400 :     ConvBufType *dst = conv_params->dst;
    1610    18493400 :     int32_t y = h;
    1611             : 
    1612             :     (void)filter_params_y;
    1613             :     (void)subpel_y_q4;
    1614             : 
    1615    18493400 :     if (w == 2) {
    1616             :         __m128i s_32[2];
    1617             : 
    1618           0 :         s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
    1619             : 
    1620           0 :         if (conv_params->do_average) {
    1621           0 :             if (conv_params->use_jnt_comp_avg) {
    1622             :                 do {
    1623             :                     const __m128i res =
    1624           0 :                         xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
    1625           0 :                     jnt_2d_comp_avg_round_store_half_pel_2x2_sse2(
    1626             :                         res,
    1627             :                         factor_128,
    1628             :                         offset_comp_avg_128,
    1629             :                         dst,
    1630             :                         dst_stride,
    1631             :                         dst8,
    1632             :                         dst8_stride);
    1633           0 :                     im += 2 * 2;
    1634           0 :                     dst += 2 * dst_stride;
    1635           0 :                     dst8 += 2 * dst8_stride;
    1636           0 :                     y -= 2;
    1637           0 :                 } while (y);
    1638             :             }
    1639             :             else {
    1640             :                 do {
    1641             :                     const __m128i res =
    1642           0 :                         xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
    1643           0 :                     jnt_2d_avg_round_store_half_pel_2x2_sse2(res,
    1644             :                         offset_avg_128,
    1645             :                         dst,
    1646             :                         dst_stride,
    1647             :                         dst8,
    1648             :                         dst8_stride);
    1649           0 :                     im += 2 * 2;
    1650           0 :                     dst += 2 * dst_stride;
    1651           0 :                     dst8 += 2 * dst8_stride;
    1652           0 :                     y -= 2;
    1653           0 :                 } while (y);
    1654             :             }
    1655             :         }
    1656             :         else {
    1657             :             do {
    1658             :                 const __m128i res =
    1659           0 :                     xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
    1660           0 :                 jnt_2d_no_avg_round_store_half_pel_2x2_sse2(
    1661             :                     res, offset_no_avg_128, dst, dst_stride);
    1662           0 :                 im += 2 * 2;
    1663           0 :                 dst += 2 * dst_stride;
    1664           0 :                 y -= 2;
    1665           0 :             } while (y);
    1666             :         }
    1667             :     }
    1668    18493400 :     else if (w == 4) {
    1669             :         __m128i s_64[2];
    1670             : 
    1671           0 :         s_64[0] = _mm_loadl_epi64((__m128i *)im);
    1672             : 
    1673           0 :         if (conv_params->do_average) {
    1674           0 :             if (conv_params->use_jnt_comp_avg) {
    1675             :                 do {
    1676             :                     const __m128i res =
    1677           0 :                         xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
    1678           0 :                     jnt_2d_comp_avg_round_store_half_pel_4x2_sse2(
    1679             :                         res,
    1680             :                         factor_128,
    1681             :                         offset_comp_avg_128,
    1682             :                         dst,
    1683             :                         dst_stride,
    1684             :                         dst8,
    1685             :                         dst8_stride);
    1686           0 :                     im += 2 * 4;
    1687           0 :                     dst += 2 * dst_stride;
    1688           0 :                     dst8 += 2 * dst8_stride;
    1689           0 :                     y -= 2;
    1690           0 :                 } while (y);
    1691             :             }
    1692             :             else {
    1693             :                 do {
    1694             :                     const __m128i res =
    1695           0 :                         xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
    1696           0 :                     jnt_2d_avg_round_store_half_pel_4x2_sse2(res,
    1697             :                         offset_avg_128,
    1698             :                         dst,
    1699             :                         dst_stride,
    1700             :                         dst8,
    1701             :                         dst8_stride);
    1702           0 :                     im += 2 * 4;
    1703           0 :                     dst += 2 * dst_stride;
    1704           0 :                     dst8 += 2 * dst8_stride;
    1705           0 :                     y -= 2;
    1706           0 :                 } while (y);
    1707             :             }
    1708             :         }
    1709             :         else {
    1710             :             do {
    1711             :                 const __m128i res =
    1712           0 :                     xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
    1713           0 :                 jnt_2d_no_avg_round_store_half_pel_4x2_sse2(
    1714             :                     res, offset_no_avg_128, dst, dst_stride);
    1715           0 :                 im += 2 * 4;
    1716           0 :                 dst += 2 * dst_stride;
    1717           0 :                 y -= 2;
    1718           0 :             } while (y);
    1719             :         }
    1720             :     }
    1721    18493400 :     else if (w == 8) {
    1722             :         __m128i s_128[2];
    1723             : 
    1724     8011830 :         s_128[0] = _mm_load_si128((__m128i *)im);
    1725             : 
    1726     8011830 :         if (conv_params->do_average) {
    1727     2171040 :             if (conv_params->use_jnt_comp_avg) {
    1728             :                 do {
    1729             :                     const __m256i res =
    1730     8006750 :                         xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
    1731     8006130 :                     jnt_2d_comp_avg_round_store_half_pel_8x2_avx2(
    1732             :                         res,
    1733             :                         factor_256,
    1734             :                         offset_comp_avg_256,
    1735             :                         dst,
    1736             :                         dst_stride,
    1737             :                         dst8,
    1738             :                         dst8_stride);
    1739     8006410 :                     im += 2 * 8;
    1740     8006410 :                     dst += 2 * dst_stride;
    1741     8006410 :                     dst8 += 2 * dst8_stride;
    1742     8006410 :                     y -= 2;
    1743     8006410 :                 } while (y);
    1744             :             }
    1745             :             else {
    1746             :                 do {
    1747             :                     const __m256i res =
    1748     8006490 :                         xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
    1749     8006050 :                     jnt_2d_avg_round_store_half_pel_8x2_avx2(res,
    1750             :                         offset_avg_256,
    1751             :                         dst,
    1752             :                         dst_stride,
    1753             :                         dst8,
    1754             :                         dst8_stride);
    1755     8005970 :                     im += 2 * 8;
    1756     8005970 :                     dst += 2 * dst_stride;
    1757     8005970 :                     dst8 += 2 * dst8_stride;
    1758     8005970 :                     y -= 2;
    1759     8005970 :                 } while (y);
    1760             :             }
    1761             :         }
    1762             :         else {
    1763             :             do {
    1764             :                 const __m256i res =
    1765    40598300 :                     xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
    1766    40595300 :                 jnt_2d_no_avg_round_store_half_pel_8x2_avx2(
    1767             :                     res, offset_no_avg_256, dst, dst_stride);
    1768    40598100 :                 im += 2 * 8;
    1769    40598100 :                 dst += 2 * dst_stride;
    1770    40598100 :                 y -= 2;
    1771    40598100 :             } while (y);
    1772             :         }
    1773             :     }
    1774    10481600 :     else if (w == 16) {
    1775             :         __m256i s_256[2], r[2];
    1776             : 
    1777     6328260 :         s_256[0] = _mm256_load_si256((__m256i *)im);
    1778             : 
    1779     6328260 :         if (conv_params->do_average) {
    1780     1693650 :             if (conv_params->use_jnt_comp_avg) {
    1781             :                 do {
    1782     8432670 :                     xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
    1783     8431980 :                     jnt_2d_comp_avg_round_store_half_pel_16x2_avx2(
    1784             :                         r,
    1785             :                         factor_256,
    1786             :                         offset_comp_avg_256,
    1787             :                         dst,
    1788             :                         dst_stride,
    1789             :                         dst8,
    1790             :                         dst8_stride);
    1791     8432270 :                     im += 2 * 16;
    1792     8432270 :                     dst += 2 * dst_stride;
    1793     8432270 :                     dst8 += 2 * dst8_stride;
    1794     8432270 :                     y -= 2;
    1795     8432270 :                 } while (y);
    1796             :             }
    1797             :             else {
    1798             :                 do {
    1799     8433100 :                     xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
    1800     8432670 :                     jnt_2d_avg_round_store_half_pel_16x2_avx2(
    1801             :                         r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
    1802     8432800 :                     im += 2 * 16;
    1803     8432800 :                     dst += 2 * dst_stride;
    1804     8432800 :                     dst8 += 2 * dst8_stride;
    1805     8432800 :                     y -= 2;
    1806     8432800 :                 } while (y);
    1807             :             }
    1808             :         }
    1809             :         else {
    1810             :             do {
    1811    43079600 :                 xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
    1812    43086400 :                 jnt_2d_no_avg_round_store_half_pel_16x2_avx2(
    1813             :                     r, offset_no_avg_256, dst, dst_stride);
    1814    43079900 :                 im += 2 * 16;
    1815    43079900 :                 dst += 2 * dst_stride;
    1816    43079900 :                 y -= 2;
    1817    43079900 :             } while (y);
    1818             :         }
    1819             :     }
    1820     4153340 :     else if (w == 32) {
    1821             :         __m256i s_256[2][2];
    1822             : 
    1823     3252100 :         s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
    1824     3252100 :         s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
    1825             : 
    1826     3252100 :         if (conv_params->do_average) {
    1827      959619 :             if (conv_params->use_jnt_comp_avg) {
    1828             :                 do {
    1829             :                     __m256i r[2];
    1830             : 
    1831     5377610 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1832             :                         im + 1 * 32, s_256[0], s_256[1], r);
    1833             :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    1834             :                         r, factor_256, offset_comp_avg_256, dst, dst8);
    1835             : 
    1836     5377580 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1837             :                         im + 2 * 32, s_256[1], s_256[0], r);
    1838     5377440 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    1839             :                         r,
    1840             :                         factor_256,
    1841             :                         offset_comp_avg_256,
    1842     5377440 :                         dst + dst_stride,
    1843             :                         dst8 + dst8_stride);
    1844             : 
    1845     5377450 :                     im += 2 * 32;
    1846     5377450 :                     dst += 2 * dst_stride;
    1847     5377450 :                     dst8 += 2 * dst8_stride;
    1848     5377450 :                     y -= 2;
    1849     5377450 :                 } while (y);
    1850             :             }
    1851             :             else {
    1852             :                 do {
    1853             :                     __m256i r[2];
    1854             : 
    1855     5377620 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1856             :                         im + 1 * 32, s_256[0], s_256[1], r);
    1857     5377580 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    1858             :                         r, offset_avg_256, dst, dst8);
    1859             : 
    1860     5377480 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1861             :                         im + 2 * 32, s_256[1], s_256[0], r);
    1862     5377520 :                     jnt_2d_avg_round_store_half_pel_32_avx2(r,
    1863             :                         offset_avg_256,
    1864     5377520 :                         dst + dst_stride,
    1865             :                         dst8 + dst8_stride);
    1866             : 
    1867     5377460 :                     im += 2 * 32;
    1868     5377460 :                     dst += 2 * dst_stride;
    1869     5377460 :                     dst8 += 2 * dst8_stride;
    1870     5377460 :                     y -= 2;
    1871     5377460 :                 } while (y);
    1872             :             }
    1873             :         }
    1874             :         else {
    1875             :             do {
    1876             :                 __m256i r[2];
    1877             : 
    1878    26125500 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    1879             :                     im + 1 * 32, s_256[0], s_256[1], r);
    1880    26130400 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    1881             :                     r, offset_no_avg_256, dst);
    1882             : 
    1883    26127200 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    1884             :                     im + 2 * 32, s_256[1], s_256[0], r);
    1885    26130200 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    1886    26130200 :                     r, offset_no_avg_256, dst + dst_stride);
    1887             : 
    1888    26125700 :                 im += 2 * 32;
    1889    26125700 :                 dst += 2 * dst_stride;
    1890    26125700 :                 y -= 2;
    1891    26125700 :             } while (y);
    1892             :         }
    1893             :     }
    1894      901232 :     else if (w == 64) {
    1895             :         __m256i s_256[2][4];
    1896             : 
    1897      910630 :         s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
    1898      910630 :         s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
    1899      910630 :         s_256[0][2] = _mm256_load_si256((__m256i *)(im + 2 * 16));
    1900      910630 :         s_256[0][3] = _mm256_load_si256((__m256i *)(im + 3 * 16));
    1901             : 
    1902      910630 :         if (conv_params->do_average) {
    1903      301416 :             if (conv_params->use_jnt_comp_avg) {
    1904             :                 do {
    1905             :                     __m256i r[2];
    1906             : 
    1907     2425470 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1908             :                         im + 2 * 32, s_256[0] + 0, s_256[1] + 0, r);
    1909             :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    1910             :                         r, factor_256, offset_comp_avg_256, dst, dst8);
    1911             : 
    1912     2425400 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1913             :                         im + 3 * 32, s_256[0] + 2, s_256[1] + 2, r);
    1914     2425350 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    1915             :                         r,
    1916             :                         factor_256,
    1917             :                         offset_comp_avg_256,
    1918     2425350 :                         dst + 32,
    1919             :                         dst8 + 32);
    1920     2425440 :                     im += 2 * 64;
    1921             : 
    1922     2425440 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1923             :                         im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
    1924     2425400 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    1925             :                         r,
    1926             :                         factor_256,
    1927             :                         offset_comp_avg_256,
    1928     2425400 :                         dst + dst_stride,
    1929             :                         dst8 + dst8_stride);
    1930             : 
    1931     2425390 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1932             :                         im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
    1933     2425370 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    1934             :                         r,
    1935             :                         factor_256,
    1936             :                         offset_comp_avg_256,
    1937     2425370 :                         dst + dst_stride + 32,
    1938     2425370 :                         dst8 + dst8_stride + 32);
    1939             : 
    1940     2425430 :                     dst += 2 * dst_stride;
    1941     2425430 :                     dst8 += 2 * dst8_stride;
    1942     2425430 :                     y -= 2;
    1943     2425430 :                 } while (y);
    1944             :             }
    1945             :             else {
    1946             :                 do {
    1947             :                     __m256i r[2];
    1948             : 
    1949     2425380 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1950             :                         im + 2 * 32, s_256[0] + 0, s_256[1] + 0, r);
    1951     2425370 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    1952             :                         r, offset_avg_256, dst, dst8);
    1953             : 
    1954     2425350 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1955             :                         im + 3 * 32, s_256[0] + 2, s_256[1] + 2, r);
    1956     2425370 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    1957     2425370 :                         r, offset_avg_256, dst + 32, dst8 + 32);
    1958     2425340 :                     im += 2 * 64;
    1959             : 
    1960     2425340 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1961             :                         im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
    1962     2425340 :                     jnt_2d_avg_round_store_half_pel_32_avx2(r,
    1963             :                         offset_avg_256,
    1964     2425340 :                         dst + dst_stride,
    1965             :                         dst8 + dst8_stride);
    1966             : 
    1967     2425340 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    1968             :                         im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
    1969     2425370 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    1970             :                         r,
    1971             :                         offset_avg_256,
    1972     2425370 :                         dst + dst_stride + 32,
    1973     2425370 :                         dst8 + dst8_stride + 32);
    1974             : 
    1975     2425340 :                     dst += 2 * dst_stride;
    1976     2425340 :                     dst8 += 2 * dst8_stride;
    1977     2425340 :                     y -= 2;
    1978     2425340 :                 } while (y);
    1979             :             }
    1980             :         }
    1981             :         else {
    1982             :             do {
    1983             :                 __m256i r[2];
    1984             : 
    1985     9798030 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    1986             :                     im + 2 * 32, s_256[0] + 0, s_256[1] + 0, r);
    1987     9799170 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    1988             :                     r, offset_no_avg_256, dst);
    1989             : 
    1990     9798260 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    1991             :                     im + 3 * 32, s_256[0] + 2, s_256[1] + 2, r);
    1992     9799090 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    1993             :                     r, offset_no_avg_256, dst + 32);
    1994     9798200 :                 im += 2 * 64;
    1995             : 
    1996     9798200 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    1997             :                     im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
    1998     9799230 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    1999     9799230 :                     r, offset_no_avg_256, dst + dst_stride);
    2000             : 
    2001     9798250 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    2002             :                     im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
    2003     9799080 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    2004     9799080 :                     r, offset_no_avg_256, dst + dst_stride + 32);
    2005             : 
    2006     9798120 :                 dst += 2 * dst_stride;
    2007     9798120 :                 y -= 2;
    2008     9798120 :             } while (y);
    2009             :         }
    2010             :     }
    2011             :     else {
    2012             :         __m256i s_256[2][8];
    2013             : 
    2014           0 :         assert(w == 128);
    2015             : 
    2016             :         load_16bit_8rows_avx2(im, 16, s_256[0]);
    2017             : 
    2018           0 :         if (conv_params->do_average) {
    2019           0 :             if (conv_params->use_jnt_comp_avg) {
    2020             :                 do {
    2021             :                     __m256i r[2];
    2022             : 
    2023           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2024             :                         im + 4 * 32, s_256[0] + 0, s_256[1] + 0, r);
    2025             :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    2026             :                         r,
    2027             :                         factor_256,
    2028             :                         offset_comp_avg_256,
    2029             :                         dst + 0 * 32,
    2030             :                         dst8 + 0 * 32);
    2031             : 
    2032           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2033             :                         im + 5 * 32, s_256[0] + 2, s_256[1] + 2, r);
    2034           0 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    2035             :                         r,
    2036             :                         factor_256,
    2037             :                         offset_comp_avg_256,
    2038           0 :                         dst + 1 * 32,
    2039             :                         dst8 + 1 * 32);
    2040             : 
    2041           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2042             :                         im + 6 * 32, s_256[0] + 4, s_256[1] + 4, r);
    2043           0 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    2044             :                         r,
    2045             :                         factor_256,
    2046             :                         offset_comp_avg_256,
    2047           0 :                         dst + 2 * 32,
    2048             :                         dst8 + 2 * 32);
    2049             : 
    2050           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2051             :                         im + 7 * 32, s_256[0] + 6, s_256[1] + 6, r);
    2052           0 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    2053             :                         r,
    2054             :                         factor_256,
    2055             :                         offset_comp_avg_256,
    2056           0 :                         dst + 3 * 32,
    2057             :                         dst8 + 3 * 32);
    2058           0 :                     im += 2 * 128;
    2059             : 
    2060           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2061             :                         im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
    2062           0 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    2063             :                         r,
    2064             :                         factor_256,
    2065             :                         offset_comp_avg_256,
    2066           0 :                         dst + dst_stride + 0 * 32,
    2067             :                         dst8 + dst8_stride + 0 * 32);
    2068             : 
    2069           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2070             :                         im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
    2071           0 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    2072             :                         r,
    2073             :                         factor_256,
    2074             :                         offset_comp_avg_256,
    2075           0 :                         dst + dst_stride + 1 * 32,
    2076           0 :                         dst8 + dst8_stride + 1 * 32);
    2077             : 
    2078           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2079             :                         im + 2 * 32, s_256[1] + 4, s_256[0] + 4, r);
    2080           0 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    2081             :                         r,
    2082             :                         factor_256,
    2083             :                         offset_comp_avg_256,
    2084           0 :                         dst + dst_stride + 2 * 32,
    2085           0 :                         dst8 + dst8_stride + 2 * 32);
    2086             : 
    2087           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2088             :                         im + 3 * 32, s_256[1] + 6, s_256[0] + 6, r);
    2089           0 :                     jnt_2d_comp_avg_round_store_half_pel_32_avx2(
    2090             :                         r,
    2091             :                         factor_256,
    2092             :                         offset_comp_avg_256,
    2093           0 :                         dst + dst_stride + 3 * 32,
    2094           0 :                         dst8 + dst8_stride + 3 * 32);
    2095             : 
    2096           0 :                     dst += 2 * dst_stride;
    2097           0 :                     dst8 += 2 * dst8_stride;
    2098           0 :                     y -= 2;
    2099           0 :                 } while (y);
    2100             :             }
    2101             :             else {
    2102             :                 do {
    2103             :                     __m256i r[2];
    2104             : 
    2105           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2106             :                         im + 4 * 32, s_256[0] + 0, s_256[1] + 0, r);
    2107           0 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    2108             :                         r, offset_avg_256, dst + 0 * 32, dst8 + 0 * 32);
    2109             : 
    2110           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2111             :                         im + 5 * 32, s_256[0] + 2, s_256[1] + 2, r);
    2112           0 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    2113           0 :                         r, offset_avg_256, dst + 1 * 32, dst8 + 1 * 32);
    2114             : 
    2115           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2116             :                         im + 6 * 32, s_256[0] + 4, s_256[1] + 4, r);
    2117           0 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    2118           0 :                         r, offset_avg_256, dst + 2 * 32, dst8 + 2 * 32);
    2119             : 
    2120           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2121             :                         im + 7 * 32, s_256[0] + 6, s_256[1] + 6, r);
    2122           0 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    2123           0 :                         r, offset_avg_256, dst + 3 * 32, dst8 + 3 * 32);
    2124           0 :                     im += 2 * 128;
    2125             : 
    2126           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2127             :                         im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
    2128           0 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    2129             :                         r,
    2130             :                         offset_avg_256,
    2131           0 :                         dst + dst_stride + 0 * 32,
    2132             :                         dst8 + dst8_stride + 0 * 32);
    2133             : 
    2134           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2135             :                         im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
    2136           0 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    2137             :                         r,
    2138             :                         offset_avg_256,
    2139           0 :                         dst + dst_stride + 1 * 32,
    2140           0 :                         dst8 + dst8_stride + 1 * 32);
    2141             : 
    2142           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2143             :                         im + 2 * 32, s_256[1] + 4, s_256[0] + 4, r);
    2144           0 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    2145             :                         r,
    2146             :                         offset_avg_256,
    2147           0 :                         dst + dst_stride + 2 * 32,
    2148           0 :                         dst8 + dst8_stride + 2 * 32);
    2149             : 
    2150           0 :                     xy_y_convolve_2tap_half_pel_32_avx2(
    2151             :                         im + 3 * 32, s_256[1] + 6, s_256[0] + 6, r);
    2152           0 :                     jnt_2d_avg_round_store_half_pel_32_avx2(
    2153             :                         r,
    2154             :                         offset_avg_256,
    2155           0 :                         dst + dst_stride + 3 * 32,
    2156           0 :                         dst8 + dst8_stride + 3 * 32);
    2157             : 
    2158           0 :                     dst += 2 * dst_stride;
    2159           0 :                     dst8 += 2 * dst8_stride;
    2160           0 :                     y -= 2;
    2161           0 :                 } while (y);
    2162             :             }
    2163             :         }
    2164             :         else {
    2165             :             do {
    2166             :                 __m256i r[2];
    2167             : 
    2168           0 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    2169             :                     im + 4 * 32, s_256[0] + 0, s_256[1] + 0, r);
    2170           0 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    2171             :                     r, offset_no_avg_256, dst + 0 * 32);
    2172             : 
    2173           0 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    2174             :                     im + 5 * 32, s_256[0] + 2, s_256[1] + 2, r);
    2175           0 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    2176             :                     r, offset_no_avg_256, dst + 1 * 32);
    2177             : 
    2178           0 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    2179             :                     im + 6 * 32, s_256[0] + 4, s_256[1] + 4, r);
    2180           0 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    2181             :                     r, offset_no_avg_256, dst + 2 * 32);
    2182             : 
    2183           0 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    2184             :                     im + 7 * 32, s_256[0] + 6, s_256[1] + 6, r);
    2185           0 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    2186             :                     r, offset_no_avg_256, dst + 3 * 32);
    2187           0 :                 im += 2 * 128;
    2188             : 
    2189           0 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    2190             :                     im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
    2191           0 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    2192           0 :                     r, offset_no_avg_256, dst + dst_stride + 0 * 32);
    2193             : 
    2194           0 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    2195             :                     im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
    2196           0 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    2197           0 :                     r, offset_no_avg_256, dst + dst_stride + 1 * 32);
    2198             : 
    2199           0 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    2200             :                     im + 2 * 32, s_256[1] + 4, s_256[0] + 4, r);
    2201           0 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    2202           0 :                     r, offset_no_avg_256, dst + dst_stride + 2 * 32);
    2203             : 
    2204           0 :                 xy_y_convolve_2tap_half_pel_32_avx2(
    2205             :                     im + 3 * 32, s_256[1] + 6, s_256[0] + 6, r);
    2206           0 :                 jnt_2d_no_avg_round_store_half_pel_32_avx2(
    2207           0 :                     r, offset_no_avg_256, dst + dst_stride + 3 * 32);
    2208             : 
    2209           0 :                 dst += 2 * dst_stride;
    2210           0 :                 y -= 2;
    2211           0 :             } while (y);
    2212             :         }
    2213             :     }
    2214    18501300 : }
    2215             : 
    2216     1653540 : static void jnt_convolve_2d_ver_4tap_avx2(  // 4-tap y-direction stage of the compound 2D convolve (per the name and filter_params_y)
    2217             :     const int16_t *const im_block, const int32_t w, const int32_t h,  // im_block: 16-bit rows read w samples apart; w is 2, 4, 8 or 16
    2218             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,  // both are only forwarded to prepare_coeffs_4tap_*
    2219             :     const ConvolveParams *const conv_params, uint8_t *dst8,  // dst8 is only handed to the averaging store helpers
    2220             :     const int32_t dst8_stride) {
    2221     1653540 :     const int32_t dst_stride = conv_params->dst_stride;  // stride of conv_params->dst, in ConvBufType elements
    2222     1653540 :     const int32_t bd = 8;  // bit depth is hard-coded to 8 in this function
    2223     1653540 :     const int32_t round_0 = 3;  // NOTE(review): presumably the rounding shift of the preceding horizontal stage - confirm
    2224     1653540 :     const int16_t *im = im_block;  // read cursor; advanced by 2 * w after every pair of output rows
    2225     1653540 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
    2226     1653540 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;      // 19
    2227     1653540 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;  // 4
    2228     1653540 :     const int32_t round_offset = 1 << (offset_bits - round_1);
    2229     1653540 :     const int32_t factor =  // fwd_offset in the low 16 bits, bck_offset in the high 16 bits
    2230     1653540 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
    2231     1653540 :     const int32_t offset_comp_avg =  // constant term passed, together with factor, to the *_comp_avg_* store helpers
    2232     1653540 :         (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
    2233     1653540 :         (round_offset << DIST_PRECISION_BITS) -
    2234     1653540 :         (round_offset << (DIST_PRECISION_BITS - 1)) +
    2235     1653540 :         (1 << (round_bits + DIST_PRECISION_BITS - 1));
    2236     1653540 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);  // 128-bit copies serve the w == 2 path
    2237     1653540 :     const __m128i factor_128 = _mm_set1_epi32(factor);
    2238     1653540 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);  // 256-bit copies serve w == 4, 8, 16
    2239     1653540 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
    2240     1653540 :     const int32_t offset_avg = (1 << (round_1 - 1)) +  // constant term for the plain-average store helpers
    2241     1653540 :         (1 << (round_bits + round_1)) -
    2242     1653540 :         (1 << offset_bits) - (1 << (offset_bits - 1));
    2243     1653540 :     const int32_t offset_no_avg =  // constant term for the no-average store helpers (these write dst only)
    2244     1653540 :         (1 << (round_1 - 1)) + (1 << offset_bits) + (1 << (offset_bits - 1));
    2245     1653540 :     const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
    2246     1653540 :     const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
    2247     1653540 :     const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
    2248     1653540 :     const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
    2249     1653540 :     int32_t y = h;  // rows left; every loop below does y -= 2 until y == 0, so h must be even and > 0
    2250     1653540 :     ConvBufType *dst = conv_params->dst;  // compound buffer cursor; advanced two rows per loop pass
    2251             :     __m128i coeffs_128[4];  // filled only on the w == 2 path
    2252             :     __m256i coeffs_256[4];  // filled only on the w >= 4 paths
    2253             : 
    2254     1653540 :     if (w == 2) {  // narrowest case: handled with 128-bit helpers
    2255             :         __m128i s_32[4], ss_128[2];
    2256             : 
    2257           0 :         prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
    2258             : 
    2259           0 :         s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));  // each 32-bit load picks up one row of two int16_t samples
    2260           0 :         s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    2261           0 :         s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));  // only rows 0..2 are preloaded; s_32[3] is not set here
    2262             : 
    2263           0 :         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);  // rows 0 and 1 side by side
    2264           0 :         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);  // rows 1 and 2 side by side
    2265             : 
    2266           0 :         ss_128[0] = _mm_unpacklo_epi16(src01, src12);  // interleave 16-bit samples of vertically adjacent rows
    2267             : 
    2268           0 :         if (conv_params->do_average) {  // averaging paths also receive dst8
    2269           0 :             if (conv_params->use_jnt_comp_avg) {  // store helper that takes factor (fwd/bck offsets)
    2270             :                 do {
    2271           0 :                     const __m128i res = xy_y_convolve_4tap_2x2_sse2(
    2272             :                         im, s_32, ss_128, coeffs_128);
    2273           0 :                     jnt_2d_comp_avg_round_store_2x2_sse2(res,
    2274             :                         factor_128,
    2275             :                         offset_comp_avg_128,
    2276             :                         dst,
    2277             :                         dst_stride,
    2278             :                         dst8,
    2279             :                         dst8_stride);
    2280           0 :                     im += 2 * 2;  // two rows of two samples
    2281           0 :                     dst += 2 * dst_stride;
    2282           0 :                     dst8 += 2 * dst8_stride;
    2283           0 :                     y -= 2;
    2284           0 :                 } while (y);
    2285             :             }
    2286             :             else {  // store helper without factor
    2287             :                 do {
    2288           0 :                     const __m128i res = xy_y_convolve_4tap_2x2_sse2(
    2289             :                         im, s_32, ss_128, coeffs_128);
    2290           0 :                     jnt_2d_avg_round_store_2x2_sse2(res,
    2291             :                         offset_avg_128,
    2292             :                         dst,
    2293             :                         dst_stride,
    2294             :                         dst8,
    2295             :                         dst8_stride);
    2296           0 :                     im += 2 * 2;
    2297           0 :                     dst += 2 * dst_stride;
    2298           0 :                     dst8 += 2 * dst8_stride;
    2299           0 :                     y -= 2;
    2300           0 :                 } while (y);
    2301             :             }
    2302             :         }
    2303             :         else {  // no averaging: dst8 is neither passed on nor advanced
    2304             :             do {
    2305             :                 const __m128i res =
    2306           0 :                     xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
    2307           0 :                 jnt_2d_no_avg_round_store_2x2_sse2(
    2308             :                     res, offset_no_avg_128, dst, dst_stride);
    2309           0 :                 im += 2 * 2;
    2310           0 :                 dst += 2 * dst_stride;
    2311           0 :                 y -= 2;
    2312           0 :             } while (y);
    2313             :         }
    2314             :     }
    2315             :     else {  // w is 4, 8 or 16: 256-bit helpers
    2316     1653540 :         prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
    2317             : 
    2318     1653560 :         if (w == 4) {
    2319             :             __m128i s_64[4];
    2320             :             __m256i s_256[2], ss_256[2];
    2321             : 
    2322      669237 :             s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));  // each 64-bit load picks up one row of four int16_t samples
    2323      669237 :             s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
    2324      669237 :             s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));  // only rows 0..2 are preloaded; s_64[3] is not set here
    2325             : 
    2326             :             // Load lines a and b. Line a to lower 128, line b to upper
    2327             :             // 128
    2328      669237 :             s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
    2329      669237 :             s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
    2330             : 
    2331      669237 :             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);  // interleave 16-bit samples of vertically adjacent rows
    2332             : 
    2333      669237 :             if (conv_params->do_average) {
    2334      305068 :                 if (conv_params->use_jnt_comp_avg) {
    2335             :                     do {
    2336      192140 :                         const __m256i res = xy_y_convolve_4tap_4x2_avx2(
    2337             :                             im, s_64, ss_256, coeffs_256);
    2338      192140 :                         jnt_2d_comp_avg_round_store_4x2_avx2(
    2339             :                             res,
    2340             :                             factor_256,
    2341             :                             offset_comp_avg_256,
    2342             :                             dst,
    2343             :                             dst_stride,
    2344             :                             dst8,
    2345             :                             dst8_stride);
    2346      192139 :                         im += 2 * 4;  // two rows of four samples
    2347      192139 :                         dst += 2 * dst_stride;
    2348      192139 :                         dst8 += 2 * dst8_stride;
    2349      192139 :                         y -= 2;
    2350      192139 :                     } while (y);
    2351             :                 }
    2352             :                 else {
    2353             :                     do {
    2354      417999 :                         const __m256i res = xy_y_convolve_4tap_4x2_avx2(
    2355             :                             im, s_64, ss_256, coeffs_256);
    2356      417997 :                         jnt_2d_avg_round_store_4x2_avx2(res,
    2357             :                             offset_avg_256,
    2358             :                             dst,
    2359             :                             dst_stride,
    2360             :                             dst8,
    2361             :                             dst8_stride);
    2362      418000 :                         im += 2 * 4;
    2363      418000 :                         dst += 2 * dst_stride;
    2364      418000 :                         dst8 += 2 * dst8_stride;
    2365      418000 :                         y -= 2;
    2366      418000 :                     } while (y);
    2367             :                 }
    2368             :             }
    2369             :             else {
    2370             :                 do {
    2371      728344 :                     const __m256i res = xy_y_convolve_4tap_4x2_avx2(
    2372             :                         im, s_64, ss_256, coeffs_256);
    2373      728342 :                     jnt_2d_no_avg_round_store_4x2_avx2(
    2374             :                         res, offset_no_avg_256, dst, dst_stride);
    2375      728343 :                     im += 2 * 4;
    2376      728343 :                     dst += 2 * dst_stride;
    2377      728343 :                     y -= 2;
    2378      728343 :                 } while (y);
    2379             :             }
    2380             :         }
    2381      984328 :         else if (w == 8) {
    2382             :             __m256i s_256[4], r[2];  // only s_256[0] and s_256[1] are used in this branch
    2383             : 
    2384      549716 :             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));  // 16 samples: rows 0 and 1
    2385      549716 :             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));  // 16 samples: rows 1 and 2
    2386             : 
    2387             :             __m256i ss_256[4];
    2388             : 
    2389      549716 :             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);  // only entries 0 and 2 are initialised here
    2390      549716 :             ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);  // NOTE(review): entries 1 and 3 are presumably written by the helper - confirm
    2391             : 
    2392      549716 :             if (conv_params->do_average) {
    2393      245740 :                 if (conv_params->use_jnt_comp_avg) {
    2394             :                     do {
    2395      173488 :                         xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);  // s_256 is not passed; the helper gets im and ss_256
    2396      173486 :                         jnt_2d_comp_avg_round_store_8x2_avx2(
    2397             :                             r,
    2398             :                             factor_256,
    2399             :                             offset_comp_avg_256,
    2400             :                             dst,
    2401             :                             dst_stride,
    2402             :                             dst8,
    2403             :                             dst8_stride);
    2404      173487 :                         im += 2 * 8;  // two rows of eight samples
    2405      173487 :                         dst += 2 * dst_stride;
    2406      173487 :                         dst8 += 2 * dst8_stride;
    2407      173487 :                         y -= 2;
    2408      173487 :                     } while (y);
    2409             :                 }
    2410             :                 else {
    2411             :                     do {
    2412      317991 :                         xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
    2413      317989 :                         jnt_2d_avg_round_store_8x2_avx2(r,
    2414             :                             offset_avg_256,
    2415             :                             dst,
    2416             :                             dst_stride,
    2417             :                             dst8,
    2418             :                             dst8_stride);
    2419      317988 :                         im += 2 * 8;
    2420      317988 :                         dst += 2 * dst_stride;
    2421      317988 :                         dst8 += 2 * dst8_stride;
    2422      317988 :                         y -= 2;
    2423      317988 :                     } while (y);
    2424             :                 }
    2425             :             }
    2426             :             else {
    2427             :                 do {
    2428      607955 :                     xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
    2429      607957 :                     jnt_2d_no_avg_round_store_8x2_avx2(
    2430             :                         r, offset_no_avg_256, dst, dst_stride);
    2431      607962 :                     im += 2 * 8;
    2432      607962 :                     dst += 2 * dst_stride;
    2433      607962 :                     y -= 2;
    2434      607962 :                 } while (y);
    2435             :             }
    2436             :         }
    2437             :         else {
    2438             :             __m256i s_256[5];
    2439             : 
    2440      434612 :             assert(w == 16);  // no other width is handled by this function
    2441             : 
    2442      434612 :             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));  // one full 16-sample row per register
    2443      434612 :             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
    2444      434612 :             s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));  // only rows 0..2 are preloaded; s_256[3..4] are not set here
    2445             : 
    2446             :             __m256i ss_256[4], tt_256[4], r[4];
    2447             : 
    2448      434612 :             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);  // ss: rows 0 and 1 interleaved
    2449      434612 :             ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
    2450             : 
    2451      434612 :             tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);  // tt: rows 1 and 2 interleaved
    2452      434612 :             tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
    2453             : 
    2454      434612 :             if (conv_params->do_average) {
    2455      191764 :                 if (conv_params->use_jnt_comp_avg) {
    2456             :                     do {
    2457      151996 :                         xy_y_convolve_4tap_16x2_avx2(
    2458             :                             im, s_256, ss_256, tt_256, coeffs_256, r);
    2459      151996 :                         jnt_2d_comp_avg_round_store_16x2_avx2(
    2460             :                             r,
    2461             :                             factor_256,
    2462             :                             offset_comp_avg_256,
    2463             :                             dst,
    2464             :                             dst_stride,
    2465             :                             dst8,
    2466             :                             dst8_stride);
    2467      151996 :                         im += 2 * 16;  // two rows of sixteen samples
    2468      151996 :                         dst += 2 * dst_stride;
    2469      151996 :                         dst8 += 2 * dst8_stride;
    2470      151996 :                         y -= 2;
    2471      151996 :                     } while (y);
    2472             :                 }
    2473             :                 else {
    2474             :                     do {
    2475      231534 :                         xy_y_convolve_4tap_16x2_avx2(
    2476             :                             im, s_256, ss_256, tt_256, coeffs_256, r);
    2477      231531 :                         jnt_2d_avg_round_store_16x2_avx2(r,
    2478             :                             offset_avg_256,
    2479             :                             dst,
    2480             :                             dst_stride,
    2481             :                             dst8,
    2482             :                             dst8_stride);
    2483      231536 :                         im += 2 * 16;
    2484      231536 :                         dst += 2 * dst_stride;
    2485      231536 :                         dst8 += 2 * dst8_stride;
    2486      231536 :                         y -= 2;
    2487      231536 :                     } while (y);
    2488             :                 }
    2489             :             }
    2490             :             else {
    2491             :                 do {
    2492      485725 :                     xy_y_convolve_4tap_16x2_avx2(
    2493             :                         im, s_256, ss_256, tt_256, coeffs_256, r);
    2494      485742 :                     jnt_2d_no_avg_round_store_16x2_avx2(
    2495             :                         r, offset_no_avg_256, dst, dst_stride);
    2496      485746 :                     im += 2 * 16;
    2497      485746 :                     dst += 2 * dst_stride;
    2498      485746 :                     y -= 2;
    2499      485746 :                 } while (y);
    2500             :             }
    2501             :         }
    2502             :     }
    2503     1653590 : }
    2504             : 
    2505    38603000 : static void jnt_convolve_2d_ver_6tap_avx2(
    2506             :     const int16_t *const im_block, const int32_t w, const int32_t h,
    2507             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    2508             :     const ConvolveParams *const conv_params, uint8_t *dst8,
    2509             :     const int32_t dst8_stride) {
    2510    38603000 :     const int32_t dst_stride = conv_params->dst_stride;
    2511    38603000 :     const int32_t bd = 8;
    2512    38603000 :     const int32_t round_0 = 3;
    2513    38603000 :     const int16_t *im = im_block;
    2514    38603000 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
    2515    38603000 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;      // 19
    2516    38603000 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;  // 4
    2517    38603000 :     const int32_t round_offset = 1 << (offset_bits - round_1);
    2518    38603000 :     const int32_t factor =
    2519    38603000 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
    2520    38603000 :     const int32_t offset_comp_avg =
    2521    38603000 :         (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
    2522    38603000 :         (round_offset << DIST_PRECISION_BITS) -
    2523    38603000 :         (round_offset << (DIST_PRECISION_BITS - 1)) +
    2524    38603000 :         (1 << (round_bits + DIST_PRECISION_BITS - 1));
    2525    38603000 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
    2526    38603000 :     const __m128i factor_128 = _mm_set1_epi32(factor);
    2527    38603000 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
    2528    38603000 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
    2529    38603000 :     const int32_t offset_avg = (1 << (round_1 - 1)) +
    2530    38603000 :         (1 << (round_bits + round_1)) -
    2531    38603000 :         (1 << offset_bits) - (1 << (offset_bits - 1));
    2532    38603000 :     const int32_t offset_no_avg =
    2533    38603000 :         (1 << (round_1 - 1)) + (1 << offset_bits) + (1 << (offset_bits - 1));
    2534    38603000 :     const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
    2535    38603000 :     const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
    2536    38603000 :     const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
    2537    38603000 :     const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
    2538    38603000 :     int32_t y = h;
    2539    38603000 :     ConvBufType *dst = conv_params->dst;
    2540             :     __m128i coeffs_128[4];
    2541             :     __m256i coeffs_256[4];
    2542             : 
    2543    38603000 :     if (w == 2) {
    2544             :         __m128i s_32[6], ss_128[3];
    2545             : 
    2546           0 :         prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
    2547             : 
    2548           0 :         s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
    2549           0 :         s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    2550           0 :         s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
    2551           0 :         s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
    2552           0 :         s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
    2553             : 
    2554           0 :         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    2555           0 :         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    2556           0 :         const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    2557           0 :         const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
    2558             : 
    2559           0 :         ss_128[0] = _mm_unpacklo_epi16(src01, src12);
    2560           0 :         ss_128[1] = _mm_unpacklo_epi16(src23, src34);
    2561             : 
    2562           0 :         y = h;
    2563             : 
    2564           0 :         if (conv_params->do_average) {
    2565           0 :             if (conv_params->use_jnt_comp_avg) {
    2566             :                 do {
    2567           0 :                     const __m128i res = xy_y_convolve_6tap_2x2_sse2(
    2568             :                         im, s_32, ss_128, coeffs_128);
    2569           0 :                     jnt_2d_comp_avg_round_store_2x2_sse2(res,
    2570             :                         factor_128,
    2571             :                         offset_comp_avg_128,
    2572             :                         dst,
    2573             :                         dst_stride,
    2574             :                         dst8,
    2575             :                         dst8_stride);
    2576           0 :                     im += 2 * 2;
    2577           0 :                     dst += 2 * dst_stride;
    2578           0 :                     dst8 += 2 * dst8_stride;
    2579           0 :                     y -= 2;
    2580           0 :                 } while (y);
    2581             :             }
    2582             :             else {
    2583             :                 do {
    2584           0 :                     const __m128i res = xy_y_convolve_6tap_2x2_sse2(
    2585             :                         im, s_32, ss_128, coeffs_128);
    2586           0 :                     jnt_2d_avg_round_store_2x2_sse2(res,
    2587             :                         offset_avg_128,
    2588             :                         dst,
    2589             :                         dst_stride,
    2590             :                         dst8,
    2591             :                         dst8_stride);
    2592           0 :                     im += 2 * 2;
    2593           0 :                     dst += 2 * dst_stride;
    2594           0 :                     dst8 += 2 * dst8_stride;
    2595           0 :                     y -= 2;
    2596           0 :                 } while (y);
    2597             :             }
    2598             :         }
    2599             :         else {
    2600             :             do {
    2601             :                 const __m128i res =
    2602           0 :                     xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
    2603           0 :                 jnt_2d_no_avg_round_store_2x2_sse2(
    2604             :                     res, offset_no_avg_128, dst, dst_stride);
    2605           0 :                 im += 2 * 2;
    2606           0 :                 dst += 2 * dst_stride;
    2607           0 :                 y -= 2;
    2608           0 :             } while (y);
    2609             :         }
    2610             :     }
    2611             :     else {
    2612    38603000 :         prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
    2613             : 
    2614    38610700 :         if (w == 4) {
    2615             :             __m128i s_64[6];
    2616             :             __m256i s_256[6], ss_256[3];
    2617             : 
    2618     1051860 :             s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
    2619     1051860 :             s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
    2620     1051860 :             s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
    2621     1051860 :             s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
    2622     1051860 :             s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
    2623             : 
    2624             :             // Load lines a and b. Line a to lower 128, line b to upper
    2625             :             // 128
    2626     1051860 :             s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
    2627     1051860 :             s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
    2628     1051860 :             s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
    2629     1051860 :             s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
    2630             : 
    2631     1051860 :             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    2632     1051860 :             ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
    2633             : 
    2634     1051860 :             y = h;
    2635             : 
    2636     1051860 :             if (conv_params->do_average) {
    2637      474975 :                 if (conv_params->use_jnt_comp_avg) {
    2638             :                     do {
    2639      939236 :                         const __m256i res = xy_y_convolve_6tap_4x2_avx2(
    2640             :                             im, s_64, ss_256, coeffs_256);
    2641      939229 :                         jnt_2d_comp_avg_round_store_4x2_avx2(
    2642             :                             res,
    2643             :                             factor_256,
    2644             :                             offset_comp_avg_256,
    2645             :                             dst,
    2646             :                             dst_stride,
    2647             :                             dst8,
    2648             :                             dst8_stride);
    2649      939226 :                         im += 2 * 4;
    2650      939226 :                         dst += 2 * dst_stride;
    2651      939226 :                         dst8 += 2 * dst8_stride;
    2652      939226 :                         y -= 2;
    2653      939226 :                     } while (y);
    2654             :                 }
    2655             :                 else {
    2656             :                     do {
    2657     1642760 :                         const __m256i res = xy_y_convolve_6tap_4x2_avx2(
    2658             :                             im, s_64, ss_256, coeffs_256);
    2659     1642740 :                         jnt_2d_avg_round_store_4x2_avx2(res,
    2660             :                             offset_avg_256,
    2661             :                             dst,
    2662             :                             dst_stride,
    2663             :                             dst8,
    2664             :                             dst8_stride);
    2665     1642740 :                         im += 2 * 4;
    2666     1642740 :                         dst += 2 * dst_stride;
    2667     1642740 :                         dst8 += 2 * dst8_stride;
    2668     1642740 :                         y -= 2;
    2669     1642740 :                     } while (y);
    2670             :                 }
    2671             :             }
    2672             :             else {
    2673             :                 do {
    2674     3145700 :                     const __m256i res = xy_y_convolve_6tap_4x2_avx2(
    2675             :                         im, s_64, ss_256, coeffs_256);
    2676     3145750 :                     jnt_2d_no_avg_round_store_4x2_avx2(
    2677             :                         res, offset_no_avg_256, dst, dst_stride);
    2678     3145730 :                     im += 2 * 4;
    2679     3145730 :                     dst += 2 * dst_stride;
    2680     3145730 :                     y -= 2;
    2681     3145730 :                 } while (y);
    2682             :             }
    2683             :         }
    2684    37558800 :         else if (w == 8) {
    2685             :             __m256i s_256[6], r[2];
    2686             : 
    2687    17287800 :             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
    2688    17287800 :             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
    2689    17287800 :             s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
    2690    17287800 :             s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
    2691    17287800 :             y = h;
    2692             : 
    2693             :             __m256i ss_256[6];
    2694             : 
    2695    17287800 :             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    2696    17287800 :             ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
    2697             : 
    2698    17287800 :             ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
    2699    17287800 :             ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
    2700             : 
    2701    17287800 :             if (conv_params->do_average) {
    2702     6784090 :                 if (conv_params->use_jnt_comp_avg) {
    2703             :                     do {
    2704    22128500 :                         xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
    2705    22120800 :                         jnt_2d_comp_avg_round_store_8x2_avx2(
    2706             :                             r,
    2707             :                             factor_256,
    2708             :                             offset_comp_avg_256,
    2709             :                             dst,
    2710             :                             dst_stride,
    2711             :                             dst8,
    2712             :                             dst8_stride);
    2713    22125900 :                         im += 2 * 8;
    2714    22125900 :                         dst += 2 * dst_stride;
    2715    22125900 :                         dst8 += 2 * dst8_stride;
    2716    22125900 :                         y -= 2;
    2717    22125900 :                     } while (y);
    2718             :                 }
    2719             :                 else {
    2720             :                     do {
    2721    28722800 :                         xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
    2722    28718600 :                         jnt_2d_avg_round_store_8x2_avx2(r,
    2723             :                             offset_avg_256,
    2724             :                             dst,
    2725             :                             dst_stride,
    2726             :                             dst8,
    2727             :                             dst8_stride);
    2728    28714800 :                         im += 2 * 8;
    2729    28714800 :                         dst += 2 * dst_stride;
    2730    28714800 :                         dst8 += 2 * dst8_stride;
    2731    28714800 :                         y -= 2;
    2732    28714800 :                     } while (y);
    2733             :                 }
    2734             :             }
    2735             :             else {
    2736             :                 do {
    2737    78557200 :                     xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
    2738    78541800 :                     jnt_2d_no_avg_round_store_8x2_avx2(
    2739             :                         r, offset_no_avg_256, dst, dst_stride);
    2740    78563400 :                     im += 2 * 8;
    2741    78563400 :                     dst += 2 * dst_stride;
    2742    78563400 :                     y -= 2;
    2743    78563400 :                 } while (y);
    2744             :             }
    2745             :         }
    2746    20271000 :         else if (w == 16) {
    2747             :             __m256i s_256[6];
    2748             : 
    2749    11530700 :             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
    2750    11530700 :             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
    2751    11530700 :             s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
    2752    11530700 :             s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
    2753    11530700 :             s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
    2754    11530700 :             y = h;
    2755             : 
    2756             :             __m256i ss_256[6], tt_256[6], r[4];
    2757             : 
    2758    11530700 :             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    2759    11530700 :             ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
    2760    11530700 :             ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
    2761    11530700 :             ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
    2762             : 
    2763    11530700 :             tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
    2764    11530700 :             tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
    2765    11530700 :             tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
    2766    11530700 :             tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
    2767             : 
    2768    11530700 :             if (conv_params->do_average) {
    2769     4472190 :                 if (conv_params->use_jnt_comp_avg) {
    2770             :                     do {
    2771    17961100 :                         xy_y_convolve_6tap_16x2_avx2(
    2772             :                             im, 16, s_256, ss_256, tt_256, coeffs_256, r);
    2773    17955300 :                         jnt_2d_comp_avg_round_store_16x2_avx2(
    2774             :                             r,
    2775             :                             factor_256,
    2776             :                             offset_comp_avg_256,
    2777             :                             dst,
    2778             :                             dst_stride,
    2779             :                             dst8,
    2780             :                             dst8_stride);
    2781    17958900 :                         im += 2 * 16;
    2782    17958900 :                         dst += 2 * dst_stride;
    2783    17958900 :                         dst8 += 2 * dst8_stride;
    2784    17958900 :                         y -= 2;
    2785    17958900 :                     } while (y);
    2786             :                 }
    2787             :                 else {
    2788             :                     do {
    2789    26439300 :                         xy_y_convolve_6tap_16x2_avx2(
    2790             :                             im, 16, s_256, ss_256, tt_256, coeffs_256, r);
    2791    26428000 :                         jnt_2d_avg_round_store_16x2_avx2(r,
    2792             :                             offset_avg_256,
    2793             :                             dst,
    2794             :                             dst_stride,
    2795             :                             dst8,
    2796             :                             dst8_stride);
    2797    26435000 :                         im += 2 * 16;
    2798    26435000 :                         dst += 2 * dst_stride;
    2799    26435000 :                         dst8 += 2 * dst8_stride;
    2800    26435000 :                         y -= 2;
    2801    26435000 :                     } while (y);
    2802             :                 }
    2803             :             }
    2804             :             else {
    2805             :                 do {
    2806    69692500 :                     xy_y_convolve_6tap_16x2_avx2(
    2807             :                         im, 16, s_256, ss_256, tt_256, coeffs_256, r);
    2808    69654200 :                     jnt_2d_no_avg_round_store_16x2_avx2(
    2809             :                         r, offset_no_avg_256, dst, dst_stride);
    2810    69698200 :                     im += 2 * 16;
    2811    69698200 :                     dst += 2 * dst_stride;
    2812    69698200 :                     y -= 2;
    2813    69698200 :                 } while (y);
    2814             :             }
    2815             :         }
    2816             :         else {
    2817     8740340 :             int32_t x = 0;
    2818             : 
    2819     8740340 :             assert(!(w % 32));
    2820             : 
    2821             :             __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];
    2822             : 
    2823             :             do {
    2824    10677600 :                 const int16_t *s = im + x;
    2825    10677600 :                 ConvBufType *d = dst + x;
    2826    10677600 :                 uint8_t *d8 = dst8 + x;
    2827             : 
    2828             :                 loadu_unpack_16bit_5rows_avx2(
    2829             :                     s, w, s_256[0], ss_256[0], tt_256[0]);
    2830    10677600 :                 loadu_unpack_16bit_5rows_avx2(
    2831             :                     s + 16, w, s_256[1], ss_256[1], tt_256[1]);
    2832             : 
    2833    10677600 :                 y = h;
    2834             : 
    2835    10677600 :                 if (conv_params->do_average) {
    2836     4202570 :                     if (conv_params->use_jnt_comp_avg) {
    2837             :                         do {
    2838    18810600 :                             xy_y_convolve_6tap_16x2_avx2(s,
    2839             :                                 w,
    2840             :                                 s_256[0],
    2841             :                                 ss_256[0],
    2842             :                                 tt_256[0],
    2843             :                                 coeffs_256,
    2844             :                                 r0);
    2845    18801200 :                             xy_y_convolve_6tap_16x2_avx2(s + 16,
    2846             :                                 w,
    2847             :                                 s_256[1],
    2848             :                                 ss_256[1],
    2849             :                                 tt_256[1],
    2850             :                                 coeffs_256,
    2851             :                                 r1);
    2852             :                             jnt_2d_comp_avg_round_store_32_avx2(
    2853             :                                 r0 + 0,
    2854             :                                 r1 + 0,
    2855             :                                 factor_256,
    2856             :                                 offset_comp_avg_256,
    2857             :                                 d,
    2858             :                                 d8);
    2859    18809100 :                             jnt_2d_comp_avg_round_store_32_avx2(
    2860             :                                 r0 + 2,
    2861             :                                 r1 + 2,
    2862             :                                 factor_256,
    2863             :                                 offset_comp_avg_256,
    2864    18809100 :                                 d + dst_stride,
    2865             :                                 d8 + dst8_stride);
    2866    18808200 :                             s += 2 * w;
    2867    18808200 :                             d += 2 * dst_stride;
    2868    18808200 :                             d8 += 2 * dst8_stride;
    2869    18808200 :                             y -= 2;
    2870    18808200 :                         } while (y);
    2871             :                     }
    2872             :                     else {
    2873             :                         do {
    2874    30279300 :                             xy_y_convolve_6tap_16x2_avx2(s,
    2875             :                                 w,
    2876             :                                 s_256[0],
    2877             :                                 ss_256[0],
    2878             :                                 tt_256[0],
    2879             :                                 coeffs_256,
    2880             :                                 r0);
    2881    30262100 :                             xy_y_convolve_6tap_16x2_avx2(s + 16,
    2882             :                                 w,
    2883             :                                 s_256[1],
    2884             :                                 ss_256[1],
    2885             :                                 tt_256[1],
    2886             :                                 coeffs_256,
    2887             :                                 r1);
    2888             :                             jnt_2d_avg_round_store_32_avx2(
    2889             :                                 r0 + 0, r1 + 0, offset_avg_256, d, d8);
    2890    30268300 :                             jnt_2d_avg_round_store_32_avx2(r0 + 2,
    2891             :                                 r1 + 2,
    2892             :                                 offset_avg_256,
    2893    30268300 :                                 d + dst_stride,
    2894             :                                 d8 + dst8_stride);
    2895    30268600 :                             s += 2 * w;
    2896    30268600 :                             d += 2 * dst_stride;
    2897    30268600 :                             d8 += 2 * dst8_stride;
    2898    30268600 :                             y -= 2;
    2899    30268600 :                         } while (y);
    2900             :                     }
    2901             :                 }
    2902             :                 else {
    2903             :                     do {
    2904    75155600 :                         xy_y_convolve_6tap_16x2_avx2(s,
    2905             :                             w,
    2906             :                             s_256[0],
    2907             :                             ss_256[0],
    2908             :                             tt_256[0],
    2909             :                             coeffs_256,
    2910             :                             r0);
    2911    75131500 :                         xy_y_convolve_6tap_16x2_avx2(s + 16,
    2912             :                             w,
    2913             :                             s_256[1],
    2914             :                             ss_256[1],
    2915             :                             tt_256[1],
    2916             :                             coeffs_256,
    2917             :                             r1);
    2918    75156800 :                         jnt_2d_no_avg_round_store_32_avx2(
    2919             :                             r0 + 0, r1 + 0, offset_no_avg_256, d);
    2920    75172500 :                         jnt_2d_no_avg_round_store_32_avx2(
    2921    75172500 :                             r0 + 2, r1 + 2, offset_no_avg_256, d + dst_stride);
    2922    75190500 :                         s += 2 * w;
    2923    75190500 :                         d += 2 * dst_stride;
    2924    75190500 :                         y -= 2;
    2925    75190500 :                     } while (y);
    2926             :                 }
    2927             : 
    2928    10699300 :                 x += 32;
    2929    10699300 :             } while (x < w);
    2930             :         }
    2931             :     }
    2932    38627300 : }
    2933             : 
    2934    16479000 : static void jnt_convolve_2d_ver_8tap_avx2(
    2935             :     const int16_t *const im_block, const int32_t w, const int32_t h,
    2936             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    2937             :     const ConvolveParams *const conv_params, uint8_t *dst8,
    2938             :     const int32_t dst8_stride) {
    2939    16479000 :     const int32_t dst_stride = conv_params->dst_stride;
    2940    16479000 :     const int32_t bd = 8;
    2941    16479000 :     const int32_t round_0 = 3;
    2942    16479000 :     const int16_t *im = im_block;
    2943    16479000 :     const int32_t round_1 = COMPOUND_ROUND1_BITS;
    2944    16479000 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;      // 19
    2945    16479000 :     const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;  // 4
    2946    16479000 :     const int32_t round_offset = 1 << (offset_bits - round_1);
    2947    16479000 :     const int32_t factor =
    2948    16479000 :         conv_params->fwd_offset | (conv_params->bck_offset << 16);
    2949    16479000 :     const int32_t offset_comp_avg =
    2950    16479000 :         (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
    2951    16479000 :         (round_offset << DIST_PRECISION_BITS) -
    2952    16479000 :         (round_offset << (DIST_PRECISION_BITS - 1)) +
    2953    16479000 :         (1 << (round_bits + DIST_PRECISION_BITS - 1));
    2954    16479000 :     const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
    2955    16479000 :     const __m128i factor_128 = _mm_set1_epi32(factor);
    2956    16479000 :     const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
    2957    16479000 :     const __m256i factor_256 = _mm256_set1_epi32(factor);
    2958    16479000 :     const int32_t offset_avg = (1 << (round_1 - 1)) +
    2959    16479000 :         (1 << (round_bits + round_1)) -
    2960    16479000 :         (1 << offset_bits) - (1 << (offset_bits - 1));
    2961    16479000 :     const int32_t offset_no_avg =
    2962    16479000 :         (1 << (round_1 - 1)) + (1 << offset_bits) + (1 << (offset_bits - 1));
    2963    16479000 :     const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
    2964    16479000 :     const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
    2965    16479000 :     const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
    2966    16479000 :     const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
    2967    16479000 :     int32_t y = h;
    2968    16479000 :     ConvBufType *dst = conv_params->dst;
    2969             :     __m128i coeffs_128[4];
    2970             :     __m256i coeffs_256[4];
    2971             : 
    2972    16479000 :     if (w == 2) {
    2973             :         __m128i s_32[8], ss_128[4];
    2974             : 
    2975           0 :         prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
    2976             : 
    2977           0 :         s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
    2978           0 :         s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    2979           0 :         s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
    2980           0 :         s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
    2981           0 :         s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
    2982           0 :         s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
    2983           0 :         s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));
    2984             : 
    2985           0 :         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    2986           0 :         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    2987           0 :         const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    2988           0 :         const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
    2989           0 :         const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
    2990           0 :         const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
    2991             : 
    2992           0 :         ss_128[0] = _mm_unpacklo_epi16(src01, src12);
    2993           0 :         ss_128[1] = _mm_unpacklo_epi16(src23, src34);
    2994           0 :         ss_128[2] = _mm_unpacklo_epi16(src45, src56);
    2995             : 
    2996           0 :         y = h;
    2997             : 
    2998           0 :         if (conv_params->do_average) {
    2999           0 :             if (conv_params->use_jnt_comp_avg) {
    3000             :                 do {
    3001           0 :                     const __m128i res = xy_y_convolve_8tap_2x2_sse2(
    3002             :                         im, s_32, ss_128, coeffs_128);
    3003           0 :                     jnt_2d_comp_avg_round_store_2x2_sse2(res,
    3004             :                         factor_128,
    3005             :                         offset_comp_avg_128,
    3006             :                         dst,
    3007             :                         dst_stride,
    3008             :                         dst8,
    3009             :                         dst8_stride);
    3010           0 :                     im += 2 * 2;
    3011           0 :                     dst += 2 * dst_stride;
    3012           0 :                     dst8 += 2 * dst8_stride;
    3013           0 :                     y -= 2;
    3014           0 :                 } while (y);
    3015             :             }
    3016             :             else {
    3017             :                 do {
    3018           0 :                     const __m128i res = xy_y_convolve_8tap_2x2_sse2(
    3019             :                         im, s_32, ss_128, coeffs_128);
    3020           0 :                     jnt_2d_avg_round_store_2x2_sse2(res,
    3021             :                         offset_avg_128,
    3022             :                         dst,
    3023             :                         dst_stride,
    3024             :                         dst8,
    3025             :                         dst8_stride);
    3026           0 :                     im += 2 * 2;
    3027           0 :                     dst += 2 * dst_stride;
    3028           0 :                     dst8 += 2 * dst8_stride;
    3029           0 :                     y -= 2;
    3030           0 :                 } while (y);
    3031             :             }
    3032             :         }
    3033             :         else {
    3034             :             do {
    3035             :                 const __m128i res =
    3036           0 :                     xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
    3037           0 :                 jnt_2d_no_avg_round_store_2x2_sse2(
    3038             :                     res, offset_no_avg_128, dst, dst_stride);
    3039           0 :                 im += 2 * 2;
    3040           0 :                 dst += 2 * dst_stride;
    3041           0 :                 y -= 2;
    3042           0 :             } while (y);
    3043             :         }
    3044             :     }
    3045             :     else {
    3046    16479000 :         prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
    3047             : 
    3048    16479800 :         if (w == 4) {
    3049             :             __m128i s_64[8];
    3050             :             __m256i s_256[8], ss_256[4];
    3051             : 
    3052      121482 :             s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
    3053      121482 :             s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
    3054      121482 :             s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
    3055      121482 :             s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
    3056      121482 :             s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
    3057      121482 :             s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
    3058      121482 :             s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));
    3059             : 
    3060             :             // Load lines a and b. Line a to lower 128, line b to upper
    3061             :             // 128
    3062      121482 :             s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
    3063      121482 :             s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
    3064      121482 :             s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
    3065      121482 :             s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
    3066      121482 :             s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
    3067      121482 :             s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);
    3068             : 
    3069      121482 :             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    3070      121482 :             ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
    3071      121482 :             ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
    3072             : 
    3073      121482 :             y = h;
    3074             : 
    3075      121482 :             if (conv_params->do_average) {
    3076       48774 :                 if (conv_params->use_jnt_comp_avg) {
    3077             :                     do {
    3078      145040 :                         const __m256i res = xy_y_convolve_8tap_4x2_avx2(
    3079             :                             im, s_64, ss_256, coeffs_256);
    3080      145040 :                         jnt_2d_comp_avg_round_store_4x2_avx2(
    3081             :                             res,
    3082             :                             factor_256,
    3083             :                             offset_comp_avg_256,
    3084             :                             dst,
    3085             :                             dst_stride,
    3086             :                             dst8,
    3087             :                             dst8_stride);
    3088      145040 :                         im += 2 * 4;
    3089      145040 :                         dst += 2 * dst_stride;
    3090      145040 :                         dst8 += 2 * dst8_stride;
    3091      145040 :                         y -= 2;
    3092      145040 :                     } while (y);
    3093             :                 }
    3094             :                 else {
    3095             :                     do {
    3096      138568 :                         const __m256i res = xy_y_convolve_8tap_4x2_avx2(
    3097             :                             im, s_64, ss_256, coeffs_256);
    3098      138568 :                         jnt_2d_avg_round_store_4x2_avx2(res,
    3099             :                             offset_avg_256,
    3100             :                             dst,
    3101             :                             dst_stride,
    3102             :                             dst8,
    3103             :                             dst8_stride);
    3104      138568 :                         im += 2 * 4;
    3105      138568 :                         dst += 2 * dst_stride;
    3106      138568 :                         dst8 += 2 * dst8_stride;
    3107      138568 :                         y -= 2;
    3108      138568 :                     } while (y);
    3109             :                 }
    3110             :             }
    3111             :             else {
    3112             :                 do {
    3113      424568 :                     const __m256i res = xy_y_convolve_8tap_4x2_avx2(
    3114             :                         im, s_64, ss_256, coeffs_256);
    3115      424567 :                     jnt_2d_no_avg_round_store_4x2_avx2(
    3116             :                         res, offset_no_avg_256, dst, dst_stride);
    3117      424568 :                     im += 2 * 4;
    3118      424568 :                     dst += 2 * dst_stride;
    3119      424568 :                     y -= 2;
    3120      424568 :                 } while (y);
    3121             :             }
    3122             :         }
    3123    16358300 :         else if (w == 8) {
    3124             :             __m256i s_256[8], r[2];
    3125             : 
    3126     7213610 :             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
    3127     7213610 :             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
    3128     7213610 :             s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
    3129     7213610 :             s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
    3130     7213610 :             s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
    3131     7213610 :             s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
    3132     7213610 :             y = h;
    3133             : 
    3134             :             __m256i ss_256[8];
    3135             : 
    3136     7213610 :             convolve_8tap_unapck_avx2(s_256, ss_256);
    3137             : 
    3138     7213720 :             if (conv_params->do_average) {
    3139     2899340 :                 if (conv_params->use_jnt_comp_avg) {
    3140             :                     do {
    3141     9567550 :                         xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
    3142     9566860 :                         jnt_2d_comp_avg_round_store_8x2_avx2(
    3143             :                             r,
    3144             :                             factor_256,
    3145             :                             offset_comp_avg_256,
    3146             :                             dst,
    3147             :                             dst_stride,
    3148             :                             dst8,
    3149             :                             dst8_stride);
    3150     9567050 :                         im += 2 * 8;
    3151     9567050 :                         dst += 2 * dst_stride;
    3152     9567050 :                         dst8 += 2 * dst8_stride;
    3153     9567050 :                         y -= 2;
    3154     9567050 :                     } while (y);
    3155             :                 }
    3156             :                 else {
    3157             :                     do {
    3158    12614700 :                         xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
    3159    12614800 :                         jnt_2d_avg_round_store_8x2_avx2(r,
    3160             :                             offset_avg_256,
    3161             :                             dst,
    3162             :                             dst_stride,
    3163             :                             dst8,
    3164             :                             dst8_stride);
    3165    12613400 :                         im += 2 * 8;
    3166    12613400 :                         dst += 2 * dst_stride;
    3167    12613400 :                         dst8 += 2 * dst8_stride;
    3168    12613400 :                         y -= 2;
    3169    12613400 :                     } while (y);
    3170             :                 }
    3171             :             }
    3172             :             else {
    3173             :                 do {
    3174    32917700 :                     xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
    3175    32920900 :                     jnt_2d_no_avg_round_store_8x2_avx2(
    3176             :                         r, offset_no_avg_256, dst, dst_stride);
    3177    32919100 :                     im += 2 * 8;
    3178    32919100 :                     dst += 2 * dst_stride;
    3179    32919100 :                     y -= 2;
    3180    32919100 :                 } while (y);
    3181             :             }
    3182             :         }
    3183     9144730 :         else if (w == 16) {
    3184             :             __m256i s_256[8], r[4];
    3185             : 
    3186     4986720 :             load_16bit_7rows_avx2(im, 16, s_256);
    3187     4986800 :             y = h;
    3188             : 
    3189             :             __m256i ss_256[8], tt_256[8];
    3190             : 
    3191     4986800 :             convolve_8tap_unapck_avx2(s_256, ss_256);
    3192     4986780 :             convolve_8tap_unapck_avx2(s_256 + 1, tt_256);
    3193             : 
    3194     4990510 :             if (conv_params->do_average) {
    3195     1974960 :                 if (conv_params->use_jnt_comp_avg) {
    3196             :                     do {
    3197             :                         xy_y_convolve_8tap_16x2_avx2(
    3198             :                             im, 16, coeffs_256, s_256, ss_256, tt_256, r);
    3199     8170770 :                         jnt_2d_comp_avg_round_store_16x2_avx2(
    3200             :                             r,
    3201             :                             factor_256,
    3202             :                             offset_comp_avg_256,
    3203             :                             dst,
    3204             :                             dst_stride,
    3205             :                             dst8,
    3206             :                             dst8_stride);
    3207     8170660 :                         im += 2 * 16;
    3208     8170660 :                         dst += 2 * dst_stride;
    3209     8170660 :                         dst8 += 2 * dst8_stride;
    3210     8170660 :                         y -= 2;
    3211     8170660 :                     } while (y);
    3212             :                 }
    3213             :                 else {
    3214             :                     do {
    3215             :                         xy_y_convolve_8tap_16x2_avx2(
    3216             :                             im, 16, coeffs_256, s_256, ss_256, tt_256, r);
    3217    12107200 :                         jnt_2d_avg_round_store_16x2_avx2(r,
    3218             :                             offset_avg_256,
    3219             :                             dst,
    3220             :                             dst_stride,
    3221             :                             dst8,
    3222             :                             dst8_stride);
    3223    12105900 :                         im += 2 * 16;
    3224    12105900 :                         dst += 2 * dst_stride;
    3225    12105900 :                         dst8 += 2 * dst8_stride;
    3226    12105900 :                         y -= 2;
    3227    12105900 :                     } while (y);
    3228             :                 }
    3229             :             }
    3230             :             else {
    3231             :                 do {
    3232             :                     xy_y_convolve_8tap_16x2_avx2(
    3233             :                         im, 16, coeffs_256, s_256, ss_256, tt_256, r);
    3234    30792800 :                     jnt_2d_no_avg_round_store_16x2_avx2(
    3235             :                         r, offset_no_avg_256, dst, dst_stride);
    3236    30789300 :                     im += 2 * 16;
    3237    30789300 :                     dst += 2 * dst_stride;
    3238    30789300 :                     y -= 2;
    3239    30789300 :                 } while (y);
    3240             :             }
    3241             :         }
    3242             :         else {
    3243     4158010 :             int32_t x = 0;
    3244             :             __m256i s_256[2][8], r0[4], r1[4];
    3245             : 
    3246     4158010 :             assert(!(w % 32));
    3247             : 
    3248             :             __m256i ss_256[2][8], tt_256[2][8];
    3249             : 
    3250             :             do {
    3251     5120930 :                 const int16_t *s = im + x;
    3252     5120930 :                 ConvBufType *d = dst + x;
    3253     5120930 :                 uint8_t *d8 = dst8 + x;
    3254             : 
    3255     5120930 :                 load_16bit_7rows_avx2(s, w, s_256[0]);
    3256     5134530 :                 convolve_8tap_unapck_avx2(s_256[0], ss_256[0]);
    3257     5134520 :                 convolve_8tap_unapck_avx2(s_256[0] + 1, tt_256[0]);
    3258             : 
    3259     5134430 :                 load_16bit_7rows_avx2(s + 16, w, s_256[1]);
    3260     5134570 :                 convolve_8tap_unapck_avx2(s_256[1], ss_256[1]);
    3261     5134550 :                 convolve_8tap_unapck_avx2(s_256[1] + 1, tt_256[1]);
    3262             : 
    3263     5134530 :                 y = h;
    3264             : 
    3265     5134530 :                 if (conv_params->do_average) {
    3266     2058960 :                     if (conv_params->use_jnt_comp_avg) {
    3267             :                         do {
    3268             :                             xy_y_convolve_8tap_16x2_avx2(s,
    3269             :                                 w,
    3270             :                                 coeffs_256,
    3271             :                                 s_256[0],
    3272             :                                 ss_256[0],
    3273             :                                 tt_256[0],
    3274             :                                 r0);
    3275     9470950 :                             xy_y_convolve_8tap_16x2_avx2(s + 16,
    3276             :                                 w,
    3277             :                                 coeffs_256,
    3278             :                                 s_256[1],
    3279             :                                 ss_256[1],
    3280             :                                 tt_256[1],
    3281             :                                 r1);
    3282             :                             jnt_2d_comp_avg_round_store_32_avx2(
    3283             :                                 r0 + 0,
    3284             :                                 r1 + 0,
    3285             :                                 factor_256,
    3286             :                                 offset_comp_avg_256,
    3287             :                                 d,
    3288             :                                 d8);
    3289     9470940 :                             jnt_2d_comp_avg_round_store_32_avx2(
    3290             :                                 r0 + 2,
    3291             :                                 r1 + 2,
    3292             :                                 factor_256,
    3293             :                                 offset_comp_avg_256,
    3294     9470940 :                                 d + dst_stride,
    3295             :                                 d8 + dst8_stride);
    3296     9470640 :                             s += 2 * w;
    3297     9470640 :                             d += 2 * dst_stride;
    3298     9470640 :                             d8 += 2 * dst8_stride;
    3299     9470640 :                             y -= 2;
    3300     9470640 :                         } while (y);
    3301             :                     }
    3302             :                     else {
    3303             :                         do {
    3304             :                             xy_y_convolve_8tap_16x2_avx2(s,
    3305             :                                 w,
    3306             :                                 coeffs_256,
    3307             :                                 s_256[0],
    3308             :                                 ss_256[0],
    3309             :                                 tt_256[0],
    3310             :                                 r0);
    3311    15503900 :                             xy_y_convolve_8tap_16x2_avx2(s + 16,
    3312             :                                 w,
    3313             :                                 coeffs_256,
    3314             :                                 s_256[1],
    3315             :                                 ss_256[1],
    3316             :                                 tt_256[1],
    3317             :                                 r1);
    3318             :                             jnt_2d_avg_round_store_32_avx2(
    3319             :                                 r0 + 0, r1 + 0, offset_avg_256, d, d8);
    3320    15500300 :                             jnt_2d_avg_round_store_32_avx2(r0 + 2,
    3321             :                                 r1 + 2,
    3322             :                                 offset_avg_256,
    3323    15500300 :                                 d + dst_stride,
    3324             :                                 d8 + dst8_stride);
    3325    15500200 :                             s += 2 * w;
    3326    15500200 :                             d += 2 * dst_stride;
    3327    15500200 :                             d8 += 2 * dst8_stride;
    3328    15500200 :                             y -= 2;
    3329    15500200 :                         } while (y);
    3330             :                     }
    3331             :                 }
    3332             :                 else {
    3333             :                     do {
    3334             :                         xy_y_convolve_8tap_16x2_avx2(s,
    3335             :                             w,
    3336             :                             coeffs_256,
    3337             :                             s_256[0],
    3338             :                             ss_256[0],
    3339             :                             tt_256[0],
    3340             :                             r0);
    3341    37297500 :                         xy_y_convolve_8tap_16x2_avx2(s + 16,
    3342             :                             w,
    3343             :                             coeffs_256,
    3344             :                             s_256[1],
    3345             :                             ss_256[1],
    3346             :                             tt_256[1],
    3347             :                             r1);
    3348    37308100 :                         jnt_2d_no_avg_round_store_32_avx2(
    3349             :                             r0 + 0, r1 + 0, offset_no_avg_256, d);
    3350    37285300 :                         jnt_2d_no_avg_round_store_32_avx2(
    3351    37285300 :                             r0 + 2, r1 + 2, offset_no_avg_256, d + dst_stride);
    3352    37285300 :                         s += 2 * w;
    3353    37285300 :                         d += 2 * dst_stride;
    3354    37285300 :                         y -= 2;
    3355    37285300 :                     } while (y);
    3356             :                 }
    3357             : 
    3358     5125030 :                 x += 32;
    3359     5125030 :             } while (x < w);
    3360             :         }
    3361             :     }
    3362    16483600 : }
    3363             : 
                       /*
                        * Signature shared by the horizontal-pass kernels stored in
                        * jnt_convolve_2d_hor_tap_func_table.
                        *
                        * The dispatcher (eb_av1_jnt_convolve_2d_avx2) calls the selected kernel
                        * with the source pointer and stride, the block width, the number of
                        * rows to produce (it passes its local hh, not the block height), the
                        * horizontal filter parameters and sub-pel position, and im_block.
                        *
                        * im_block is the only non-const buffer in the signature; the dispatcher
                        * hands the same buffer to the vertical pass afterwards, so it is
                        * presumably the kernel's output -- confirm against the kernels.
                        */
    3364             : typedef void(*jnt_convolve_2d_hor_tap_func)(
    3365             :     const uint8_t *src, const int32_t src_stride, const int32_t w,
    3366             :     const int32_t h, const InterpFilterParams *filter_params_x,
    3367             :     const int32_t subpel_x_q4, int16_t *const im_block);
    3368             : 
                       /*
                        * Signature shared by the vertical-pass kernels stored in
                        * jnt_convolve_2d_ver_tap_func_table.
                        *
                        * The dispatcher (eb_av1_jnt_convolve_2d_avx2) calls the selected kernel
                        * with the read-only intermediate block, the block width and height,
                        * the vertical filter parameters and sub-pel position, conv_params, and
                        * the 8-bit destination pointer with its stride.
                        *
                        * NOTE(review): which fields of conv_params a kernel reads is not
                        * visible from this declaration -- see the individual kernels.
                        */
    3369             : typedef void(*jnt_convolve_2d_ver_tap_func)(
    3370             :     const int16_t *const im_block, const int32_t w, const int32_t h,
    3371             :     const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    3372             :     const ConvolveParams *const conv_params, uint8_t *dst8,
    3373             :     const int32_t dst8_stride);
    3374             : 
                       /*
                        * AVX2 entry point for the jnt 2-D convolution. It only dispatches; the
                        * filtering itself is done by two kernels picked from the tables below:
                        *
                        *   1. a horizontal kernel, selected by tap_x, is called with src_ptr and
                        *      the local im_block buffer;
                        *   2. a vertical kernel, selected by tap_y and by whether
                        *      subpel_y_q4 == 8, is called with im_block, conv_params and dst8.
                        *
                        * Parameters:
                        *   src, src_stride    source pixels and their row stride.
                        *   dst8, dst8_stride  8-bit destination and its row stride (forwarded to
                        *                      the vertical kernel).
                        *   w, h               block width and height.
                        *   filter_params_x/y  filter descriptors; only ->filter_ptr is read here,
                        *                      the structures are otherwise forwarded as-is.
                        *   subpel_x_q4/y_q4   sub-pel positions, forwarded to the kernels;
                        *                      subpel_y_q4 == 8 also affects kernel selection.
                        *   conv_params        must satisfy round_0 == 3 and
                        *                      round_1 == COMPOUND_ROUND1_BITS (asserted).
                        *
                        * NOTE(review): get_convolve_tap() is defined elsewhere; it presumably
                        * returns the effective tap count of the filter (2, 4, 6 or 8) -- any
                        * value that maps to a NULL table entry would be called through NULL.
                        */
    3375   148838000 : void eb_av1_jnt_convolve_2d_avx2(const uint8_t *src, int32_t src_stride,
    3376             :     uint8_t *dst8, int32_t dst8_stride, int32_t w,
    3377             :     int32_t h, InterpFilterParams *filter_params_x,
    3378             :     InterpFilterParams *filter_params_y,
    3379             :     const int32_t subpel_x_q4,
    3380             :     const int32_t subpel_y_q4,
    3381             :     ConvolveParams *conv_params) {
                           // Horizontal kernels, indexed directly by tap_x. Only the even
                           // entries 2, 4, 6 and 8 are populated; all others are NULL.
    3382             :     static const jnt_convolve_2d_hor_tap_func
    3383             :         jnt_convolve_2d_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
    3384             :             NULL,
    3385             :             NULL,
    3386             :             jnt_convolve_2d_hor_2tap_avx2,
    3387             :             NULL,
    3388             :             jnt_convolve_2d_hor_4tap_avx2,
    3389             :             NULL,
    3390             :             jnt_convolve_2d_hor_6tap_avx2,
    3391             :             NULL,
    3392             :             jnt_convolve_2d_hor_8tap_avx2 };
                           // Vertical kernels, indexed by tap_y - (subpel_y_q4 == 8). The odd
                           // slot below each tap count is the one used when subpel_y_q4 == 8:
                           // for 2 taps it holds a dedicated "half" kernel, for 4/6/8 taps it
                           // repeats the regular kernel. Entry 0 is NULL.
    3393             :     static const jnt_convolve_2d_ver_tap_func
    3394             :         jnt_convolve_2d_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
    3395             :             NULL,
    3396             :             jnt_convolve_2d_ver_2tap_half_avx2,
    3397             :             jnt_convolve_2d_ver_2tap_avx2,
    3398             :             jnt_convolve_2d_ver_4tap_avx2,
    3399             :             jnt_convolve_2d_ver_4tap_avx2,
    3400             :             jnt_convolve_2d_ver_6tap_avx2,
    3401             :             jnt_convolve_2d_ver_6tap_avx2,
    3402             :             jnt_convolve_2d_ver_8tap_avx2,
    3403             :             jnt_convolve_2d_ver_8tap_avx2 };
    3404   148838000 :     const int32_t tap_x = get_convolve_tap(filter_params_x->filter_ptr);
    3405   148787000 :     const int32_t tap_y = get_convolve_tap(filter_params_y->filter_ptr);
                           // Shift the source start by ((MAX_FILTER_TAP - tap_y) / 2 - 3) rows so
                           // the horizontal pass also covers the rows the vertical filter needs
                           // around the block. Assuming MAX_FILTER_TAP == 8 (TODO confirm) this
                           // is 0 rows for 2 taps and -3 rows for 8 taps.
    3406   148766000 :     const uint8_t *src_ptr =
    3407   148766000 :         src + ((MAX_FILTER_TAP - tap_y) / 2 - 3) * src_stride;
    3408             :     // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
    3409             :     //       permutation.
    3410             :     DECLARE_ALIGNED(
    3411             :     32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
    3412             : 
                           // The kernels are written for these fixed rounding shifts only.
    3413   148766000 :     assert(conv_params->round_0 == 3);
    3414   148766000 :     assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);
    3415             : 
    3416             :     // horizontal filter
    3417             : 
    3418             :     // Have to calculate 1 more row for small widths, since 2 lines are
    3419             :     // calculated in each loop for them.
                           // hh is h + tap_y rows for w < 32 and h + tap_y - 1 rows for w >= 32.
    3420   148766000 :     const int32_t hh = h + tap_y - (w >= 32);
    3421             : 
    3422   148766000 :     jnt_convolve_2d_hor_tap_func_table[tap_x](
    3423             :         src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);
    3424             : 
    3425             :     // vertical filter
                           // (subpel_y_q4 == 8) is 0 or 1, so this picks either the tap_y entry
                           // or the odd entry just below it.
    3426   149206000 :     jnt_convolve_2d_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
    3427             :         im_block,
    3428             :         w,
    3429             :         h,
    3430             :         filter_params_y,
    3431             :         subpel_y_q4,
    3432             :         conv_params,
    3433             :         dst8,
    3434             :         dst8_stride);
    3435   149196000 : }

Generated by: LCOV version 1.14