LCOV - code coverage report
Current view: top level - ASM_AVX2 - highbd_jnt_convolve_avx2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 449 0.0 %
Date: 2019-11-25 17:38:06 Functions: 0 4 0.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : /*
       6             : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       7             : *
       8             : * This source code is subject to the terms of the BSD 2 Clause License and
       9             : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      10             : * was not distributed with this source code in the LICENSE file, you can
      11             : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      12             : * Media Patent License 1.0 was not distributed with this source code in the
      13             : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      14             : */
      15             : 
      16             : #include <immintrin.h>
      17             : #include <assert.h>
      18             : 
      19             : #include "EbDefinitions.h"
      20             : #include "aom_dsp_rtcd.h"
      21             : 
      22             : #include "convolve_avx2.h"
      23             : #include "convolve.h"
      24             : 
                      : // High-bitdepth joint/compound convolve, 2D "copy" special case: no
                      : // sub-pel filtering in either direction, so the kernel reduces to a
                      : // left shift by `bits` plus the compound offset.  When
                      : // conv_params->do_average is set this is the second prediction: the
                      : // shifted pixels are (optionally distance-weighted via fwd/bck offsets)
                      : // averaged with the intermediate stored in conv_params->dst, rounded,
                      : // clipped to `bd` bits and written to dst0.  Otherwise the offset
                      : // intermediate is stored to conv_params->dst for the later average.
       25           0 : void eb_av1_highbd_jnt_convolve_2d_copy_avx2(
       26             :     const uint16_t *src, int32_t src_stride, uint16_t *dst0, int32_t dst_stride0, int32_t w,
       27             :     int32_t h, const InterpFilterParams *filter_params_x,
       28             :     const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
       29             :     const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
       30           0 :     ConvBufType *dst = conv_params->dst;
       31           0 :     int32_t dst_stride = conv_params->dst_stride;
                      :     // Filter/subpel arguments are intentionally unused in this copy path.
       32             :     (void)filter_params_x;
       33             :     (void)filter_params_y;
       34             :     (void)subpel_x_q4;
       35             :     (void)subpel_y_q4;
       36             : 
                      :     // Shift that lifts raw pixels to the compound intermediate precision
                      :     // (both filter stages were skipped, so their bits are applied here).
       37           0 :     const int32_t bits =
       38           0 :         FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
       39           0 :     const __m128i left_shift = _mm_cvtsi32_si128(bits);
       40           0 :     const int32_t do_average = conv_params->do_average;
       41           0 :     const int32_t use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
       42           0 :     const int32_t w0 = conv_params->fwd_offset;
       43           0 :     const int32_t w1 = conv_params->bck_offset;
       44           0 :     const __m256i wt0 = _mm256_set1_epi32(w0);
       45           0 :     const __m256i wt1 = _mm256_set1_epi32(w1);
       46           0 :     const __m256i zero = _mm256_setzero_si256();
       47             :     int32_t i, j;
       48             : 
                      :     // Compound offset added to every value so the stored intermediate is
                      :     // non-negative; highbd_convolve_rounding removes it again on output.
       49           0 :     const int32_t offset_0 =
       50           0 :         bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
       51           0 :     const int32_t offset = (1 << offset_0) + (1 << (offset_0 - 1));
       52           0 :     const __m256i offset_const = _mm256_set1_epi32(offset);
       53           0 :     const __m256i offset_const_16b = _mm256_set1_epi16(offset);
       54           0 :     const int32_t rounding_shift =
       55           0 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
       56           0 :     const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
       57             :     const __m256i clip_pixel_to_bd =
       58           0 :         _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
       59             : 
       60           0 :     assert(bits <= 4);
       61             : 
                      :     // Wide path: width is a multiple of 16, one full 256-bit (16 x u16)
                      :     // vector per inner-loop step, one row at a time.
       62           0 :     if (!(w % 16)) {
       63           0 :         for (i = 0; i < h; i += 1) {
       64           0 :             for (j = 0; j < w; j += 16) {
       65             :                 const __m256i src_16bit =
       66           0 :                     _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
       67             : 
       68           0 :                 const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
       69             : 
                      :                 // Second compound pass: widen both the stored prediction
                      :                 // and the new one to 32 bits, blend, round, clip, pack.
       70           0 :                 if (do_average) {
       71             :                     const __m256i data_0 =
       72           0 :                         _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
       73             : 
       74           0 :                     const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
       75           0 :                     const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);
       76             : 
       77           0 :                     const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
       78           0 :                     const __m256i res_unsigned_lo =
       79           0 :                         _mm256_add_epi32(res_32b_lo, offset_const);
       80             : 
       81           0 :                     const __m256i comp_avg_res_lo = highbd_comp_avg(
       82             :                         &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
       83             : 
       84           0 :                     const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
       85           0 :                     const __m256i res_unsigned_hi =
       86           0 :                         _mm256_add_epi32(res_32b_hi, offset_const);
       87             : 
       88           0 :                     const __m256i comp_avg_res_hi = highbd_comp_avg(
       89             :                         &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
       90             : 
       91           0 :                     const __m256i round_result_lo = highbd_convolve_rounding(
       92             :                         &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
       93           0 :                     const __m256i round_result_hi = highbd_convolve_rounding(
       94             :                         &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
       95             : 
       96             :                     const __m256i res_16b =
       97           0 :                         _mm256_packus_epi32(round_result_lo, round_result_hi);
       98           0 :                     const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
       99             : 
      100           0 :                     _mm256_storeu_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
      101             :                 }
      102             :                 else {
                      :                     // First compound pass: saturating-add the 16-bit offset
                      :                     // and store the intermediate for the later average.
      103             :                     const __m256i res_unsigned_16b =
      104           0 :                         _mm256_adds_epu16(res, offset_const_16b);
      105             : 
      106           0 :                     _mm256_storeu_si256((__m256i *)(&dst[i * dst_stride + j]),
      107             :                         res_unsigned_16b);
      108             :                 }
      109             :             }
      110             :         }
      111             :     }
                      :     // Narrow path: width is a multiple of 4 (4, 8, 12, ...).  Two rows
                      :     // are packed into one 256-bit register (one per 128-bit lane).
                      :     // NOTE(review): rows are consumed in pairs, so this path assumes h
                      :     // is even — confirm call sites guarantee that.
      112           0 :     else if (!(w % 4)) {
      113           0 :         for (i = 0; i < h; i += 2) {
      114           0 :             for (j = 0; j < w; j += 8) {
      115             :                 const __m128i src_row_0 =
      116           0 :                     _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
      117             :                 const __m128i src_row_1 =
      118           0 :                     _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
      119             :                 // since not all compilers yet support _mm256_set_m128i()
      120           0 :                 const __m256i src_10 = _mm256_insertf128_si256(
      121             :                     _mm256_castsi128_si256(src_row_0), src_row_1, 1);
      122             : 
      123           0 :                 const __m256i res = _mm256_sll_epi16(src_10, left_shift);
      124             : 
                      :                 // 4-pixel tail: only the low 64 bits of each lane are
                      :                 // valid, so use 64-bit loads/stores per row.
      125           0 :                 if (w - j < 8) {
      126           0 :                     if (do_average) {
      127           0 :                         const __m256i data_0 = _mm256_castsi128_si256(
      128           0 :                             _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
      129           0 :                         const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
      130           0 :                             (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
      131             :                         const __m256i data_01 =
      132           0 :                             _mm256_permute2x128_si256(data_0, data_1, 0x20);
      133             : 
      134           0 :                         const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
      135             : 
      136           0 :                         const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
      137           0 :                         const __m256i res_unsigned_lo =
      138           0 :                             _mm256_add_epi32(res_32b, offset_const);
      139             : 
      140           0 :                         const __m256i comp_avg_res = highbd_comp_avg(
      141             :                             &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
      142             : 
      143           0 :                         const __m256i round_result = highbd_convolve_rounding(
      144             :                             &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
      145             : 
      146             :                         const __m256i res_16b =
      147           0 :                             _mm256_packus_epi32(round_result, round_result);
      148             :                         const __m256i res_clip =
      149           0 :                             _mm256_min_epi16(res_16b, clip_pixel_to_bd);
      150             : 
      151           0 :                         const __m128i res_0 = _mm256_castsi256_si128(res_clip);
      152           0 :                         const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
      153             : 
      154           0 :                         _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
      155           0 :                         _mm_storel_epi64(
      156           0 :                             (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
      157             :                     }
      158             :                     else {
      159             :                         const __m256i res_unsigned_16b =
      160           0 :                             _mm256_adds_epu16(res, offset_const_16b);
      161             : 
      162           0 :                         const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
      163           0 :                         const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
      164             : 
      165           0 :                         _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
      166           0 :                         _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
      167             :                             res_1);
      168             :                     }
      169             :                 }
                      :                 // Full 8-pixel step: 128-bit load/store per row.
      170             :                 else {
      171           0 :                     if (do_average) {
      172           0 :                         const __m256i data_0 = _mm256_castsi128_si256(
      173           0 :                             _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
      174           0 :                         const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
      175           0 :                             (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
      176             :                         const __m256i data_01 =
      177           0 :                             _mm256_permute2x128_si256(data_0, data_1, 0x20);
      178             : 
      179           0 :                         const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
      180           0 :                         const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
      181             : 
      182           0 :                         const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
      183           0 :                         const __m256i res_unsigned_lo =
      184           0 :                             _mm256_add_epi32(res_32b_lo, offset_const);
      185             : 
      186           0 :                         const __m256i comp_avg_res_lo = highbd_comp_avg(
      187             :                             &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
      188             : 
      189           0 :                         const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
      190           0 :                         const __m256i res_unsigned_hi =
      191           0 :                             _mm256_add_epi32(res_32b_hi, offset_const);
      192             : 
      193           0 :                         const __m256i comp_avg_res_hi = highbd_comp_avg(
      194             :                             &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
      195             : 
      196             :                         const __m256i round_result_lo =
      197           0 :                             highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
      198             :                                 &rounding_const, rounding_shift);
      199             :                         const __m256i round_result_hi =
      200           0 :                             highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
      201             :                                 &rounding_const, rounding_shift);
      202             : 
      203             :                         const __m256i res_16b =
      204           0 :                             _mm256_packus_epi32(round_result_lo, round_result_hi);
      205             :                         const __m256i res_clip =
      206           0 :                             _mm256_min_epi16(res_16b, clip_pixel_to_bd);
      207             : 
      208           0 :                         const __m128i res_0 = _mm256_castsi256_si128(res_clip);
      209           0 :                         const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
      210             : 
      211           0 :                         _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
      212             :                         _mm_store_si128(
      213           0 :                             (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
      214             :                     }
      215             :                     else {
      216             :                         const __m256i res_unsigned_16b =
      217           0 :                             _mm256_adds_epu16(res, offset_const_16b);
      218           0 :                         const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
      219           0 :                         const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
      220             : 
      221           0 :                         _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
      222           0 :                         _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
      223             :                             res_1);
      224             :                     }
      225             :                 }
      226             :             }
      227             :         }
      228             :     }
      229           0 : }
     230             : 
                      : // High-bitdepth jnt convolve, full 2D path: an 8-tap horizontal pass
                      : // writes an (h + taps - 1)-row, 8-column strip of 16-bit intermediates
                      : // into im_block (im_stride = 8), then an 8-tap vertical pass filters
                      : // that strip, adds the compound offset and either stores the
                      : // intermediate to conv_params->dst (first pass) or averages with it,
                      : // rounds, clips and writes pixels to dst0 (do_average).  The image is
                      : // processed in 8-pixel-wide column strips (outer j loop), two rows at
                      : // a time in the vertical pass.
      231           0 : void eb_av1_highbd_jnt_convolve_2d_avx2(
      232             :     const uint16_t *src, int32_t src_stride, uint16_t *dst0, int32_t dst_stride0, int32_t w,
      233             :     int32_t h, const InterpFilterParams *filter_params_x,
      234             :     const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
      235             :     const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
      236             :     DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
      237           0 :     ConvBufType *dst = conv_params->dst;
      238           0 :     int32_t dst_stride = conv_params->dst_stride;
      239           0 :     int32_t im_h = h + filter_params_y->taps - 1;
      240           0 :     int32_t im_stride = 8;
      241             :     int32_t i, j;
      242           0 :     const int32_t fo_vert = filter_params_y->taps / 2 - 1;
      243           0 :     const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
                      :     // Back the source pointer up so the taps are centered on the output
                      :     // pixel in both dimensions.
      244           0 :     const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
      245             : 
      246             :     // Check that, even with 12-bit input, the intermediate values will fit
      247             :     // into an unsigned 16-bit intermediate array.
      248           0 :     assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
      249             : 
      250             :     __m256i s[8], coeffs_y[4], coeffs_x[4];
      251           0 :     const int32_t do_average = conv_params->do_average;
      252           0 :     const int32_t use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
      253             : 
      254           0 :     const int32_t w0 = conv_params->fwd_offset;
      255           0 :     const int32_t w1 = conv_params->bck_offset;
      256           0 :     const __m256i wt0 = _mm256_set1_epi32(w0);
      257           0 :     const __m256i wt1 = _mm256_set1_epi32(w1);
      258           0 :     const __m256i zero = _mm256_setzero_si256();
      259             : 
                      :     // Horizontal rounding also pre-adds (1 << (bd + FILTER_BITS - 1)) so
                      :     // the 16-bit intermediates stay non-negative; the vertical rounding
                      :     // constant subtracts that bias back out (scaled through round_0).
      260           0 :     const __m256i round_const_x = _mm256_set1_epi32(
      261           0 :         ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
      262           0 :     const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
      263             : 
      264           0 :     const __m256i round_const_y = _mm256_set1_epi32(
      265           0 :         ((1 << conv_params->round_1) >> 1) -
      266           0 :         (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
      267           0 :     const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
      268             : 
                      :     // Compound offset / final rounding — same scheme as the copy kernel.
      269           0 :     const int32_t offset_0 =
      270           0 :         bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
      271           0 :     const int32_t offset = (1 << offset_0) + (1 << (offset_0 - 1));
      272           0 :     const __m256i offset_const = _mm256_set1_epi32(offset);
      273           0 :     const int32_t rounding_shift =
      274           0 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
      275           0 :     const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
      276             : 
      277             :     const __m256i clip_pixel_to_bd =
      278           0 :         _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
      279             : 
      280           0 :     prepare_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_x);
      281           0 :     prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_y);
      282             : 
      283           0 :     for (j = 0; j < w; j += 8) {
      284             :         /* Horizontal filter */
      285             :         {
                      :             // Two source rows per iteration, rearranged so each 128-bit
                      :             // lane holds one row; even/odd output pixels are filtered
                      :             // separately and re-interleaved before the store.
      286           0 :             for (i = 0; i < im_h; i += 2) {
      287             :                 const __m256i row0 =
      288           0 :                     _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
      289           0 :                 __m256i row1 = _mm256_set1_epi16(0);
                      :                 // Odd-height strip tail: no row i+1 to load, substitute
                      :                 // zeros rather than reading past the strip.
      290           0 :                 if (i + 1 < im_h)
      291             :                     row1 =
      292           0 :                     _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
      293             : 
      294           0 :                 const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
      295           0 :                 const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
      296             : 
      297             :                 // even pixels
      298           0 :                 s[0] = _mm256_alignr_epi8(r1, r0, 0);
      299           0 :                 s[1] = _mm256_alignr_epi8(r1, r0, 4);
      300           0 :                 s[2] = _mm256_alignr_epi8(r1, r0, 8);
      301           0 :                 s[3] = _mm256_alignr_epi8(r1, r0, 12);
      302             : 
      303           0 :                 __m256i res_even = convolve16_8tap_avx2(s, coeffs_x);
      304           0 :                 res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
      305             :                     round_shift_x);
      306             : 
      307             :                 // odd pixels
      308           0 :                 s[0] = _mm256_alignr_epi8(r1, r0, 2);
      309           0 :                 s[1] = _mm256_alignr_epi8(r1, r0, 6);
      310           0 :                 s[2] = _mm256_alignr_epi8(r1, r0, 10);
      311           0 :                 s[3] = _mm256_alignr_epi8(r1, r0, 14);
      312             : 
      313           0 :                 __m256i res_odd = convolve16_8tap_avx2(s, coeffs_x);
      314           0 :                 res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
      315             :                     round_shift_x);
      316             : 
      317           0 :                 __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
      318           0 :                 __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
      319           0 :                 __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
      320             : 
      321           0 :                 _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
      322             :             }
      323             :         }
      324             : 
      325             :         /* Vertical filter */
      326             :         {
                      :             // Prime the 8-tap sliding window with the first six strip
                      :             // rows; s[0..3] feed the low (res_a) half of each output
                      :             // pair, s[4..7] the high (res_b) half.
      327           0 :             __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
      328           0 :             __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
      329           0 :             __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
      330           0 :             __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
      331           0 :             __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
      332           0 :             __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
      333             : 
      334           0 :             s[0] = _mm256_unpacklo_epi16(s0, s1);
      335           0 :             s[1] = _mm256_unpacklo_epi16(s2, s3);
      336           0 :             s[2] = _mm256_unpacklo_epi16(s4, s5);
      337             : 
      338           0 :             s[4] = _mm256_unpackhi_epi16(s0, s1);
      339           0 :             s[5] = _mm256_unpackhi_epi16(s2, s3);
      340           0 :             s[6] = _mm256_unpackhi_epi16(s4, s5);
      341             : 
      342           0 :             for (i = 0; i < h; i += 2) {
      343           0 :                 const int16_t *data = &im_block[i * im_stride];
      344             : 
      345             :                 const __m256i s6 =
      346           0 :                     _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
      347             :                 const __m256i s7 =
      348           0 :                     _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
      349             : 
      350           0 :                 s[3] = _mm256_unpacklo_epi16(s6, s7);
      351           0 :                 s[7] = _mm256_unpackhi_epi16(s6, s7);
      352             : 
      353           0 :                 const __m256i res_a = convolve16_8tap_avx2(s, coeffs_y);
      354             : 
      355           0 :                 const __m256i res_a_round = _mm256_sra_epi32(
      356             :                     _mm256_add_epi32(res_a, round_const_y), round_shift_y);
      357             : 
      358           0 :                 const __m256i res_unsigned_lo =
      359           0 :                     _mm256_add_epi32(res_a_round, offset_const);
      360             : 
                      :                 // Tail narrower than 8 pixels: only the low half (res_a)
                      :                 // is valid, so use 64-bit loads/stores per row.
      361           0 :                 if (w - j < 8) {
      362           0 :                     if (do_average) {
      363           0 :                         const __m256i data_0 = _mm256_castsi128_si256(
      364           0 :                             _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
      365           0 :                         const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
      366           0 :                             (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
      367             :                         const __m256i data_01 =
      368           0 :                             _mm256_permute2x128_si256(data_0, data_1, 0x20);
      369             : 
      370           0 :                         const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
      371             : 
      372           0 :                         const __m256i comp_avg_res = highbd_comp_avg(
      373             :                             &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
      374             : 
      375           0 :                         const __m256i round_result = highbd_convolve_rounding(
      376             :                             &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
      377             : 
      378             :                         const __m256i res_16b =
      379           0 :                             _mm256_packus_epi32(round_result, round_result);
      380             :                         const __m256i res_clip =
      381           0 :                             _mm256_min_epi16(res_16b, clip_pixel_to_bd);
      382             : 
      383           0 :                         const __m128i res_0 = _mm256_castsi256_si128(res_clip);
      384           0 :                         const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
      385             : 
      386           0 :                         _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
      387           0 :                         _mm_storel_epi64(
      388           0 :                             (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
      389             :                     }
      390             :                     else {
      391             :                         __m256i res_16b =
      392           0 :                             _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
      393           0 :                         const __m128i res_0 = _mm256_castsi256_si128(res_16b);
      394           0 :                         const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
      395             : 
      396           0 :                         _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
      397           0 :                         _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
      398             :                             res_1);
      399             :                     }
      400             :                 }
      401             :                 else {
                      :                     // Full 8-wide step: also compute the high half (res_b).
      402           0 :                     const __m256i res_b = convolve16_8tap_avx2(s + 4, coeffs_y);
      403           0 :                     const __m256i res_b_round = _mm256_sra_epi32(
      404             :                         _mm256_add_epi32(res_b, round_const_y), round_shift_y);
      405             : 
      406           0 :                     __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
      407             : 
      408           0 :                     if (do_average) {
      409           0 :                         const __m256i data_0 = _mm256_castsi128_si256(
      410           0 :                             _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
      411           0 :                         const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
      412           0 :                             (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
      413             :                         const __m256i data_01 =
      414           0 :                             _mm256_permute2x128_si256(data_0, data_1, 0x20);
      415             : 
      416           0 :                         const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
      417           0 :                         const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
      418             : 
      419           0 :                         const __m256i comp_avg_res_lo = highbd_comp_avg(
      420             :                             &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
      421           0 :                         const __m256i comp_avg_res_hi = highbd_comp_avg(
      422             :                             &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
      423             : 
      424             :                         const __m256i round_result_lo =
      425           0 :                             highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
      426             :                                 &rounding_const, rounding_shift);
      427             :                         const __m256i round_result_hi =
      428           0 :                             highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
      429             :                                 &rounding_const, rounding_shift);
      430             : 
      431             :                         const __m256i res_16b =
      432           0 :                             _mm256_packus_epi32(round_result_lo, round_result_hi);
      433             :                         const __m256i res_clip =
      434           0 :                             _mm256_min_epi16(res_16b, clip_pixel_to_bd);
      435             : 
      436           0 :                         const __m128i res_0 = _mm256_castsi256_si128(res_clip);
      437           0 :                         const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
      438             : 
      439           0 :                         _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
      440             :                         _mm_store_si128(
      441           0 :                             (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
      442             :                     }
      443             :                     else {
      444             :                         __m256i res_16b =
      445           0 :                             _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
      446           0 :                         const __m128i res_0 = _mm256_castsi256_si128(res_16b);
      447           0 :                         const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
      448             : 
      449           0 :                         _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
      450           0 :                         _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
      451             :                             res_1);
      452             :                     }
      453             :                 }
      454             : 
                      :                 // Slide the 8-tap window down one row pair.
      455           0 :                 s[0] = s[1];
      456           0 :                 s[1] = s[2];
      457           0 :                 s[2] = s[3];
      458             : 
      459           0 :                 s[4] = s[5];
      460           0 :                 s[5] = s[6];
      461           0 :                 s[6] = s[7];
      462             :             }
      463             :         }
      464             :     }
      465           0 : }
     466             : 
/* High-bitdepth horizontal-only (x) jnt/compound convolve, AVX2.
 *
 * Applies an 8-tap horizontal filter (selected by filter_params_x and
 * subpel_x_q4) to `src`, producing either:
 *   - do_average == 0: intermediate compound results written to
 *     conv_params->dst (ConvBufType, 16-bit, with the compound offset
 *     still added in), or
 *   - do_average == 1: the second pass of a compound prediction, where the
 *     new result is averaged (optionally jnt-weighted by fwd_offset /
 *     bck_offset when use_jnt_comp_avg is set) with the data already in
 *     conv_params->dst, rounded, clipped to `bd` bits, and written to dst0.
 *
 * filter_params_y / subpel_y_q4 are unused (x-only path).
 * Processes two rows per iteration and 8 columns per j-step; the
 * `w - j < 8` branch handles a final 4-wide column group.
 * NOTE(review): h is assumed even and w a multiple of 4 — implied by the
 * 2-row / 4-or-8-column store pattern; confirm against callers. */
void eb_av1_highbd_jnt_convolve_x_avx2(
    const uint16_t *src, int32_t src_stride, uint16_t *dst0, int32_t dst_stride0, int32_t w,
    int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
    const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
    ConvBufType *dst = conv_params->dst;            /* compound intermediate buffer */
    int32_t dst_stride = conv_params->dst_stride;
    const int32_t fo_horiz = filter_params_x->taps / 2 - 1;  /* filter left offset */
    const uint16_t *const src_ptr = src - fo_horiz;          /* back up so tap 0 aligns */
    const int32_t bits = FILTER_BITS - conv_params->round_1; /* post-filter up-shift */
    (void)filter_params_y;
    (void)subpel_y_q4;

    int32_t i, j;
    __m256i s[4], coeffs_x[4];

    const int32_t do_average = conv_params->do_average;
    const int32_t use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
    /* Distance-weighted (jnt) compound weights for the two references. */
    const int32_t w0 = conv_params->fwd_offset;
    const int32_t w1 = conv_params->bck_offset;
    const __m256i wt0 = _mm256_set1_epi32(w0);
    const __m256i wt1 = _mm256_set1_epi32(w1);
    const __m256i zero = _mm256_setzero_si256();

    /* Rounding constant/shift for the horizontal (round_0) stage. */
    const __m256i round_const_x =
        _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
    const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
    const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);

    /* Compound offset keeps intermediate values non-negative so they fit
     * in the unsigned 16-bit intermediate buffer. */
    const int32_t offset_0 =
        bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    const int32_t offset = (1 << offset_0) + (1 << (offset_0 - 1));
    const __m256i offset_const = _mm256_set1_epi32(offset);
    const int32_t rounding_shift =
        2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
    /* Max pixel value for the target bit depth (8/10/12 bits). */
    const __m256i clip_pixel_to_bd =
        _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

    assert(bits >= 0);
    prepare_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_x);

    for (j = 0; j < w; j += 8) {
        /* Horizontal filter */
        for (i = 0; i < h; i += 2) {
            /* Load 16 pixels from each of two consecutive rows, then
             * regroup so r0/r1 hold matching 128-bit halves of both rows. */
            const __m256i row0 =
                _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
            __m256i row1 =
                _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

            const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
            const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

            /* even pixels: byte offsets 0/4/8/12 give pixel pairs starting
             * at even positions (16-bit pixels, madd consumes pairs). */
            s[0] = _mm256_alignr_epi8(r1, r0, 0);
            s[1] = _mm256_alignr_epi8(r1, r0, 4);
            s[2] = _mm256_alignr_epi8(r1, r0, 8);
            s[3] = _mm256_alignr_epi8(r1, r0, 12);

            __m256i res_even = convolve16_8tap_avx2(s, coeffs_x);
            res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                round_shift_x);

            /* odd pixels: same windows shifted by one 16-bit pixel. */
            s[0] = _mm256_alignr_epi8(r1, r0, 2);
            s[1] = _mm256_alignr_epi8(r1, r0, 6);
            s[2] = _mm256_alignr_epi8(r1, r0, 10);
            s[3] = _mm256_alignr_epi8(r1, r0, 14);

            __m256i res_odd = convolve16_8tap_avx2(s, coeffs_x);
            res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                round_shift_x);

            /* Pre-scale by (FILTER_BITS - round_1) so the compound buffer
             * holds values at the common intermediate precision. */
            res_even = _mm256_sll_epi32(res_even, round_shift_bits);
            res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);

            /* Interleave even/odd back into pixel order (low 4 of each row). */
            __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);

            __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);

            if (w - j < 8) {
                /* 4-wide tail: only the low half of each lane is valid. */
                if (do_average) {
                    /* Load 4 previously stored compound values per row. */
                    const __m256i data_0 = _mm256_castsi128_si256(
                        _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
                    const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
                        (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
                    const __m256i data_01 =
                        _mm256_permute2x128_si256(data_0, data_1, 0x20);

                    /* Widen stored u16 values to 32 bits for averaging. */
                    const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);

                    /* Plain or jnt-weighted average with the first pass. */
                    const __m256i comp_avg_res = highbd_comp_avg(
                        &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

                    /* Remove the compound offset and round to output precision. */
                    const __m256i round_result = highbd_convolve_rounding(
                        &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

                    const __m256i res_16b =
                        _mm256_packus_epi32(round_result, round_result);
                    const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);

                    const __m128i res_0 = _mm256_castsi256_si128(res_clip);
                    const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

                    _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
                    _mm_storel_epi64(
                        (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
                }
                else {
                    /* First pass: store offset intermediates for later averaging. */
                    __m256i res_16b =
                        _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
                    const __m128i res_0 = _mm256_castsi256_si128(res_16b);
                    const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

                    _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
                    _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                        res_1);
                }
            }
            else {
                /* Full 8-wide group: also need the high 4 pixels of each row. */
                __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
                __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);

                if (do_average) {
                    const __m256i data_0 = _mm256_castsi128_si256(
                        _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
                    const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
                        (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
                    const __m256i data_01 =
                        _mm256_permute2x128_si256(data_0, data_1, 0x20);

                    const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
                    const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);

                    const __m256i comp_avg_res_lo = highbd_comp_avg(
                        &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
                    const __m256i comp_avg_res_hi = highbd_comp_avg(
                        &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

                    const __m256i round_result_lo = highbd_convolve_rounding(
                        &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
                    const __m256i round_result_hi = highbd_convolve_rounding(
                        &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

                    /* Pack to 16 bits and clamp to the bit-depth maximum. */
                    const __m256i res_16b =
                        _mm256_packus_epi32(round_result_lo, round_result_hi);
                    const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);

                    const __m128i res_0 = _mm256_castsi256_si128(res_clip);
                    const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

                    _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
                    _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
                        res_1);
                }
                else {
                    __m256i res_16b =
                        _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
                    const __m128i res_0 = _mm256_castsi256_si128(res_16b);
                    const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

                    _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
                    _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                        res_1);
                }
            }
        }
    }
}
     636             : 
/* High-bitdepth vertical-only (y) jnt/compound convolve, AVX2.
 *
 * Applies an 8-tap vertical filter (filter_params_y / subpel_y_q4) to
 * `src`. As in the x path:
 *   - do_average == 0: offset intermediates go to conv_params->dst;
 *   - do_average == 1: results are (optionally jnt-weighted) averaged with
 *     conv_params->dst, rounded, clipped to `bd` bits, written to dst0.
 *
 * filter_params_x / subpel_x_q4 are unused (y-only path).
 * The vertical window is kept in registers: s[0..3] hold interleaved row
 * pairs for the low 4 columns, s[4..7] for the high 4; each 2-row
 * iteration loads two new rows and rotates the window.
 * NOTE(review): h is assumed even and w a multiple of 4 — implied by the
 * 2-row / 4-or-8-column store pattern; confirm against callers. */
void eb_av1_highbd_jnt_convolve_y_avx2(
    const uint16_t *src, int32_t src_stride, uint16_t *dst0, int32_t dst_stride0, int32_t w,
    int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
    const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
    ConvBufType *dst = conv_params->dst;            /* compound intermediate buffer */
    int32_t dst_stride = conv_params->dst_stride;
    const int32_t fo_vert = filter_params_y->taps / 2 - 1;   /* filter top offset */
    const uint16_t *const src_ptr = src - fo_vert * src_stride;
    const int32_t bits = FILTER_BITS - conv_params->round_0; /* pre-round up-shift */
    (void)filter_params_x;
    (void)subpel_x_q4;

    assert(bits >= 0);
    int32_t i, j;
    __m256i s[8], coeffs_y[4];
    const int32_t do_average = conv_params->do_average;
    const int32_t use_jnt_comp_avg = conv_params->use_jnt_comp_avg;

    /* Distance-weighted (jnt) compound weights for the two references. */
    const int32_t w0 = conv_params->fwd_offset;
    const int32_t w1 = conv_params->bck_offset;
    const __m256i wt0 = _mm256_set1_epi32(w0);
    const __m256i wt1 = _mm256_set1_epi32(w1);
    /* Rounding constant/shift for the vertical (round_1) stage. */
    const __m256i round_const_y =
        _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
    const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
    const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);

    /* Compound offset keeps intermediates non-negative for the u16 buffer. */
    const int32_t offset_0 =
        bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    const int32_t offset = (1 << offset_0) + (1 << (offset_0 - 1));
    const __m256i offset_const = _mm256_set1_epi32(offset);
    const int32_t rounding_shift =
        2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
    /* Max pixel value for the target bit depth (8/10/12 bits). */
    const __m256i clip_pixel_to_bd =
        _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
    const __m256i zero = _mm256_setzero_si256();

    prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_y);

    for (j = 0; j < w; j += 8) {
        const uint16_t *data = &src_ptr[j];
        /* Vertical filter */
        {
            /* Prime the window: sNM pairs row N (low lane) with row M (high
             * lane); unpacklo/hi interleave the two rows 16-bit-wise so
             * madd-based convolve16_8tap_avx2 can consume row pairs. */
            __m256i src6;
            __m256i s01 = _mm256_permute2x128_si256(
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
                0x20);
            __m256i s12 = _mm256_permute2x128_si256(
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
                0x20);
            __m256i s23 = _mm256_permute2x128_si256(
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
                0x20);
            __m256i s34 = _mm256_permute2x128_si256(
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
                0x20);
            __m256i s45 = _mm256_permute2x128_si256(
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
                0x20);
            /* src6 is carried across iterations to avoid a reload. */
            src6 = _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
            __m256i s56 = _mm256_permute2x128_si256(
                _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
                src6, 0x20);

            /* Low 4 columns of the interleaved row pairs. */
            s[0] = _mm256_unpacklo_epi16(s01, s12);
            s[1] = _mm256_unpacklo_epi16(s23, s34);
            s[2] = _mm256_unpacklo_epi16(s45, s56);

            /* High 4 columns. */
            s[4] = _mm256_unpackhi_epi16(s01, s12);
            s[5] = _mm256_unpackhi_epi16(s23, s34);
            s[6] = _mm256_unpackhi_epi16(s45, s56);

            for (i = 0; i < h; i += 2) {
                data = &src_ptr[i * src_stride + j];

                /* Bring in the next two rows (7 and 8) for this 2-row step. */
                const __m256i s67 = _mm256_permute2x128_si256(
                    src6,
                    _mm256_castsi128_si256(
                        _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
                    0x20);

                src6 = _mm256_castsi128_si256(
                    _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));

                const __m256i s78 = _mm256_permute2x128_si256(
                    _mm256_castsi128_si256(
                        _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
                    src6, 0x20);

                s[3] = _mm256_unpacklo_epi16(s67, s78);
                s[7] = _mm256_unpackhi_epi16(s67, s78);

                /* 8-tap vertical filter on the low 4 columns of both rows. */
                const __m256i res_a = convolve16_8tap_avx2(s, coeffs_y);

                /* Up-shift to intermediate precision, then round_1 round. */
                __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
                res_a_round = _mm256_sra_epi32(
                    _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);

                __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);

                if (w - j < 8) {
                    /* 4-wide tail: only the low half is valid. */
                    if (do_average) {
                        /* Load 4 previously stored compound values per row. */
                        const __m256i data_0 = _mm256_castsi128_si256(
                            _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
                        const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
                            (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
                        const __m256i data_01 =
                            _mm256_permute2x128_si256(data_0, data_1, 0x20);

                        /* Widen stored u16 values to 32 bits for averaging. */
                        const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);

                        /* Plain or jnt-weighted average with the first pass. */
                        const __m256i comp_avg_res = highbd_comp_avg(
                            &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

                        /* Remove compound offset, round to output precision. */
                        const __m256i round_result = highbd_convolve_rounding(
                            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

                        const __m256i res_16b =
                            _mm256_packus_epi32(round_result, round_result);
                        const __m256i res_clip =
                            _mm256_min_epi16(res_16b, clip_pixel_to_bd);

                        const __m128i res_0 = _mm256_castsi256_si128(res_clip);
                        const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

                        _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
                        _mm_storel_epi64(
                            (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
                    }
                    else {
                        /* First pass: store offset intermediates. */
                        __m256i res_16b =
                            _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
                        const __m128i res_0 = _mm256_castsi256_si128(res_16b);
                        const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

                        _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
                        _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                            res_1);
                    }
                }
                else {
                    /* Full 8-wide group: also filter the high 4 columns. */
                    const __m256i res_b = convolve16_8tap_avx2(s + 4, coeffs_y);
                    __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
                    res_b_round = _mm256_sra_epi32(
                        _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);

                    __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);

                    if (do_average) {
                        const __m256i data_0 = _mm256_castsi128_si256(
                            _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
                        const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
                            (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
                        const __m256i data_01 =
                            _mm256_permute2x128_si256(data_0, data_1, 0x20);

                        const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
                        const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);

                        const __m256i comp_avg_res_lo = highbd_comp_avg(
                            &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
                        const __m256i comp_avg_res_hi = highbd_comp_avg(
                            &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

                        const __m256i round_result_lo =
                            highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
                                &rounding_const, rounding_shift);
                        const __m256i round_result_hi =
                            highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
                                &rounding_const, rounding_shift);

                        /* Pack to 16 bits and clamp to the bit-depth maximum. */
                        const __m256i res_16b =
                            _mm256_packus_epi32(round_result_lo, round_result_hi);
                        const __m256i res_clip =
                            _mm256_min_epi16(res_16b, clip_pixel_to_bd);

                        const __m128i res_0 = _mm256_castsi256_si128(res_clip);
                        const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

                        _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
                        _mm_store_si128(
                            (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
                    }
                    else {
                        __m256i res_16b =
                            _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
                        const __m128i res_0 = _mm256_castsi256_si128(res_16b);
                        const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

                        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
                        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                            res_1);
                    }
                }
                /* Slide the 8-tap window down by two rows for the next step. */
                s[0] = s[1];
                s[1] = s[2];
                s[2] = s[3];

                s[4] = s[5];
                s[5] = s[6];
                s[6] = s[7];
            }
        }
    }
}

Generated by: LCOV version 1.14