LCOV - code coverage report
Current view: top level - ASM_AVX2 - convolve_2d_avx2.c (source / functions)
Test: coverage.info          Lines:     135 hit / 179 total (75.4 %)
Date: 2019-11-25 17:38:06    Functions:   2 hit /   3 total (66.7 %)

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <immintrin.h>
      13             : #include "EbDefinitions.h"
      14             : #include "EbMemory_SSE4_1.h"
      15             : #include "aom_dsp_rtcd.h"
      16             : #include "convolve.h"
      17             : #include "convolve_avx2.h"
      18             : #include "synonyms.h"
      19             : 
      20             : #if !OBMC_CONVOLVE
      21             : static INLINE void xy_y_round_store_8x2_avx2(const __m256i res[2],
      22             :     uint8_t *const dst,
      23             :     const int32_t stride) {
      24             :     const __m256i r = xy_y_round_16_avx2(res);
      25             :     pack_store_8x2_avx2(r, dst, stride);
      26             : }
      27             : 
      28             : static INLINE void xy_y_round_store_16x2_avx2(const __m256i res[4],
      29             :     uint8_t *const dst,
      30             :     const int32_t stride) {
      31             :     const __m256i r0 = xy_y_round_16_avx2(res + 0);
      32             :     const __m256i r1 = xy_y_round_16_avx2(res + 2);
      33             :     xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
      34             : }
      35             : static void convolve_2d_sr_hor_2tap_avx2(
      36             :     const uint8_t *const src, const int32_t src_stride, const int32_t w,
      37             :     const int32_t h, const InterpFilterParams *const filter_params_x,
      38             :     const int32_t subpel_x_q4, int16_t *const im_block) {
      39             :     const uint8_t *src_ptr = src;
      40             :     int32_t y = h;
      41             :     int16_t *im = im_block;
      42             :     __m128i coeffs_128[4];
      43             :     __m256i coeffs_256[4];
      44             : 
      45             :     if (w <= 8) {
      46             :         prepare_half_coeffs_2tap_ssse3(
      47             :             filter_params_x, subpel_x_q4, coeffs_128);
      48             : 
      49             :         if (w == 2) {
      50             :             do {
      51             :                 const __m128i r =
      52             :                     x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
      53             :                 xy_x_round_store_2x2_sse2(r, im);
      54             :                 src_ptr += 2 * src_stride;
      55             :                 im += 2 * 2;
      56             :                 y -= 2;
      57             :             } while (y);
      58             :         }
      59             :         else if (w == 4) {
      60             :             do {
      61             :                 const __m128i r =
      62             :                     x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
      63             :                 xy_x_round_store_4x2_sse2(r, im);
      64             :                 src_ptr += 2 * src_stride;
      65             :                 im += 2 * 4;
      66             :                 y -= 2;
      67             :             } while (y);
      68             :         }
      69             :         else {
      70             :             assert(w == 8);
      71             : 
      72             :             do {
      73             :                 __m128i r[2];
      74             : 
      75             :                 x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, r);
      76             :                 xy_x_round_store_8x2_sse2(r, im);
      77             :                 src_ptr += 2 * src_stride;
      78             :                 im += 2 * 8;
      79             :                 y -= 2;
      80             :             } while (y);
      81             :         }
      82             :     }
      83             :     else {
      84             :         prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
      85             : 
      86             :         if (w == 16) {
      87             :             do {
      88             :                 __m256i r[2];
      89             : 
      90             :                 x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
      91             :                 xy_x_round_store_32_avx2(r, im);
      92             :                 src_ptr += 2 * src_stride;
      93             :                 im += 2 * 16;
      94             :                 y -= 2;
      95             :             } while (y);
      96             :         }
      97             :         else if (w == 32) {
      98             :             do {
      99             :                 xy_x_2tap_32_avx2(src_ptr, coeffs_256, im);
     100             :                 src_ptr += src_stride;
     101             :                 im += 32;
     102             :             } while (--y);
     103             :         }
     104             :         else if (w == 64) {
     105             :             do {
     106             :                 xy_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, im + 0 * 32);
     107             :                 xy_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, im + 1 * 32);
     108             :                 src_ptr += src_stride;
     109             :                 im += 64;
     110             :             } while (--y);
     111             :         }
     112             :         else {
     113             :             assert(w == 128);
     114             : 
     115             :             do {
     116             :                 xy_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, im + 0 * 32);
     117             :                 xy_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, im + 1 * 32);
     118             :                 xy_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, im + 2 * 32);
     119             :                 xy_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, im + 3 * 32);
     120             :                 src_ptr += src_stride;
     121             :                 im += 128;
     122             :             } while (--y);
     123             :         }
     124             :     }
     125             : }
     126             : 
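/*
 * For orientation: the convolve_2d_sr_hor_* functions fill the 16-bit
 * im_block buffer that the convolve_2d_sr_ver_* functions further below
 * consume. A rough (not bit-exact) scalar sketch of the 2-tap horizontal
 * stage follows; the tap values, the ROUND0_SHIFT constant and the function
 * name are illustrative assumptions, not definitions from this file.
 */
static void sketch_hor_2tap_c(const uint8_t *src, int32_t src_stride,
                              int32_t w, int32_t h,
                              const int16_t taps[2] /* from filter_params_x */,
                              int16_t *im_block) {
    const int32_t ROUND0_SHIFT = 3; /* assumed first-stage rounding amount */
    for (int32_t y = 0; y < h; y++) {
        for (int32_t x = 0; x < w; x++) {
            const int32_t sum = src[x] * taps[0] + src[x + 1] * taps[1];
            /* round to nearest and narrow to the 16-bit intermediate */
            im_block[x] =
                (int16_t)((sum + (1 << (ROUND0_SHIFT - 1))) >> ROUND0_SHIFT);
        }
        src += src_stride;
        im_block += w;
    }
}
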
     127             : static void convolve_2d_sr_hor_4tap_avx2(
     128             :     const uint8_t *const src, const int32_t src_stride, const int32_t w,
     129             :     const int32_t h, const InterpFilterParams *const filter_params_x,
     130             :     const int32_t subpel_x_q4, int16_t *const im_block) {
     131             :     const uint8_t *src_ptr = src - 1;
     132             :     int32_t y = h;
     133             :     int16_t *im = im_block;
     134             :     __m128i coeffs_128[4];
     135             : 
     136             :     prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
     137             : 
     138             :     if (w == 2) {
     139             :         do {
     140             :             const __m128i r =
     141             :                 x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
     142             :             xy_x_round_store_2x2_sse2(r, im);
     143             :             src_ptr += 2 * src_stride;
     144             :             im += 2 * 2;
     145             :             y -= 2;
     146             :         } while (y);
     147             :     }
     148             :     else {
     149             :         assert(w == 4);
     150             : 
     151             :         do {
     152             :             const __m128i r =
     153             :                 x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
     154             :             xy_x_round_store_4x2_sse2(r, im);
     155             :             src_ptr += 2 * src_stride;
     156             :             im += 2 * 4;
     157             :             y -= 2;
     158             :         } while (y);
     159             :     }
     160             : }
     161             : 
     162             : static void convolve_2d_sr_hor_6tap_avx2(
     163             :     const uint8_t *const src, const int32_t src_stride, const int32_t w,
     164             :     const int32_t h, const InterpFilterParams *const filter_params_x,
     165             :     const int32_t subpel_x_q4, int16_t *const im_block) {
     166             :     const uint8_t *src_ptr = src - 2;
     167             :     int32_t y = h;
     168             :     int16_t *im = im_block;
     169             :     __m256i coeffs_256[4], filt_256[4];
     170             : 
     171             :     filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
     172             :     filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
     173             :     filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
     174             : 
     175             :     prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
     176             : 
     177             :     if (w == 8) {
     178             :         do {
     179             :             const __m256i res = x_convolve_6tap_8x2_avx2(
     180             :                 src_ptr, src_stride, coeffs_256, filt_256);
     181             :             xy_x_round_store_8x2_avx2(res, im);
     182             :             src_ptr += 2 * src_stride;
     183             :             im += 2 * 8;
     184             :             y -= 2;
     185             :         } while (y);
     186             :     }
     187             :     else if (w == 16) {
     188             :         do {
     189             :             __m256i r[2];
     190             : 
     191             :             x_convolve_6tap_16x2_avx2(
     192             :                 src_ptr, src_stride, coeffs_256, filt_256, r);
     193             :             xy_x_round_store_32_avx2(r, im);
     194             :             src_ptr += 2 * src_stride;
     195             :             im += 2 * 16;
     196             :             y -= 2;
     197             :         } while (y);
     198             :     }
     199             :     else if (w == 32) {
     200             :         do {
     201             :             xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     202             :             src_ptr += src_stride;
     203             :             im += 32;
     204             :         } while (--y);
     205             :     }
     206             :     else if (w == 64) {
     207             :         do {
     208             :             xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     209             :             xy_x_6tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
     210             :             src_ptr += src_stride;
     211             :             im += 64;
     212             :         } while (--y);
     213             :     }
     214             :     else {
     215             :         assert(w == 128);
     216             : 
     217             :         do {
     218             :             xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     219             :             xy_x_6tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
     220             :             xy_x_6tap_32_avx2(src_ptr + 64, 16, coeffs_256, filt_256, im + 64);
     221             :             xy_x_6tap_32_avx2(src_ptr + 96, 16, coeffs_256, filt_256, im + 96);
     222             :             src_ptr += src_stride;
     223             :             im += 128;
     224             :         } while (--y);
     225             :     }
     226             : }
     227             : 
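/*
 * filt1..filt3_global_avx2 above are loaded as byte-shuffle controls for the
 * x_convolve_6tap_* helpers. The usual scheme (assumed here, since the helper
 * bodies live in convolve_avx2.h) pairs adjacent source bytes with
 * _mm256_shuffle_epi8 so that _mm256_maddubs_epi16 can apply two filter taps
 * per pair. A minimal shuffle of that shape; the pattern bytes below are an
 * illustrative example, not the contents of filt1_global_avx2.
 */
static __m256i sketch_pair_adjacent_bytes(const __m256i row) {
    const __m256i pair_pattern = _mm256_setr_epi8(
        0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
        0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
    /* within each 128-bit lane: out = row[0], row[1], row[1], row[2], ... */
    return _mm256_shuffle_epi8(row, pair_pattern);
}
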
     228             : static void convolve_2d_sr_hor_8tap_avx2(
     229             :     const uint8_t *const src, const int32_t src_stride, const int32_t w,
     230             :     const int32_t h, const InterpFilterParams *const filter_params_x,
     231             :     const int32_t subpel_x_q4, int16_t *const im_block) {
     232             :     const uint8_t *src_ptr = src - 3;
     233             :     int32_t y = h;
     234             :     int16_t *im = im_block;
     235             :     __m256i coeffs_256[4], filt_256[4];
     236             : 
     237             :     filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
     238             :     filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
     239             :     filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
     240             :     filt_256[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
     241             : 
     242             :     prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
     243             : 
     244             :     if (w == 8) {
     245             :         do {
     246             :             const __m256i res = x_convolve_8tap_8x2_avx2(
     247             :                 src_ptr, src_stride, coeffs_256, filt_256);
     248             :             xy_x_round_store_8x2_avx2(res, im);
     249             :             src_ptr += 2 * src_stride;
     250             :             im += 2 * 8;
     251             :             y -= 2;
     252             :         } while (y);
     253             :     }
     254             :     else if (w == 16) {
     255             :         do {
     256             :             __m256i r[2];
     257             : 
     258             :             x_convolve_8tap_16x2_avx2(
     259             :                 src_ptr, src_stride, coeffs_256, filt_256, r);
     260             :             xy_x_round_store_32_avx2(r, im);
     261             :             src_ptr += 2 * src_stride;
     262             :             im += 2 * 16;
     263             :             y -= 2;
     264             :         } while (y);
     265             :     }
     266             :     else if (w == 32) {
     267             :         do {
     268             :             xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     269             :             src_ptr += src_stride;
     270             :             im += 32;
     271             :         } while (--y);
     272             :     }
     273             :     else if (w == 64) {
     274             :         do {
     275             :             xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     276             :             xy_x_8tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
     277             :             src_ptr += src_stride;
     278             :             im += 64;
     279             :         } while (--y);
     280             :     }
     281             :     else {
     282             :         assert(w == 128);
     283             : 
     284             :         do {
     285             :             xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
     286             :             xy_x_8tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
     287             :             xy_x_8tap_32_avx2(src_ptr + 64, 16, coeffs_256, filt_256, im + 64);
     288             :             xy_x_8tap_32_avx2(src_ptr + 96, 16, coeffs_256, filt_256, im + 96);
     289             :             src_ptr += src_stride;
     290             :             im += 128;
     291             :         } while (--y);
     292             :     }
     293             : }
     294             : 
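/*
 * The src_ptr offsets used above (src for 2-tap, src - 1 for 4-tap, src - 2
 * for 6-tap, src - 3 for 8-tap) all follow the same centering rule: an N-tap
 * horizontal filter reads N consecutive pixels starting N/2 - 1 positions to
 * the left of the output column. A sketch of that indexing; the tap values
 * and the function name are placeholders.
 */
static int32_t sketch_hor_ntap_at(const uint8_t *src, int32_t x,
                                  const int16_t *taps, int32_t ntaps) {
    const uint8_t *p = src + x - (ntaps / 2 - 1); /* leftmost sample of window */
    int32_t sum = 0;
    for (int32_t k = 0; k < ntaps; k++)
        sum += p[k] * taps[k];
    return sum; /* rounded/shifted as in the 2-tap sketch above */
}
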
     295             : static void convolve_2d_sr_ver_2tap_avx2(uint8_t *dst, int32_t dst_stride,
     296             :     int32_t w, int32_t h,
     297             :     InterpFilterParams *filter_params_y,
     298             :     const int32_t subpel_y_q4,
     299             :     int16_t *im_block) {
     300             :     int32_t y = h;
     301             :     int16_t *im = im_block;
     302             :     __m128i coeffs_128[4];
     303             :     __m256i coeffs_256[4];
     304             : 
     305             :     if (w <= 4) {
     306             :         prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
     307             : 
     308             :         if (w == 2) {
     309             :             __m128i s_32[2];
     310             : 
     311             :             s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
     312             : 
     313             :             do {
     314             :                 const __m128i res =
     315             :                     xy_y_convolve_2tap_2x2_sse2(im, s_32, coeffs_128);
     316             :                 xy_y_round_store_2x2_sse2(res, dst, dst_stride);
     317             :                 im += 2 * 2;
     318             :                 dst += 2 * dst_stride;
     319             :                 y -= 2;
     320             :             } while (y);
     321             :         }
     322             :         else {
     323             :             __m128i s_64[2], r[2];
     324             : 
     325             :             assert(w == 4);
     326             : 
     327             :             s_64[0] = _mm_loadl_epi64((__m128i *)im);
     328             : 
     329             :             do {
     330             :                 xy_y_convolve_2tap_4x2_sse2(im, s_64, coeffs_128, r);
     331             :                 r[0] = xy_y_round_sse2(r[0]);
     332             :                 r[1] = xy_y_round_sse2(r[1]);
     333             :                 const __m128i rr = _mm_packs_epi32(r[0], r[1]);
     334             :                 pack_store_4x2_sse2(rr, dst, dst_stride);
     335             :                 im += 2 * 4;
     336             :                 dst += 2 * dst_stride;
     337             :                 y -= 2;
     338             :             } while (y);
     339             :         }
     340             :     }
     341             :     else {
     342             :         prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
     343             : 
     344             :         if (w == 8) {
     345             :             __m128i s_128[2];
     346             :             __m256i r[2];
     347             : 
     348             :             s_128[0] = _mm_load_si128((__m128i *)im);
     349             : 
     350             :             do {
     351             :                 xy_y_convolve_2tap_8x2_avx2(im, s_128, coeffs_256, r);
     352             :                 xy_y_round_store_8x2_avx2(r, dst, dst_stride);
     353             :                 im += 2 * 8;
     354             :                 dst += 2 * dst_stride;
     355             :                 y -= 2;
     356             :             } while (y);
     357             :         }
     358             :         else if (w == 16) {
     359             :             __m256i s_256[2], r[4];
     360             : 
     361             :             s_256[0] = _mm256_load_si256((__m256i *)im);
     362             : 
     363             :             do {
     364             :                 xy_y_convolve_2tap_16x2_avx2(im, s_256, coeffs_256, r);
     365             :                 xy_y_round_store_16x2_avx2(r, dst, dst_stride);
     366             :                 im += 2 * 16;
     367             :                 dst += 2 * dst_stride;
     368             :                 y -= 2;
     369             :             } while (y);
     370             :         }
     371             :         else if (w == 32) {
     372             :             __m256i s_256[2][2];
     373             : 
     374             :             s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
     375             :             s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
     376             : 
     377             :             do {
     378             :                 xy_y_convolve_2tap_32_all_avx2(
     379             :                     im + 32, s_256[0], s_256[1], coeffs_256, dst);
     380             :                 xy_y_convolve_2tap_32_all_avx2(im + 2 * 32,
     381             :                     s_256[1],
     382             :                     s_256[0],
     383             :                     coeffs_256,
     384             :                     dst + dst_stride);
     385             :                 im += 2 * 32;
     386             :                 dst += 2 * dst_stride;
     387             :                 y -= 2;
     388             :             } while (y);
     389             :         }
     390             :         else if (w == 64) {
     391             :             __m256i s_256[2][4];
     392             : 
     393             :             s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
     394             :             s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
     395             :             s_256[0][2] = _mm256_load_si256((__m256i *)(im + 2 * 16));
     396             :             s_256[0][3] = _mm256_load_si256((__m256i *)(im + 3 * 16));
     397             : 
     398             :             do {
     399             :                 xy_y_convolve_2tap_32_all_avx2(
     400             :                     im + 64, s_256[0] + 0, s_256[1] + 0, coeffs_256, dst);
     401             :                 xy_y_convolve_2tap_32_all_avx2(
     402             :                     im + 96, s_256[0] + 2, s_256[1] + 2, coeffs_256, dst + 32);
     403             :                 im += 2 * 64;
     404             :                 xy_y_convolve_2tap_32_all_avx2(im,
     405             :                     s_256[1] + 0,
     406             :                     s_256[0] + 0,
     407             :                     coeffs_256,
     408             :                     dst + dst_stride);
     409             :                 xy_y_convolve_2tap_32_all_avx2(im + 32,
     410             :                     s_256[1] + 2,
     411             :                     s_256[0] + 2,
     412             :                     coeffs_256,
     413             :                     dst + dst_stride + 32);
     414             :                 dst += 2 * dst_stride;
     415             :                 y -= 2;
     416             :             } while (y);
     417             :         }
     418             :         else {
     419             :             __m256i s_256[2][8];
     420             : 
     421             :             assert(w == 128);
     422             : 
     423             :             load_16bit_8rows_avx2(im, 16, s_256[0]);
     424             : 
     425             :             do {
     426             :                 xy_y_convolve_2tap_32_all_avx2(
     427             :                     im + 128, s_256[0] + 0, s_256[1] + 0, coeffs_256, dst);
     428             :                 xy_y_convolve_2tap_32_all_avx2(im + 160,
     429             :                     s_256[0] + 2,
     430             :                     s_256[1] + 2,
     431             :                     coeffs_256,
     432             :                     dst + 1 * 32);
     433             :                 xy_y_convolve_2tap_32_all_avx2(im + 192,
     434             :                     s_256[0] + 4,
     435             :                     s_256[1] + 4,
     436             :                     coeffs_256,
     437             :                     dst + 2 * 32);
     438             :                 xy_y_convolve_2tap_32_all_avx2(im + 224,
     439             :                     s_256[0] + 6,
     440             :                     s_256[1] + 6,
     441             :                     coeffs_256,
     442             :                     dst + 3 * 32);
     443             :                 im += 2 * 128;
     444             :                 xy_y_convolve_2tap_32_all_avx2(im,
     445             :                     s_256[1] + 0,
     446             :                     s_256[0] + 0,
     447             :                     coeffs_256,
     448             :                     dst + dst_stride);
     449             :                 xy_y_convolve_2tap_32_all_avx2(im + 32,
     450             :                     s_256[1] + 2,
     451             :                     s_256[0] + 2,
     452             :                     coeffs_256,
     453             :                     dst + dst_stride + 1 * 32);
     454             :                 xy_y_convolve_2tap_32_all_avx2(im + 64,
     455             :                     s_256[1] + 4,
     456             :                     s_256[0] + 4,
     457             :                     coeffs_256,
     458             :                     dst + dst_stride + 2 * 32);
     459             :                 xy_y_convolve_2tap_32_all_avx2(im + 96,
     460             :                     s_256[1] + 6,
     461             :                     s_256[0] + 6,
     462             :                     coeffs_256,
     463             :                     dst + dst_stride + 3 * 32);
     464             :                 dst += 2 * dst_stride;
     465             :                 y -= 2;
     466             :             } while (y);
     467             :         }
     468             :     }
     469             : }
     470             : 
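/*
 * In the w >= 32 branches above, s_256[0] and s_256[1] ping-pong: the
 * registers filled as the "new" intermediate row of one output line are
 * passed back as the "old" row of the next, so each im_block row is loaded
 * only once. A scalar analogue of that rolling 2-tap vertical filter, with
 * placeholder tap/shift values and assuming im holds h + 1 rows of width w.
 */
static void sketch_ver_2tap_rolling_c(const int16_t *im, int32_t w, int32_t h,
                                      const int16_t taps[2], int32_t shift,
                                      uint8_t *dst, int32_t dst_stride) {
    for (int32_t x = 0; x < w; x++) {
        int32_t prev = im[x]; /* "old" sample, loaded once */
        for (int32_t y = 0; y < h; y++) {
            const int32_t curr = im[(y + 1) * w + x]; /* "new" sample */
            const int32_t sum = prev * taps[0] + curr * taps[1];
            int32_t v = (sum + (1 << (shift - 1))) >> shift;
            if (v < 0) v = 0;
            if (v > 255) v = 255;
            dst[y * dst_stride + x] = (uint8_t)v;
            prev = curr; /* roll: the new sample becomes the old one */
        }
    }
}
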
     471             : static void convolve_2d_sr_ver_2tap_half_avx2(
     472             :     uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h,
     473             :     InterpFilterParams *filter_params_y, const int32_t subpel_y_q4,
     474             :     int16_t *im_block) {
     475             :     int32_t y = h;
     476             :     int16_t *im = im_block;
     477             : 
     478             :     (void)filter_params_y;
     479             :     (void)subpel_y_q4;
     480             : 
     481             :     if (w == 2) {
     482             :         __m128i s_32[2];
     483             : 
     484             :         s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
     485             : 
     486             :         do {
     487             :             const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
     488             :             const __m128i r = xy_y_round_half_pel_sse2(res);
     489             :             pack_store_2x2_sse2(r, dst, dst_stride);
     490             :             im += 2 * 2;
     491             :             dst += 2 * dst_stride;
     492             :             y -= 2;
     493             :         } while (y);
     494             :     }
     495             :     else if (w == 4) {
     496             :         __m128i s_64[2];
     497             : 
     498             :         s_64[0] = _mm_loadl_epi64((__m128i *)im);
     499             : 
     500             :         do {
     501             :             const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
     502             :             const __m128i r = xy_y_round_half_pel_sse2(res);
     503             :             pack_store_4x2_sse2(r, dst, dst_stride);
     504             :             im += 2 * 4;
     505             :             dst += 2 * dst_stride;
     506             :             y -= 2;
     507             :         } while (y);
     508             :     }
     509             :     else if (w == 8) {
     510             :         __m128i s_128[2];
     511             : 
     512             :         s_128[0] = _mm_load_si128((__m128i *)im);
     513             : 
     514             :         do {
     515             :             const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
     516             :             const __m256i r = xy_y_round_half_pel_avx2(res);
     517             :             pack_store_8x2_avx2(r, dst, dst_stride);
     518             :             im += 2 * 8;
     519             :             dst += 2 * dst_stride;
     520             :             y -= 2;
     521             :         } while (y);
     522             :     }
     523             :     else if (w == 16) {
     524             :         __m256i s_256[2], r[2];
     525             : 
     526             :         s_256[0] = _mm256_load_si256((__m256i *)im);
     527             : 
     528             :         do {
     529             :             xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
     530             :             const __m256i r0 = xy_y_round_half_pel_avx2(r[0]);
     531             :             const __m256i r1 = xy_y_round_half_pel_avx2(r[1]);
     532             :             xy_y_pack_store_16x2_avx2(r0, r1, dst, dst_stride);
     533             :             im += 2 * 16;
     534             :             dst += 2 * dst_stride;
     535             :             y -= 2;
     536             :         } while (y);
     537             :     }
     538             :     else if (w == 32) {
     539             :         __m256i s_256[2][2];
     540             : 
     541             :         s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
     542             :         s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
     543             : 
     544             :         do {
     545             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     546             :                 im + 32, s_256[0], s_256[1], dst);
     547             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     548             :                 im + 2 * 32, s_256[1], s_256[0], dst + dst_stride);
     549             :             im += 2 * 32;
     550             :             dst += 2 * dst_stride;
     551             :             y -= 2;
     552             :         } while (y);
     553             :     }
     554             :     else if (w == 64) {
     555             :         __m256i s_256[2][4];
     556             : 
     557             :         s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
     558             :         s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
     559             :         s_256[0][2] = _mm256_load_si256((__m256i *)(im + 2 * 16));
     560             :         s_256[0][3] = _mm256_load_si256((__m256i *)(im + 3 * 16));
     561             : 
     562             :         do {
     563             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     564             :                 im + 64, s_256[0] + 0, s_256[1] + 0, dst);
     565             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     566             :                 im + 96, s_256[0] + 2, s_256[1] + 2, dst + 32);
     567             :             im += 2 * 64;
     568             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     569             :                 im, s_256[1] + 0, s_256[0] + 0, dst + dst_stride);
     570             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     571             :                 im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 32);
     572             :             dst += 2 * dst_stride;
     573             :             y -= 2;
     574             :         } while (y);
     575             :     }
     576             :     else {
     577             :         __m256i s_256[2][8];
     578             : 
     579             :         assert(w == 128);
     580             : 
     581             :         load_16bit_8rows_avx2(im, 16, s_256[0]);
     582             : 
     583             :         do {
     584             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     585             :                 im + 128, s_256[0] + 0, s_256[1] + 0, dst);
     586             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     587             :                 im + 160, s_256[0] + 2, s_256[1] + 2, dst + 1 * 32);
     588             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     589             :                 im + 192, s_256[0] + 4, s_256[1] + 4, dst + 2 * 32);
     590             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     591             :                 im + 224, s_256[0] + 6, s_256[1] + 6, dst + 3 * 32);
     592             :             im += 2 * 128;
     593             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     594             :                 im, s_256[1] + 0, s_256[0] + 0, dst + dst_stride);
     595             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     596             :                 im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 1 * 32);
     597             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     598             :                 im + 64, s_256[1] + 4, s_256[0] + 4, dst + dst_stride + 2 * 32);
     599             :             xy_y_convolve_2tap_half_pel_32_all_avx2(
     600             :                 im + 96, s_256[1] + 6, s_256[0] + 6, dst + dst_stride + 3 * 32);
     601             :             dst += 2 * dst_stride;
     602             :             y -= 2;
     603             :         } while (y);
     604             :     }
     605             : }
     606             : 
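/*
 * The half-pel variant above exists because at subpel position 8 the two
 * 2-tap coefficients are equal, so the filter reduces to a plain average and
 * no coefficients need to be prepared (hence the (void) casts of
 * filter_params_y and subpel_y_q4). A per-sample sketch, with an assumed
 * final rounding shift rather than the exact value used by the helpers.
 */
static uint8_t sketch_ver_2tap_half_pel_sample(int16_t above, int16_t below) {
    const int32_t shift = 5; /* placeholder rounding amount */
    int32_t v = (((int32_t)above + below) + (1 << (shift - 1))) >> shift;
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    return (uint8_t)v;
}
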
     607             : static void convolve_2d_sr_ver_4tap_avx2(uint8_t *dst, int32_t dst_stride,
     608             :     int32_t w, int32_t h,
     609             :     InterpFilterParams *filter_params_y,
     610             :     const int32_t subpel_y_q4,
     611             :     int16_t *im_block) {
     612             :     int32_t y = h;
     613             :     int16_t *im = im_block;
     614             :     __m128i coeffs_128[4];
     615             :     __m256i coeffs_256[4];
     616             : 
     617             :     if (w == 2) {
     618             :         __m128i s_32[4], ss_128[2];
     619             : 
     620             :         prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
     621             : 
     622             :         s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
     623             :         s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
     624             :         s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
     625             : 
     626             :         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
     627             :         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
     628             : 
     629             :         ss_128[0] = _mm_unpacklo_epi16(src01, src12);
     630             : 
     631             :         do {
     632             :             const __m128i res =
     633             :                 xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
     634             :             xy_y_round_store_2x2_sse2(res, dst, dst_stride);
     635             :             im += 2 * 2;
     636             :             dst += 2 * dst_stride;
     637             :             y -= 2;
     638             :         } while (y);
     639             :     }
     640             :     else {
     641             :         prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
     642             : 
     643             :         if (w == 4) {
     644             :             __m128i s_64[4];
     645             :             __m256i s_256[2], ss_256[2];
     646             : 
     647             :             s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
     648             :             s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
     649             :             s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
     650             : 
     651             :             // Load lines a and b. Line a to lower 128, line b to upper 128
     652             :             s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
     653             :             s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
     654             : 
     655             :             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
     656             : 
     657             :             do {
     658             :                 const __m256i res =
     659             :                     xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
     660             :                 xy_y_round_store_4x2_avx2(res, dst, dst_stride);
     661             :                 im += 2 * 4;
     662             :                 dst += 2 * dst_stride;
     663             :                 y -= 2;
     664             :             } while (y);
     665             :         }
     666             :         else if (w == 8) {
     667             :             __m256i s_256[4], r[2];
     668             : 
     669             :             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
     670             :             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
     671             : 
     672             :             if (subpel_y_q4 != 8) {
     673             :                 __m256i ss_256[4];
     674             : 
     675             :                 ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
     676             :                 ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
     677             : 
     678             :                 do {
     679             :                     xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
     680             :                     xy_y_round_store_8x2_avx2(r, dst, dst_stride);
     681             :                     im += 2 * 8;
     682             :                     dst += 2 * dst_stride;
     683             :                     y -= 2;
     684             :                 } while (y);
     685             :             }
     686             :             else {
     687             :                 do {
     688             :                     xy_y_convolve_4tap_8x2_half_pel_avx2(
     689             :                         im, coeffs_256, s_256, r);
     690             :                     xy_y_round_store_8x2_avx2(r, dst, dst_stride);
     691             :                     im += 2 * 8;
     692             :                     dst += 2 * dst_stride;
     693             :                     y -= 2;
     694             :                 } while (y);
     695             :             }
     696             :         }
     697             :         else if (w == 16) {
     698             :             __m256i s_256[5];
     699             : 
     700             :             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
     701             :             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
     702             :             s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
     703             : 
     704             :             if (subpel_y_q4 != 8) {
     705             :                 __m256i ss_256[4], tt_256[4], r[4];
     706             : 
     707             :                 ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
     708             :                 ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
     709             : 
     710             :                 tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
     711             :                 tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
     712             : 
     713             :                 do {
     714             :                     xy_y_convolve_4tap_16x2_avx2(
     715             :                         im, s_256, ss_256, tt_256, coeffs_256, r);
     716             :                     xy_y_round_store_16x2_avx2(r, dst, dst_stride);
     717             :                     im += 2 * 16;
     718             :                     dst += 2 * dst_stride;
     719             :                     y -= 2;
     720             :                 } while (y);
     721             :             }
     722             :             else {
     723             :                 __m256i r[4];
     724             : 
     725             :                 do {
     726             :                     xy_y_convolve_4tap_16x2_half_pelavx2(
     727             :                         im, s_256, coeffs_256, r);
     728             :                     xy_y_round_store_16x2_avx2(r, dst, dst_stride);
     729             :                     im += 2 * 16;
     730             :                     dst += 2 * dst_stride;
     731             :                     y -= 2;
     732             :                 } while (y);
     733             :             }
     734             :         }
     735             :         else {
      736             :             /* This is a special case for OBMC. According to the AV1 spec,
      737             :             the 4-tap filter is not used for widths greater than 16, but OBMC
      738             :             prediction of the above block halves the block height to w x (h/2);
      739             :             for example, a 32x8 block above is predicted here as 32x4. */
     740             :             int32_t x = 0;
     741             : 
     742             :             assert(!(w % 32));
     743             : 
     744             :             __m256i s_256[2][4], ss_256[2][4], tt_256[2][4], r0[4], r1[4];
     745             :             do {
     746             :                 const int16_t *s = im + x;
     747             :                 uint8_t *d = dst + x;
     748             : 
     749             :                 loadu_unpack_16bit_3rows_avx2(
     750             :                     s, w, s_256[0], ss_256[0], tt_256[0]);
     751             :                 loadu_unpack_16bit_3rows_avx2(
     752             :                     s + 16, w, s_256[1], ss_256[1], tt_256[1]);
     753             : 
     754             :                 y = h;
     755             :                 do {
     756             :                     xy_y_convolve_4tap_32x2_avx2(
     757             :                         s, w, s_256[0], ss_256[0], tt_256[0], coeffs_256, r0);
     758             :                     xy_y_convolve_4tap_32x2_avx2(s + 16, w, s_256[1],
     759             :                         ss_256[1], tt_256[1], coeffs_256, r1);
     760             : 
     761             :                     xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
     762             :                     xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
     763             : 
     764             :                     s += 2 * w;
     765             :                     d += 2 * dst_stride;
     766             :                     y -= 2;
     767             :                 } while (y);
     768             : 
     769             :                 x += 32;
     770             :             } while (x < w);
     771             :         }
     772             :     }
     773             : }
     774             : 
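/*
 * The "Load lines a and b" pattern used in the w == 4 path above (and in the
 * 6-/8-tap vertical kernels below) packs two adjacent 4-sample rows into one
 * 256-bit register, row N in the low lane and row N+1 in the high lane, so a
 * single unpack yields the vertically adjacent sample pairs for two output
 * rows at once. Minimal illustration; the function name is a placeholder and
 * im is assumed to hold at least 3 rows of 4 samples.
 */
static void sketch_pack_two_rows(const int16_t *im) {
    const __m128i row0 = _mm_loadl_epi64((const __m128i *)(im + 0 * 4));
    const __m128i row1 = _mm_loadl_epi64((const __m128i *)(im + 1 * 4));
    const __m128i row2 = _mm_loadl_epi64((const __m128i *)(im + 2 * 4));
    const __m256i a = _mm256_setr_m128i(row0, row1); /* rows 0 | 1 */
    const __m256i b = _mm256_setr_m128i(row1, row2); /* rows 1 | 2 */
    /* low lane: (row0, row1) sample pairs; high lane: (row1, row2) pairs */
    const __m256i pairs = _mm256_unpacklo_epi16(a, b);
    (void)pairs; /* would be fed to a 16-bit multiply-add with the vertical taps */
}
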
     775             : static void convolve_2d_sr_ver_6tap_avx2(uint8_t *dst, int32_t dst_stride,
     776             :     int32_t w, int32_t h,
     777             :     InterpFilterParams *filter_params_y,
     778             :     const int32_t subpel_y_q4,
     779             :     int16_t *im_block) {
     780             :     int32_t y;
     781             :     int16_t *im = im_block;
     782             :     __m128i coeffs_128[4];
     783             :     __m256i coeffs_256[4];
     784             : 
     785             :     if (w == 2) {
     786             :         __m128i s_32[6], ss_128[3];
     787             : 
     788             :         prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
     789             : 
     790             :         s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
     791             :         s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
     792             :         s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
     793             :         s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
     794             :         s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
     795             : 
     796             :         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
     797             :         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
     798             :         const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
     799             :         const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
     800             : 
     801             :         ss_128[0] = _mm_unpacklo_epi16(src01, src12);
     802             :         ss_128[1] = _mm_unpacklo_epi16(src23, src34);
     803             : 
     804             :         y = h;
     805             :         do {
     806             :             const __m128i res =
     807             :                 xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
     808             :             xy_y_round_store_2x2_sse2(res, dst, dst_stride);
     809             :             im += 2 * 2;
     810             :             dst += 2 * dst_stride;
     811             :             y -= 2;
     812             :         } while (y);
     813             :     }
     814             :     else {
     815             :         prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
     816             : 
     817             :         if (w == 4) {
     818             :             __m128i s_64[6];
     819             :             __m256i s_256[6], ss_256[3];
     820             : 
     821             :             s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
     822             :             s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
     823             :             s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
     824             :             s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
     825             :             s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
     826             : 
     827             :             // Load lines a and b. Line a to lower 128, line b to upper 128
     828             :             s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
     829             :             s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
     830             :             s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
     831             :             s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
     832             : 
     833             :             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
     834             :             ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
     835             : 
     836             :             y = h;
     837             :             do {
     838             :                 const __m256i res =
     839             :                     xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
     840             :                 xy_y_round_store_4x2_avx2(res, dst, dst_stride);
     841             :                 im += 2 * 4;
     842             :                 dst += 2 * dst_stride;
     843             :                 y -= 2;
     844             :             } while (y);
     845             :         }
     846             :         else if (w == 8) {
     847             :             __m256i s_256[6], r[2];
     848             : 
     849             :             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
     850             :             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
     851             :             s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
     852             :             s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
     853             :             y = h;
     854             : 
     855             :             if (subpel_y_q4 != 8) {
     856             :                 __m256i ss_256[6];
     857             : 
     858             :                 ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
     859             :                 ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
     860             : 
     861             :                 ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
     862             :                 ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
     863             : 
     864             :                 do {
     865             :                     xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
     866             :                     xy_y_round_store_8x2_avx2(r, dst, dst_stride);
     867             :                     im += 2 * 8;
     868             :                     dst += 2 * dst_stride;
     869             :                     y -= 2;
     870             :                 } while (y);
     871             :             }
     872             :             else {
     873             :                 do {
     874             :                     xy_y_convolve_6tap_8x2_half_pel_avx2(
     875             :                         im, coeffs_256, s_256, r);
     876             :                     xy_y_round_store_8x2_avx2(r, dst, dst_stride);
     877             :                     im += 2 * 8;
     878             :                     dst += 2 * dst_stride;
     879             :                     y -= 2;
     880             :                 } while (y);
     881             :             }
     882             :         }
     883             :         else if (w == 16) {
     884             :             __m256i s_256[6];
     885             : 
     886             :             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
     887             :             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
     888             :             s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
     889             :             s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
     890             :             s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
     891             :             y = h;
     892             : 
     893             :             if (subpel_y_q4 != 8) {
     894             :                 __m256i ss_256[6], tt_256[6], r[4];
     895             : 
     896             :                 ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
     897             :                 ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
     898             :                 ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
     899             :                 ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
     900             : 
     901             :                 tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
     902             :                 tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
     903             :                 tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
     904             :                 tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
     905             : 
     906             :                 do {
     907             :                     xy_y_convolve_6tap_16x2_avx2(
     908             :                         im, 16, s_256, ss_256, tt_256, coeffs_256, r);
     909             :                     xy_y_round_store_16x2_avx2(r, dst, dst_stride);
     910             :                     im += 2 * 16;
     911             :                     dst += 2 * dst_stride;
     912             :                     y -= 2;
     913             :                 } while (y);
     914             :             }
     915             :             else {
     916             :                 __m256i ss_256[4], r[4];
     917             : 
     918             :                 do {
     919             :                     xy_y_convolve_6tap_16x2_half_pel_avx2(
     920             :                         im, 16, s_256, ss_256, coeffs_256, r);
     921             :                     xy_y_round_store_16x2_avx2(r, dst, dst_stride);
     922             : 
     923             :                     im += 2 * 16;
     924             :                     dst += 2 * dst_stride;
     925             :                     y -= 2;
     926             :                 } while (y);
     927             :             }
     928             :         }
     929             :         else {
     930             :             int32_t x = 0;
     931             : 
     932             :             assert(!(w % 32));
     933             : 
     934             :             __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];
     935             : 
     936             :             do {
     937             :                 const int16_t *s = im + x;
     938             :                 uint8_t *d = dst + x;
     939             : 
     940             :                 loadu_unpack_16bit_5rows_avx2(
     941             :                     s, w, s_256[0], ss_256[0], tt_256[0]);
     942             :                 loadu_unpack_16bit_5rows_avx2(
     943             :                     s + 16, w, s_256[1], ss_256[1], tt_256[1]);
     944             : 
     945             :                 y = h;
     946             :                 do {
     947             :                     xy_y_convolve_6tap_16x2_avx2(
     948             :                         s, w, s_256[0], ss_256[0], tt_256[0], coeffs_256, r0);
     949             :                     xy_y_convolve_6tap_16x2_avx2(s + 16,
     950             :                         w,
     951             :                         s_256[1],
     952             :                         ss_256[1],
     953             :                         tt_256[1],
     954             :                         coeffs_256,
     955             :                         r1);
     956             : 
     957             :                     xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
     958             :                     xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
     959             : 
     960             :                     s += 2 * w;
     961             :                     d += 2 * dst_stride;
     962             :                     y -= 2;
     963             :                 } while (y);
     964             : 
     965             :                 x += 32;
     966             :             } while (x < w);
     967             :         }
     968             :     }
     969             : }
     970             : 
     971             : static void convolve_2d_sr_ver_8tap_avx2(uint8_t *dst, int32_t dst_stride,
     972             :     int32_t w, int32_t h,
     973             :     InterpFilterParams *filter_params_y,
     974             :     const int32_t subpel_y_q4,
     975             :     int16_t *im_block) {
     976             :     int32_t y;
     977             :     int16_t *im = im_block;
     978             :     __m128i coeffs_128[4];
     979             :     __m256i coeffs_256[4];
     980             : 
     981             :     if (w == 2) {
     982             :         __m128i s_32[8], ss_128[4];
     983             : 
     984             :         prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
     985             : 
     986             :         s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
     987             :         s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
     988             :         s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
     989             :         s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
     990             :         s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
     991             :         s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
     992             :         s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));
     993             : 
     994             :         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
     995             :         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
     996             :         const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
     997             :         const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
     998             :         const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
     999             :         const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
    1000             : 
    1001             :         ss_128[0] = _mm_unpacklo_epi16(src01, src12);
    1002             :         ss_128[1] = _mm_unpacklo_epi16(src23, src34);
    1003             :         ss_128[2] = _mm_unpacklo_epi16(src45, src56);
    1004             : 
    1005             :         y = h;
    1006             :         do {
    1007             :             const __m128i res =
    1008             :                 xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
    1009             :             xy_y_round_store_2x2_sse2(res, dst, dst_stride);
    1010             :             im += 2 * 2;
    1011             :             dst += 2 * dst_stride;
    1012             :             y -= 2;
    1013             :         } while (y);
    1014             :     }
    1015             :     else {
    1016             :         prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
    1017             : 
    1018             :         if (w == 4) {
    1019             :             __m128i s_64[8];
    1020             :             __m256i s_256[8], ss_256[4];
    1021             : 
    1022             :             s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
    1023             :             s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
    1024             :             s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
    1025             :             s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
    1026             :             s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
    1027             :             s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
    1028             :             s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));
    1029             : 
     1030             :             // Load lines a and b: line a goes into the lower 128 bits, line b into the upper 128 bits
    1031             :             s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
    1032             :             s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
    1033             :             s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
    1034             :             s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
    1035             :             s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
    1036             :             s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);
    1037             : 
    1038             :             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
    1039             :             ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
    1040             :             ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
    1041             : 
    1042             :             y = h;
    1043             :             do {
    1044             :                 const __m256i res =
    1045             :                     xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
    1046             :                 xy_y_round_store_4x2_avx2(res, dst, dst_stride);
    1047             :                 im += 2 * 4;
    1048             :                 dst += 2 * dst_stride;
    1049             :                 y -= 2;
    1050             :             } while (y);
    1051             :         }
    1052             :         else if (w == 8) {
    1053             :             __m256i s_256[8], r[2];
    1054             : 
    1055             :             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
    1056             :             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
    1057             :             s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
    1058             :             s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
    1059             :             s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
    1060             :             s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
    1061             :             y = h;
    1062             : 
    1063             :             if (subpel_y_q4 != 8) {
    1064             :                 __m256i ss_256[8];
    1065             : 
    1066             :                 convolve_8tap_unapck_avx2(s_256, ss_256);
    1067             : 
    1068             :                 do {
    1069             :                     xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
    1070             :                     xy_y_round_store_8x2_avx2(r, dst, dst_stride);
    1071             :                     im += 2 * 8;
    1072             :                     dst += 2 * dst_stride;
    1073             :                     y -= 2;
    1074             :                 } while (y);
    1075             :             }
    1076             :             else {
    1077             :                 do {
    1078             :                     xy_y_convolve_8tap_8x2_half_pel_avx2(
    1079             :                         im, coeffs_256, s_256, r);
    1080             :                     xy_y_round_store_8x2_avx2(r, dst, dst_stride);
    1081             :                     im += 2 * 8;
    1082             :                     dst += 2 * dst_stride;
    1083             :                     y -= 2;
    1084             :                 } while (y);
    1085             :             }
    1086             :         }
    1087             :         else if (w == 16) {
    1088             :             __m256i s_256[8], r[4];
    1089             : 
    1090             :             load_16bit_7rows_avx2(im, 16, s_256);
    1091             :             y = h;
    1092             : 
    1093             :             if (subpel_y_q4 != 8) {
    1094             :                 __m256i ss_256[8], tt_256[8];
    1095             : 
    1096             :                 convolve_8tap_unapck_avx2(s_256, ss_256);
    1097             :                 convolve_8tap_unapck_avx2(s_256 + 1, tt_256);
    1098             : 
    1099             :                 do {
    1100             :                     xy_y_convolve_8tap_16x2_avx2(
    1101             :                         im, 16, coeffs_256, s_256, ss_256, tt_256, r);
    1102             :                     xy_y_round_store_16x2_avx2(r, dst, dst_stride);
    1103             : 
    1104             :                     im += 2 * 16;
    1105             :                     dst += 2 * dst_stride;
    1106             :                     y -= 2;
    1107             :                 } while (y);
    1108             :             }
    1109             :             else {
    1110             :                 do {
    1111             :                     xy_y_convolve_8tap_16x2_half_pel_avx2(
    1112             :                         im, 16, coeffs_256, s_256, r);
    1113             :                     xy_y_round_store_16x2_avx2(r, dst, dst_stride);
    1114             : 
    1115             :                     im += 2 * 16;
    1116             :                     dst += 2 * dst_stride;
    1117             :                     y -= 2;
    1118             :                 } while (y);
    1119             :             }
    1120             :         }
    1121             :         else {
    1122             :             int32_t x = 0;
    1123             :             __m256i s_256[2][8], r0[4], r1[4];
    1124             : 
    1125             :             assert(!(w % 32));
    1126             : 
    1127             :             __m256i ss_256[2][8], tt_256[2][8];
    1128             : 
    1129             :             do {
    1130             :                 const int16_t *s = im + x;
    1131             :                 uint8_t *d = dst + x;
    1132             : 
    1133             :                 load_16bit_7rows_avx2(s, w, s_256[0]);
    1134             :                 convolve_8tap_unapck_avx2(s_256[0], ss_256[0]);
    1135             :                 convolve_8tap_unapck_avx2(s_256[0] + 1, tt_256[0]);
    1136             : 
    1137             :                 load_16bit_7rows_avx2(s + 16, w, s_256[1]);
    1138             :                 convolve_8tap_unapck_avx2(s_256[1], ss_256[1]);
    1139             :                 convolve_8tap_unapck_avx2(s_256[1] + 1, tt_256[1]);
    1140             : 
    1141             :                 y = h;
    1142             :                 do {
    1143             :                     xy_y_convolve_8tap_16x2_avx2(
    1144             :                         s, w, coeffs_256, s_256[0], ss_256[0], tt_256[0], r0);
    1145             :                     xy_y_convolve_8tap_16x2_avx2(s + 16,
    1146             :                         w,
    1147             :                         coeffs_256,
    1148             :                         s_256[1],
    1149             :                         ss_256[1],
    1150             :                         tt_256[1],
    1151             :                         r1);
    1152             :                     xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
    1153             :                     xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
    1154             : 
    1155             :                     s += 2 * w;
    1156             :                     d += 2 * dst_stride;
    1157             :                     y -= 2;
    1158             :                 } while (y);
    1159             : 
    1160             :                 x += 32;
    1161             :             } while (x < w);
    1162             :         }
    1163             :     }
    1164             : }
    1165             : 
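/*
 * Editorial sketch (not part of the covered source): scalar reference for one
 * output sample of the 8-tap vertical pass above. `im` points into the 16-bit
 * intermediate block written by the horizontal pass and `taps` holds the
 * eight vertical coefficients; rounding, shifting and clipping are left to
 * the xy_y_round_store_* helpers, as in the AVX2 code. Names are illustrative.
 */
#include <stdint.h>

static int32_t ver_8tap_sample_sketch(const int16_t *im, int32_t im_stride,
                                      const int16_t taps[8]) {
    int32_t sum = 0;
    for (int32_t k = 0; k < 8; ++k)
        sum += (int32_t)taps[k] * (int32_t)im[k * im_stride];
    return sum;
}

/*
 * The subpel_y_q4 == 8 branches exist because the half-pel filters are
 * symmetric, so mirrored rows can be pre-added and the number of multiplies
 * halved -- which is what the ..._half_pel_avx2 helpers appear to exploit.
 */
static int32_t ver_8tap_half_pel_sample_sketch(const int16_t *im,
                                               int32_t im_stride,
                                               const int16_t taps[4]) {
    int32_t sum = 0;
    for (int32_t k = 0; k < 4; ++k)
        sum += (int32_t)taps[k] *
               ((int32_t)im[k * im_stride] + (int32_t)im[(7 - k) * im_stride]);
    return sum;
}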
    1166             : typedef void(*convolve_2d_sr_hor_tap_func)(
    1167             :     const uint8_t *const src, const int32_t src_stride, const int32_t w,
    1168             :     const int32_t h, const InterpFilterParams *const filter_params_x,
    1169             :     const int32_t subpel_x_q4, int16_t *const im_block);
    1170             : 
    1171             : typedef void(*convolve_2d_sr_ver_tap_func)(uint8_t *dst, int32_t dst_stride,
    1172             :     int32_t w, int32_t h,
    1173             :     InterpFilterParams *filter_params_y,
    1174             :     const int32_t subpel_y_q4,
    1175             :     int16_t *im_block);
    1176             : #endif
    1177             : #if OBMC_CONVOLVE
    1178             : 
    1179   155088000 : void eb_av1_convolve_2d_sr_avx2(const uint8_t *src, int32_t src_stride,
    1180             :     uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h,
    1181             :     InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y,
    1182             :     const int32_t subpel_x_qn, const int32_t subpel_y_qn,
    1183             :     ConvolveParams *conv_params) {
    1184   155088000 :     const int32_t bd = 8;
    1185   155088000 :     const int32_t h_tap = get_convolve_tap(filter_params_x->filter_ptr);
    1186   154954000 :     const int32_t v_tap = get_convolve_tap(filter_params_y->filter_ptr);
    1187   155330000 :     int32_t im_stride = 8;
    1188             :     int32_t i;
    1189             :     DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
    1190   155330000 :     const int32_t bits =
    1191   155330000 :         FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    1192   155330000 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    1193             : 
    1194   155330000 :     assert(conv_params->round_0 > 0);
    1195             : 
    1196   310660000 :     const __m256i round_const_h = _mm256_set1_epi16(
    1197   155330000 :         ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
    1198   155330000 :     const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
    1199             : 
    1200   310660000 :     const __m256i sum_round_v = _mm256_set1_epi32(
    1201   155330000 :         (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
    1202   155330000 :     const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
    1203             : 
    1204   155330000 :     const __m256i round_const_v = _mm256_set1_epi32(
    1205   155330000 :         ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
    1206   155330000 :         ((1 << (offset_bits - conv_params->round_1)) >> 1));
    1207   155330000 :     const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
    1208             : 
    1209             :     __m256i filt[4], coeffs_h[4], coeffs_v[4];
    1210             : 
    1211   155330000 :     filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    1212   155330000 :     filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
    1213             : 
    1214   155330000 :     prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_qn, coeffs_h);
    1215   155783000 :     prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_qn, coeffs_v);
    1216             : 
    1217   155622000 :     if (h_tap == 2) {
    1218    20136600 :         int32_t im_h = h + filter_params_y->taps - 1;
    1219    20136600 :         const int32_t fo_vert = filter_params_y->taps / 2 - 1;
    1220    20136600 :         const int32_t fo_horiz = 0;
    1221    20136600 :         const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
    1222             : 
    1223    20136600 :         prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_qn, coeffs_h);
    1224             : 
    1225    20147300 :         if (v_tap == 2) {
    1226    20147300 :             const int16_t *const t_block = im_block + 3 * im_stride;
    1227    20147300 :             prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_qn, coeffs_v);
    1228    59977100 :             for (int32_t j = 0; j < w; j += 8) {
    1229  2634610000 :                 CONVOLVE_SR_HORIZONTAL_FILTER_2TAP;
    1230  5943740000 :                 CONVOLVE_SR_VERTICAL_FILTER_2TAP;
    1231             :             }
    1232             :         }
    1233           0 :         else if (v_tap == 4) {
    1234           0 :             const int16_t *const t_block = im_block + 2 * im_stride;
    1235           0 :             for (int32_t j = 0; j < w; j += 8) {
    1236           0 :                 CONVOLVE_SR_HORIZONTAL_FILTER_2TAP;
    1237           0 :                 CONVOLVE_SR_VERTICAL_FILTER_4TAP;
    1238             :             }
    1239             :         }
    1240             :         else {
    1241           0 :             const int16_t *const t_block = im_block;
    1242           0 :             for (int32_t j = 0; j < w; j += 8) {
    1243           0 :                 CONVOLVE_SR_HORIZONTAL_FILTER_2TAP;
    1244           0 :                 CONVOLVE_SR_VERTICAL_FILTER_8TAP;
    1245             :             }
    1246             :         }
    1247             :     }
    1248   135486000 :     else if (v_tap == 2) {
    1249           0 :         int32_t im_h = h + 3;
    1250           0 :         const int32_t fo_vert = 0;
    1251           0 :         const int16_t *const t_block = im_block;
    1252             : 
    1253           0 :         prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_qn, coeffs_v);
    1254           0 :         filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    1255           0 :         filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
    1256             : 
    1257           0 :         if (h_tap == 4) {
    1258           0 :             const int32_t fo_horiz = 1;
    1259           0 :             const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
    1260           0 :             for (int32_t j = 0; j < w; j += 8) {
    1261           0 :                 CONVOLVE_SR_HORIZONTAL_FILTER_4TAP;
    1262           0 :                 CONVOLVE_SR_VERTICAL_FILTER_2TAP;
    1263             :             }
    1264             :         }
    1265             :         else {
    1266           0 :             const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
    1267           0 :             const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
    1268           0 :             for (int32_t j = 0; j < w; j += 8) {
    1269           0 :                 CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
    1270           0 :                 CONVOLVE_SR_VERTICAL_FILTER_2TAP;
    1271             :             }
    1272             :         }
    1273             :     }
    1274   135486000 :     else if (h_tap == 4) {
    1275    13616900 :         int32_t im_h = h + filter_params_y->taps - 1;
    1276    13616900 :         const int32_t fo_vert = filter_params_y->taps / 2 - 1;
    1277    13616900 :         const int32_t fo_horiz = 1;
    1278    13616900 :         const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
    1279    13616900 :         const int16_t *const t_block = im_block;
    1280             : 
    1281    27211000 :         for (int32_t j = 0; j < w; j += 8) {
    1282   646396000 :             CONVOLVE_SR_HORIZONTAL_FILTER_4TAP;
    1283  1291200000 :             CONVOLVE_SR_VERTICAL_FILTER_8TAP;
    1284             :         }
    1285             :     }
    1286   121869000 :     else if (v_tap == 4) {
    1287    10999300 :         int32_t im_h = h + 3;
    1288    10999300 :         const int32_t fo_vert = 1;
    1289    10999300 :         const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
    1290    10999300 :         const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
    1291    10999300 :         const int16_t *const t_block = im_block;
    1292             : 
    1293    10999300 :         filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    1294    10999300 :         filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
    1295             : 
    1296    28684500 :         for (int32_t j = 0; j < w; j += 8) {
    1297   352315000 :             CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
    1298   703226000 :             CONVOLVE_SR_VERTICAL_FILTER_4TAP;
    1299             :         }
    1300             :     }
    1301             :     else {
    1302             :         int32_t j;
    1303   110869000 :         int32_t im_h = h + filter_params_y->taps - 1;
    1304   110869000 :         const int32_t fo_vert = filter_params_y->taps / 2 - 1;
    1305   110869000 :         const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
    1306   110869000 :         const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
    1307   110869000 :         const int16_t *const t_block = im_block;
    1308             : 
    1309   110869000 :         filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    1310   110869000 :         filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
    1311             : 
    1312   377468000 :         for (j = 0; j < w; j += 8) {
    1313 15005000000 :             CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
    1314 37303400000 :             CONVOLVE_SR_VERTICAL_FILTER_8TAP;
    1315             :         }
    1316             :     }
    1317   188210000 : }
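/*
 * Editorial sketch (not part of the covered source): scalar equivalent of the
 * rounding implemented by round_const_h/round_shift_h and
 * sum_round_v/round_const_v/round_shift_v above, written for full-precision
 * coefficients (the AVX2 horizontal pass works with halved coefficients,
 * which is why it shifts by round_0 - 1). Assumes bd == 8 and
 * FILTER_BITS == 7, matching the constants used in the function. Names are
 * illustrative only.
 */
#include <stdint.h>

static int32_t round_power_of_two_sketch(int32_t v, int32_t n) {
    return (v + ((1 << n) >> 1)) >> n;  /* also correct for n == 0 */
}

static int16_t hor_round_sketch(int32_t sum, int32_t round_0) {
    /* sum: raw horizontal dot product of 8-bit source pixels */
    const int32_t bd = 8, filter_bits = 7;
    return (int16_t)round_power_of_two_sketch(
        sum + (1 << (bd + filter_bits - 1)), round_0);
}

static uint8_t ver_round_sketch(int32_t sum, int32_t round_0, int32_t round_1) {
    /* sum: raw vertical dot product of the 16-bit intermediate values */
    const int32_t bd = 8, filter_bits = 7;
    const int32_t bits = 2 * filter_bits - round_0 - round_1;
    const int32_t offset_bits = bd + 2 * filter_bits - round_0;
    int32_t res = round_power_of_two_sketch(sum + (1 << offset_bits), round_1);
    res -= (1 << (offset_bits - round_1)) +
           ((1 << (offset_bits - round_1)) >> 1);
    res = round_power_of_two_sketch(res, bits);
    return (uint8_t)(res < 0 ? 0 : (res > 255 ? 255 : res));  /* clip to 8 bit */
}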
    1318             : #else
    1319             : void eb_av1_convolve_2d_sr_avx2(const uint8_t *src, int32_t src_stride,
    1320             :     uint8_t *dst, int32_t dst_stride, int32_t w,
    1321             :     int32_t h, InterpFilterParams *filter_params_x,
    1322             :     InterpFilterParams *filter_params_y,
    1323             :     const int32_t subpel_x_q4,
    1324             :     const int32_t subpel_y_q4,
    1325             :     ConvolveParams *conv_params) {
    1326             :     static const convolve_2d_sr_hor_tap_func
    1327             :         convolve_2d_sr_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
    1328             :             NULL,
    1329             :             NULL,
    1330             :             convolve_2d_sr_hor_2tap_avx2,
    1331             :             NULL,
    1332             :             convolve_2d_sr_hor_4tap_avx2,
    1333             :             NULL,
    1334             :             convolve_2d_sr_hor_6tap_avx2,
    1335             :             NULL,
    1336             :             convolve_2d_sr_hor_8tap_avx2 };
    1337             :     static const convolve_2d_sr_ver_tap_func
    1338             :         convolve_2d_sr_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
    1339             :             NULL,
    1340             :             convolve_2d_sr_ver_2tap_half_avx2,
    1341             :             convolve_2d_sr_ver_2tap_avx2,
    1342             :             convolve_2d_sr_ver_4tap_avx2,
    1343             :             convolve_2d_sr_ver_4tap_avx2,
    1344             :             convolve_2d_sr_ver_6tap_avx2,
    1345             :             convolve_2d_sr_ver_6tap_avx2,
    1346             :             convolve_2d_sr_ver_8tap_avx2,
    1347             :             convolve_2d_sr_ver_8tap_avx2 };
    1348             :     const int32_t tap_x = get_convolve_tap(filter_params_x->filter_ptr);
    1349             :     const int32_t tap_y = get_convolve_tap(filter_params_y->filter_ptr);
    1350             :     const uint8_t *src_ptr =
    1351             :         src + ((MAX_FILTER_TAP - tap_y) / 2 - 3) * src_stride;
    1352             :     // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
    1353             :     //       permutation.
    1354             :     DECLARE_ALIGNED(
     1355             :         32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
    1356             : 
    1357             :     (void)conv_params;
    1358             : 
    1359             :     assert(conv_params->round_0 == 3);
    1360             :     assert(conv_params->round_1 == 11);
    1361             : 
    1362             :     // horizontal filter
    1363             : 
     1364             :     // One extra row has to be calculated for small widths (w < 32), since
     1365             :     // their loops process 2 rows per iteration.
    1366             :     const int32_t hh = h + tap_y - (w >= 32);
    1367             : 
    1368             :     convolve_2d_sr_hor_tap_func_table[tap_x](
    1369             :         src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);
    1370             : 
    1371             :     // vertical filter
    1372             :     convolve_2d_sr_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
    1373             :         dst, dst_stride, w, h, filter_params_y, subpel_y_q4, im_block);
    1374             : }
    1375             : #endif
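/*
 * Editorial sketch (not part of the covered source): the two small pieces of
 * index arithmetic used by the !OBMC_CONVOLVE dispatcher above, written out
 * as plain C. Names are illustrative only.
 */
#include <stdint.h>

static int32_t intermediate_height_sketch(int32_t h, int32_t tap_y, int32_t w) {
    /* h + tap_y - 1 intermediate rows are needed; widths below 32 are
     * filtered two rows per loop iteration, so one spare row is produced. */
    return h + tap_y - (w >= 32);
}

static int32_t ver_table_index_sketch(int32_t tap_y, int32_t subpel_y_q4) {
    /* Only the 2-tap filter has a dedicated half-pel (averaging) kernel in
     * the odd table slot; the 4/6/8-tap entries alias the same function for
     * both cases (the 8-tap version above branches on subpel_y_q4 itself). */
    return tap_y - (subpel_y_q4 == 8);
}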
    1376           0 : static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
    1377             :     __m256i s[4];
    1378           0 :     s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
    1379           0 :     s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
    1380           0 :     s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
    1381           0 :     s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
    1382           0 :     _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
    1383           0 :     _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
    1384           0 :     _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
    1385           0 :     _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
    1386           0 : }
    1387             : 
    1388     8688620 : void eb_av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int32_t src_stride,
    1389             :     uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h,
    1390             :     InterpFilterParams *filter_params_x,
    1391             :     InterpFilterParams *filter_params_y,
    1392             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
    1393             :     ConvolveParams *conv_params) {
    1394             :     (void)filter_params_x;
    1395             :     (void)filter_params_y;
    1396             :     (void)subpel_x_q4;
    1397             :     (void)subpel_y_q4;
    1398             :     (void)conv_params;
    1399             : 
    1400     8688620 :     if (w == 2) {
    1401             :         do {
    1402        9771 :             memcpy(dst, src, 2 * sizeof(*src));
    1403        9771 :             src += src_stride;
    1404        9771 :             dst += dst_stride;
    1405        9771 :             memcpy(dst, src, 2 * sizeof(*src));
    1406        9771 :             src += src_stride;
    1407        9771 :             dst += dst_stride;
    1408        9771 :             h -= 2;
    1409        9771 :         } while (h);
    1410             :     }
    1411     8685260 :     else if (w == 4) {
    1412             :         do {
    1413     3385780 :             memcpy(dst, src, 4 * sizeof(*src));
    1414     3385780 :             src += src_stride;
    1415     3385780 :             dst += dst_stride;
    1416     3385780 :             memcpy(dst, src, 4 * sizeof(*src));
    1417     3385780 :             src += src_stride;
    1418     3385780 :             dst += dst_stride;
    1419     3385780 :             h -= 2;
    1420     3385780 :         } while (h);
    1421             :     }
    1422     7812980 :     else if (w == 8) {
    1423             :         do {
    1424             :             __m128i s[2];
    1425    21817300 :             s[0] = _mm_loadl_epi64((__m128i *)src);
    1426    21817300 :             src += src_stride;
    1427    21817300 :             s[1] = _mm_loadl_epi64((__m128i *)src);
    1428    21817300 :             src += src_stride;
    1429    21817300 :             _mm_storel_epi64((__m128i *)dst, s[0]);
    1430    21817300 :             dst += dst_stride;
    1431    21817300 :             _mm_storel_epi64((__m128i *)dst, s[1]);
    1432    21817300 :             dst += dst_stride;
    1433    21817300 :             h -= 2;
    1434    21817300 :         } while (h);
    1435             :     }
    1436     4262830 :     else if (w == 16) {
    1437             :         do {
    1438             :             __m128i s[2];
    1439    23695300 :             s[0] = _mm_loadu_si128((__m128i *)src);
    1440    23695300 :             src += src_stride;
    1441    23695300 :             s[1] = _mm_loadu_si128((__m128i *)src);
    1442    23695300 :             src += src_stride;
    1443    23695300 :             _mm_storeu_si128((__m128i *)dst, s[0]);
    1444    23695300 :             dst += dst_stride;
    1445    23695300 :             _mm_storeu_si128((__m128i *)dst, s[1]);
    1446    23695300 :             dst += dst_stride;
    1447    23695300 :             h -= 2;
    1448    23695300 :         } while (h);
    1449             :     }
    1450     1626020 :     else if (w == 32) {
    1451             :         do {
    1452             :             __m256i s[2];
    1453    14931300 :             s[0] = _mm256_loadu_si256((__m256i *)src);
    1454    14931300 :             src += src_stride;
    1455    14931300 :             s[1] = _mm256_loadu_si256((__m256i *)src);
    1456    14931300 :             src += src_stride;
    1457    14931300 :             _mm256_storeu_si256((__m256i *)dst, s[0]);
    1458    14931300 :             dst += dst_stride;
    1459    14931300 :             _mm256_storeu_si256((__m256i *)dst, s[1]);
    1460    14931300 :             dst += dst_stride;
    1461    14931300 :             h -= 2;
    1462    14931300 :         } while (h);
    1463             :     }
    1464      365102 :     else if (w == 64) {
    1465             :         do {
    1466             :             __m256i s[4];
    1467     6325020 :             s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
    1468     6325020 :             s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
    1469     6325020 :             src += src_stride;
    1470     6325020 :             s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
    1471     6325020 :             s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
    1472     6325020 :             src += src_stride;
    1473     6325020 :             _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
    1474     6325020 :             _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
    1475     6325020 :             dst += dst_stride;
    1476     6325020 :             _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
    1477     6325020 :             _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
    1478     6325020 :             dst += dst_stride;
    1479     6325020 :             h -= 2;
    1480     6325020 :         } while (h);
    1481             :     }
    1482             :     else {
    1483             :         do {
    1484           0 :             copy_128(src, dst);
    1485           0 :             src += src_stride;
    1486           0 :             dst += dst_stride;
    1487           0 :             copy_128(src, dst);
    1488           0 :             src += src_stride;
    1489           0 :             dst += dst_stride;
    1490           0 :             h -= 2;
    1491           0 :         } while (h);
    1492             :     }
    1493     8688620 : }
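/*
 * Editorial sketch (not part of the covered source): scalar reference for
 * eb_av1_convolve_2d_copy_sr_avx2 above. The copy path ignores the filter
 * parameters entirely; the AVX2 version merely specializes this inner copy
 * per block width and moves two rows per iteration.
 */
#include <stdint.h>
#include <string.h>

static void convolve_2d_copy_sr_sketch(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       int32_t w, int32_t h) {
    for (int32_t y = 0; y < h; ++y) {
        memcpy(dst, src, (size_t)w);  /* one row of w 8-bit pixels */
        src += src_stride;
        dst += dst_stride;
    }
}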

Generated by: LCOV version 1.14