LCOV - code coverage report
Current view: top level - ASM_AVX2 - EbBlend_a64_mask_avx2.c (source / functions)
Test: coverage.info
Date: 2019-11-25 17:38:06
                               Hit      Total   Coverage
Lines:                         290        818     35.5 %
Functions:                      14         35     40.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
        3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : /*
       7             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       8             :  *
       9             :  * This source code is subject to the terms of the BSD 2 Clause License and
      10             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      11             :  * was not distributed with this source code in the LICENSE file, you can
      12             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      13             :  * Media Patent License 1.0 was not distributed with this source code in the
      14             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      15             :  */
      16             : 
      17             : #include <assert.h>
       18             : #include <immintrin.h>
      19             : 
      20             : #include "EbDefinitions.h"
      21             : 
      22             : #include "synonyms.h"
      23             : #include "EbMemory_AVX2.h"
      24             : #include "synonyms_avx2.h"
      25             : #include "convolve_avx2.h"
      26             : #include "EbBlend_sse4.h"
      27             : 
      28             : #include "aom_dsp_rtcd.h"
      29             : 
      30      109072 : static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0,
      31             :     const uint8_t *src1,
      32             :     const __m256i *v_m0_b,
      33             :     const __m256i *v_m1_b,
      34             :     const int32_t bits)
      35             : {
      36      109072 :     const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
      37      109072 :     const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
      38      109072 :     const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
      39      109072 :     const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
      40             : 
      41             :     const __m256i v_p0_w =
      42      327216 :         _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
      43             :             _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
      44             : 
      45      109072 :     const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
      46      109072 :     const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
      47      109072 :     const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
      48      109072 :     return v_res;
      49             : }
      50             : 
      51    85973600 : static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0,
      52             :     const uint8_t *src1,
      53             :     const __m256i *v_m0_b,
      54             :     const __m256i *v_m1_b,
      55             :     const int32_t bits)
      56             : {
      57    85973600 :     const __m256i v_s0_b = yy_loadu_256(src0);
      58    85929700 :     const __m256i v_s1_b = yy_loadu_256(src1);
      59             : 
      60             :     const __m256i v_p0_w =
      61   257806000 :         _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
      62             :             _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
      63             :     const __m256i v_p1_w =
      64   257806000 :         _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
      65             :             _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
      66             : 
      67    85935400 :     const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
      68    85990900 :     const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
      69    85992000 :     const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
      70    85992000 :     return v_res;
      71             : }
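                      : 
                      : /* The two helpers above vectorize the A64 blend kernel: each output byte is
                      :  * (m0 * s0 + (64 - m0) * s1 + 32) >> 6. Interleaving the source bytes with
                      :  * unpacklo/hi_epi8 and the mask bytes the same way lets one
                      :  * _mm256_maddubs_epi16 compute m0*s0 + m1*s1 per pixel in 16 bits
                      :  * (at most 64 * 255 = 16320, so no saturation). A minimal scalar sketch of
                      :  * the same per-pixel computation; blend_px_ref is a name introduced here
                      :  * purely for illustration and is not part of this file:
                      :  */
                      : static uint8_t blend_px_ref(uint8_t s0, uint8_t s1, uint8_t m0) {
                      :     const int m1 = AOM_BLEND_A64_MAX_ALPHA - m0;  /* 64 - m0 */
                      :     const int acc = m0 * s0 + m1 * s1;            /* fits in 16 bits */
                      :     /* Round to nearest and drop AOM_BLEND_A64_ROUND_BITS (= 6) bits. */
                      :     return (uint8_t)((acc + (1 << (AOM_BLEND_A64_ROUND_BITS - 1))) >>
                      :                      AOM_BLEND_A64_ROUND_BITS);
                      : }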
      72             : 
      73        9016 : static INLINE void blend_a64_mask_sx_sy_w16_avx2(
      74             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
      75             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
      76             :     const uint8_t *mask, uint32_t mask_stride, int h)
      77             : {
      78        9016 :     const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
      79        9016 :     const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
      80             :     do {
      81      109072 :         const __m256i v_ral_b = yy_loadu_256(mask);
      82      109072 :         const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
      83      109072 :         const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
      84      109072 :         const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
      85             :         const __m256i v_rvsbl_w =
      86      218144 :             _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
      87      109072 :         const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
      88             : 
      89      109072 :         const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
      90      109072 :         const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
      91      109072 :         const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
      92             : 
      93      109072 :         const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
      94             :             AOM_BLEND_A64_ROUND_BITS);
      95             : 
      96      109072 :         xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
      97      109072 :         dst += dst_stride;
      98      109072 :         src0 += src0_stride;
      99      109072 :         src1 += src1_stride;
     100      109072 :         mask += 2 * mask_stride;
     101      109072 :     } while (--h);
     102        9016 : }
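                      : 
                      : /* When both subsampling flags are set the mask covers twice the block in
                      :  * each dimension, so each output pixel's weight is the rounded average of a
                      :  * 2x2 group of mask bytes, which is what the row-add / even-odd split above
                      :  * computes. A scalar sketch of one weight; downsample_2x2_ref is an
                      :  * illustrative name only, not a function in this file:
                      :  */
                      : static uint8_t downsample_2x2_ref(const uint8_t *mask, uint32_t mask_stride,
                      :                                   int col) {
                      :     const int sum = mask[2 * col] + mask[2 * col + 1] +
                      :                     mask[mask_stride + 2 * col] +
                      :                     mask[mask_stride + 2 * col + 1];
                      :     return (uint8_t)((sum + 2) >> 2);  /* ROUND_POWER_OF_TWO(sum, 2) */
                      : }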
     103             : 
     104           0 : static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
     105             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     106             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     107             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     108             : {
     109           0 :     const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     110           0 :     const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
     111             :     do {
     112             :         int c;
     113           0 :         for (c = 0; c < w; c += 32) {
     114           0 :             const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
     115           0 :             const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
     116           0 :             const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
     117           0 :             const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
     118           0 :             const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
     119           0 :             const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
     120           0 :             const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
     121           0 :             const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
     122             :             const __m256i v_rvsbl_w =
     123           0 :                 _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
     124             :             const __m256i v_rvsbh_w =
     125           0 :                 _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
     126           0 :             const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
     127           0 :             const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
     128             : 
     129           0 :             const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
     130           0 :             const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
     131           0 :             const __m256i v_m0_b =
     132           0 :                 _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
     133           0 :             const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
     134             : 
     135           0 :             const __m256i v_res_b = blend_32_u8_avx2(
     136             :                 src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
     137             : 
     138           0 :             yy_storeu_256(dst + c, v_res_b);
     139             :         }
     140           0 :         dst += dst_stride;
     141           0 :         src0 += src0_stride;
     142           0 :         src1 += src1_stride;
     143           0 :         mask += 2 * mask_stride;
     144           0 :     } while (--h);
     145           0 : }
     146             : 
     147       71390 : static INLINE void blend_a64_mask_sx_sy_avx2(
     148             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     149             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     150             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     151             : {
     152       71390 :     const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
     153       71390 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     154       71390 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     155       71390 :     switch (w) {
     156      193848 :     case 4:
     157             :         do {
     158      193848 :             const __m128i v_ra_b = xx_loadl_64(mask);
     159      193848 :             const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
     160      193848 :             const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
     161      193848 :             const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
     162      193848 :             const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
     163      387696 :             const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
     164      193848 :             const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
     165      193848 :             const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
     166      193848 :             const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
     167      193848 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     168             : 
     169      193848 :             const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     170             : 
     171      193848 :             xx_storel_32(dst, v_res_b);
     172             : 
     173      193848 :             dst += dst_stride;
     174      193848 :             src0 += src0_stride;
     175      193848 :             src1 += src1_stride;
     176      193848 :             mask += 2 * mask_stride;
     177      193848 :         } while (--h);
     178       36928 :         break;
     179      169434 :     case 8:
     180             :         do {
     181      194880 :             const __m128i v_ra_b = xx_loadu_128(mask);
     182      194880 :             const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
     183      194880 :             const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
     184      194880 :             const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
     185      194880 :             const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
     186      389760 :             const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
     187      194880 :             const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
     188      194880 :             const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
     189      194880 :             const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
     190      194880 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     191             : 
     192      194880 :             const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     193             : 
     194      194880 :             xx_storel_64(dst, v_res_b);
     195             : 
     196      194880 :             dst += dst_stride;
     197      194880 :             src0 += src0_stride;
     198      194880 :             src1 += src1_stride;
     199      194880 :             mask += 2 * mask_stride;
     200      194880 :         } while (--h);
     201       25446 :         break;
     202        9016 :     case 16:
     203        9016 :         blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
     204             :             src1_stride, mask, mask_stride, h);
     205        9016 :         break;
     206           0 :     default:
     207           0 :         blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
     208             :             src1_stride, mask, mask_stride, w, h);
     209           0 :         break;
     210             :     }
     211       71390 : }
     212             : 
     213           0 : static INLINE void blend_a64_mask_sx_w16_avx2(
     214             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     215             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     216             :     const uint8_t *mask, uint32_t mask_stride, int h)
     217             : {
     218           0 :     const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     219           0 :     const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
     220             :     do {
     221           0 :         const __m256i v_rl_b = yy_loadu_256(mask);
     222             :         const __m256i v_al_b =
     223           0 :             _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
     224             : 
     225           0 :         const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
     226           0 :         const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
     227           0 :         const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
     228             : 
     229           0 :         const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
     230             :             AOM_BLEND_A64_ROUND_BITS);
     231             : 
     232           0 :         xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
     233           0 :         dst += dst_stride;
     234           0 :         src0 += src0_stride;
     235           0 :         src1 += src1_stride;
     236           0 :         mask += mask_stride;
     237           0 :     } while (--h);
     238           0 : }
     239             : 
     240           0 : static INLINE void blend_a64_mask_sx_w32n_avx2(
     241             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     242             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     243             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     244             : {
     245           0 :     const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
     246           0 :     const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     247             :     do {
     248             :         int c;
     249           0 :         for (c = 0; c < w; c += 32) {
     250           0 :             const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
     251           0 :             const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
     252           0 :             const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
     253           0 :             const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
     254             :             const __m256i v_al_b =
     255           0 :                 _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
     256             :             const __m256i v_ah_b =
     257           0 :                 _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
     258             : 
     259           0 :             const __m256i v_m0_b =
     260           0 :                 _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
     261           0 :             const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
     262             : 
     263           0 :             const __m256i v_res_b = blend_32_u8_avx2(
     264             :                 src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
     265             : 
     266           0 :             yy_storeu_256(dst + c, v_res_b);
     267             :         }
     268           0 :         dst += dst_stride;
     269           0 :         src0 += src0_stride;
     270           0 :         src1 += src1_stride;
     271           0 :         mask += mask_stride;
     272           0 :     } while (--h);
     273           0 : }
     274             : 
     275           0 : static INLINE void blend_a64_mask_sx_avx2(
     276             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     277             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     278             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     279             : {
     280           0 :     const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
     281           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     282           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     283           0 :     switch (w) {
     284           0 :     case 4:
     285             :         do {
     286           0 :             const __m128i v_r_b = xx_loadl_64(mask);
     287           0 :             const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
     288           0 :             const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
     289           0 :             const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
     290           0 :             const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
     291           0 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     292             : 
     293           0 :             const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     294             : 
     295           0 :             xx_storel_32(dst, v_res_b);
     296             : 
     297           0 :             dst += dst_stride;
     298           0 :             src0 += src0_stride;
     299           0 :             src1 += src1_stride;
     300           0 :             mask += mask_stride;
     301           0 :         } while (--h);
     302           0 :         break;
     303           0 :     case 8:
     304             :         do {
     305           0 :             const __m128i v_r_b = xx_loadu_128(mask);
     306           0 :             const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
     307           0 :             const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
     308           0 :             const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
     309           0 :             const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
     310           0 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     311             : 
     312           0 :             const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     313             : 
     314           0 :             xx_storel_64(dst, v_res_b);
     315             : 
     316           0 :             dst += dst_stride;
     317           0 :             src0 += src0_stride;
     318           0 :             src1 += src1_stride;
     319           0 :             mask += mask_stride;
     320           0 :         } while (--h);
     321           0 :         break;
     322           0 :     case 16:
     323           0 :         blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
     324             :             src1_stride, mask, mask_stride, h);
     325           0 :         break;
     326           0 :     default:
     327           0 :         blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
     328             :             src1_stride, mask, mask_stride, w, h);
     329           0 :         break;
     330             :     }
     331           0 : }
     332             : 
     333           0 : static INLINE void blend_a64_mask_sy_w16_avx2(
     334             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     335             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     336             :     const uint8_t *mask, uint32_t mask_stride, int h) {
     337           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     338           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     339             :     do {
     340           0 :         const __m128i v_ra_b = xx_loadu_128(mask);
     341           0 :         const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
     342           0 :         const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
     343             : 
      344           0 :         const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     345           0 :         const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     346             : 
     347           0 :         xx_storeu_128(dst, v_res_b);
     348           0 :         dst += dst_stride;
     349           0 :         src0 += src0_stride;
     350           0 :         src1 += src1_stride;
     351           0 :         mask += 2 * mask_stride;
     352           0 :     } while (--h);
     353           0 : }
     354             : 
     355           0 : static INLINE void blend_a64_mask_sy_w32n_avx2(
     356             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     357             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     358             :     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
     359           0 :     const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     360             :     do {
     361             :         int c;
     362           0 :         for (c = 0; c < w; c += 32) {
     363           0 :             const __m256i v_ra_b = yy_loadu_256(mask + c);
     364           0 :             const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
     365           0 :             const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
     366           0 :             const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
     367           0 :             const __m256i v_res_b = blend_32_u8_avx2(
     368             :                 src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
     369             : 
     370           0 :             yy_storeu_256(dst + c, v_res_b);
     371             :         }
     372           0 :         dst += dst_stride;
     373           0 :         src0 += src0_stride;
     374           0 :         src1 += src1_stride;
     375           0 :         mask += 2 * mask_stride;
     376           0 :     } while (--h);
     377           0 : }
     378             : 
     379           0 : static INLINE void blend_a64_mask_sy_avx2(
     380             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     381             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     382             :     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
     383           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     384           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     385           0 :     switch (w) {
     386           0 :     case 4:
     387             :         do {
     388           0 :             const __m128i v_ra_b = xx_loadl_32(mask);
     389           0 :             const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
     390           0 :             const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
     391           0 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     392           0 :             const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     393             : 
     394           0 :             xx_storel_32(dst, v_res_b);
     395             : 
     396           0 :             dst += dst_stride;
     397           0 :             src0 += src0_stride;
     398           0 :             src1 += src1_stride;
     399           0 :             mask += 2 * mask_stride;
     400           0 :         } while (--h);
     401           0 :         break;
     402           0 :     case 8:
     403             :         do {
     404           0 :             const __m128i v_ra_b = xx_loadl_64(mask);
     405           0 :             const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
     406           0 :             const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
     407           0 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     408           0 :             const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     409             : 
     410           0 :             xx_storel_64(dst, v_res_b);
     411             : 
     412           0 :             dst += dst_stride;
     413           0 :             src0 += src0_stride;
     414           0 :             src1 += src1_stride;
     415           0 :             mask += 2 * mask_stride;
     416           0 :         } while (--h);
     417           0 :         break;
     418           0 :     case 16:
     419           0 :         blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
     420             :             src1_stride, mask, mask_stride, h);
     421           0 :         break;
     422           0 :     default:
     423           0 :         blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
     424             :             src1_stride, mask, mask_stride, w, h);
     425             :     }
     426           0 : }
     427             : 
     428     3715530 : static INLINE void blend_a64_mask_w32n_avx2(
     429             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     430             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     431             :     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
     432     3715530 :     const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     433             :     do {
     434             :         int c;
     435   171989000 :         for (c = 0; c < w; c += 32) {
     436    86004800 :             const __m256i v_m0_b = yy_loadu_256(mask + c);
     437    85989600 :             const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
     438             : 
     439    85989600 :             const __m256i v_res_b = blend_32_u8_avx2(
     440             :                 src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
     441             : 
     442    85988500 :             yy_storeu_256(dst + c, v_res_b);
     443             :         }
     444    85983900 :         dst += dst_stride;
     445    85983900 :         src0 += src0_stride;
     446    85983900 :         src1 += src1_stride;
     447    85983900 :         mask += mask_stride;
     448    85983900 :     } while (--h);
     449     3701270 : }
     450             : 
     451    29600200 : static INLINE void blend_a64_mask_avx2(
     452             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     453             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     454             :     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
     455    29600200 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     456    29600200 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     457    29600200 :     switch (w) {
     458      243998 :     case 4:
     459             :         do {
     460      243998 :             const __m128i v_m0_b = xx_loadl_32(mask);
     461      243998 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     462      243998 :             const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     463             : 
     464      243997 :             xx_storel_32(dst, v_res_b);
     465             : 
     466      243998 :             dst += dst_stride;
     467      243998 :             src0 += src0_stride;
     468      243998 :             src1 += src1_stride;
     469      243998 :             mask += mask_stride;
     470      243998 :         } while (--h);
     471       46548 :         break;
     472   153878000 :     case 8:
     473             :         do {
     474   168454000 :             const __m128i v_m0_b = xx_loadl_64(mask);
     475   168312000 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     476   168312000 :             const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     477             : 
     478   168342000 :             xx_storel_64(dst, v_res_b);
     479             : 
     480   168452000 :             dst += dst_stride;
     481   168452000 :             src0 += src0_stride;
     482   168452000 :             src1 += src1_stride;
     483   168452000 :             mask += mask_stride;
     484   168452000 :         } while (--h);
     485    14573700 :         break;
     486   161487000 :     case 16:
     487             :         do {
     488   172749000 :             const __m128i v_m0_b = xx_loadu_128(mask);
     489   172671000 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     490   172671000 :             const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     491             : 
     492   172729000 :             xx_storeu_128(dst, v_res_b);
     493   172764000 :             dst += dst_stride;
     494   172764000 :             src0 += src0_stride;
     495   172764000 :             src1 += src1_stride;
     496   172764000 :             mask += mask_stride;
     497   172764000 :         } while (--h);
     498    11276900 :         break;
     499     3715590 :     default:
     500     3715590 :         blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
     501             :             src1_stride, mask, mask_stride, w, h);
     502             :     }
     503    29612700 : }
     504             : 
     505    29666300 : void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
     506             :     const uint8_t *src0, uint32_t src0_stride,
     507             :     const uint8_t *src1, uint32_t src1_stride,
     508             :     const uint8_t *mask, uint32_t mask_stride, int w,
     509             :     int h, int subx, int suby)
     510             : {
     511    29666300 :     assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
     512    29666300 :     assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
     513             : 
     514    29666300 :     assert(h >= 1);
     515    29666300 :     assert(w >= 1);
     516    29666300 :     assert(IS_POWER_OF_TWO(h));
     517    29666300 :     assert(IS_POWER_OF_TWO(w));
     518             : 
     519    29666300 :     if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     520           0 :         aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
     521             :             mask, mask_stride, w, h, subx, suby);
     522             :     }
     523             :     else {
     524    29666300 :         if (subx & suby) {
     525       71390 :             blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
     526             :                 src1_stride, mask, mask_stride, w, h);
     527             :         }
     528    29594900 :         else if (subx) {
     529           0 :             blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
     530             :                 src1_stride, mask, mask_stride, w, h);
     531             :         }
     532    29594900 :         else if (suby) {
     533           0 :             blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
     534             :                 src1_stride, mask, mask_stride, w, h);
     535             :         }
     536             :         else {
     537    29594900 :             blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
     538             :                 mask, mask_stride, w, h);
     539             :         }
     540             :     }
     541    29674000 : }
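                      : 
                      : /* Dispatch note: w and h are asserted to be powers of two, so (h | w) & 3 is
                      :  * non-zero exactly when either dimension is 1 or 2; those thin blocks fall
                      :  * back to the C reference. A hedged example of calling the entry point on a
                      :  * 4x4 block with a non-subsampled 4x4 mask (all names below are caller-side
                      :  * variables introduced for illustration, not part of this file):
                      :  */
                      : static void blend_4x4_example(uint8_t *dst, const uint8_t *pred0,
                      :                               const uint8_t *pred1, const uint8_t *mask4x4) {
                      :     /* subx = suby = 0: the mask is sampled 1:1, one byte per pixel. */
                      :     aom_blend_a64_mask_avx2(dst, /*dst_stride=*/4, pred0, /*src0_stride=*/4,
                      :                             pred1, /*src1_stride=*/4, mask4x4,
                      :                             /*mask_stride=*/4, /*w=*/4, /*h=*/4,
                      :                             /*subx=*/0, /*suby=*/0);
                      : }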
     542             : 
      543             : /* Functions from convolve_avx2.c */
     544   251250000 : static INLINE void blend_a64_d16_mask_w16_avx2(
     545             :     uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
     546             :     const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
     547             :     int shift)
     548             : {
     549   251250000 :     const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
     550   251250000 :     const __m256i s0_0 = yy_loadu_256(src0);
     551   251176000 :     const __m256i s1_0 = yy_loadu_256(src1);
     552   752532000 :     __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
     553             :         _mm256_unpacklo_epi16(*m0, max_minus_m0));
     554   752532000 :     __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
     555             :         _mm256_unpackhi_epi16(*m0, max_minus_m0));
     556             :     res0_lo =
     557   501688000 :         _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
     558             :     res0_hi =
     559   752532000 :         _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
     560   250844000 :     const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
     561   250844000 :     __m256i res = _mm256_packus_epi16(res0, res0);
     562   250844000 :     res = _mm256_permute4x64_epi64(res, 0xd8);
     563   250844000 :     _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
     564   250844000 : }
     565             : 
     566   259858000 : static INLINE void blend_a64_d16_mask_w32_avx2(
     567             :     uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
     568             :     const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
     569             :     const __m256i *v_maxval, int shift)
     570             : {
     571   259858000 :     const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
     572   259858000 :     const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
     573   259858000 :     const __m256i s0_0 = yy_loadu_256(src0);
     574   259623000 :     const __m256i s0_1 = yy_loadu_256(src0 + 16);
     575   259079000 :     const __m256i s1_0 = yy_loadu_256(src1);
     576   258594000 :     const __m256i s1_1 = yy_loadu_256(src1 + 16);
     577   775074000 :     __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
     578             :         _mm256_unpacklo_epi16(*m0, max_minus_m0));
     579   775074000 :     __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
     580             :         _mm256_unpackhi_epi16(*m0, max_minus_m0));
     581   775074000 :     __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
     582             :         _mm256_unpacklo_epi16(*m1, max_minus_m1));
     583   775074000 :     __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
     584             :         _mm256_unpackhi_epi16(*m1, max_minus_m1));
     585             :     res0_lo =
     586   516716000 :         _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
     587             :     res0_hi =
     588   516716000 :         _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
     589             :     res1_lo =
     590   516716000 :         _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
     591             :     res1_hi =
     592   775074000 :         _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
     593   258358000 :     const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
     594   258358000 :     const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
     595   258358000 :     __m256i res = _mm256_packus_epi16(res0, res1);
     596   258358000 :     res = _mm256_permute4x64_epi64(res, 0xd8);
     597             :     _mm256_storeu_si256((__m256i *)(dst), res);
     598   258358000 : }
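                      : 
                      : /* The d16 variants blend 16-bit compound-prediction samples (CONV_BUF_TYPE)
                      :  * rather than pixels. _mm256_madd_epi16 on interleaved (s0, s1) and
                      :  * (m, 64 - m) pairs yields the 32-bit weighted sum m*s0 + (64 - m)*s1; the
                      :  * caller-supplied *v_round_offset folds together the compound-prediction
                      :  * bias (scaled by 64) and the round-to-nearest addend, so one subtract plus
                      :  * an arithmetic shift finishes the sample. A scalar sketch of one output
                      :  * byte; blend_d16_px_ref is an illustrative name only:
                      :  */
                      : static uint8_t blend_d16_px_ref(int32_t s0, int32_t s1, int32_t m,
                      :                                 int32_t round_offset, int shift) {
                      :     const int32_t sum = m * s0 + (AOM_BLEND_A64_MAX_ALPHA - m) * s1;
                      :     const int32_t res = (sum - round_offset) >> shift;  /* arithmetic shift */
                      :     return (uint8_t)(res < 0 ? 0 : (res > 255 ? 255 : res));  /* clip to u8 */
                      : }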
     599             : 
     600    13567500 : static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
     601             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     602             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     603             :     const uint8_t *mask, uint32_t mask_stride, int h,
     604             :     const __m256i *round_offset, int shift)
     605             : {
     606    13567500 :     const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     607   263514000 :     for (int i = 0; i < h; ++i) {
     608   250036000 :         const __m128i m = xx_loadu_128(mask);
     609   249866000 :         const __m256i m0 = _mm256_cvtepu8_epi16(m);
     610             : 
     611   249866000 :         blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
     612             :             shift);
     613   249946000 :         mask += mask_stride;
     614   249946000 :         dst += dst_stride;
     615   249946000 :         src0 += src0_stride;
     616   249946000 :         src1 += src1_stride;
     617             :     }
     618    13478100 : }
     619             : 
     620     8457080 : static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
     621             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     622             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     623             :     const uint8_t *mask, uint32_t mask_stride, int h, int w,
     624             :     const __m256i *round_offset, int shift)
     625             : {
     626     8457080 :     const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     627   214734000 :     for (int i = 0; i < h; ++i) {
     628   466100000 :         for (int j = 0; j < w; j += 32) {
     629   259823000 :             const __m256i m = yy_loadu_256(mask + j);
     630   259652000 :             const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
     631   259652000 :             const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
     632             : 
     633   259652000 :             blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
     634             :                 round_offset, &v_maxval, shift);
     635             :         }
     636   206277000 :         mask += mask_stride;
     637   206277000 :         dst += dst_stride;
     638   206277000 :         src0 += src0_stride;
     639   206277000 :         src1 += src1_stride;
     640             :     }
     641     8483450 : }
     642             : 
     643      118618 : static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
     644             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     645             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     646             :     const uint8_t *mask, uint32_t mask_stride, int h,
     647             :     const __m256i *round_offset, int shift)
     648             : {
     649      118618 :     const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     650      118618 :     const __m256i one_b = _mm256_set1_epi8(1);
     651      118618 :     const __m256i two_w = _mm256_set1_epi16(2);
     652     1575040 :     for (int i = 0; i < h; ++i) {
     653     1456420 :         const __m256i m_i00 = yy_loadu_256(mask);
     654     1456420 :         const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
     655             : 
     656     1456420 :         const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
     657     1456420 :         const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
     658     1456420 :         const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
     659             : 
     660     1456420 :         blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
     661             :             shift);
     662     1456430 :         mask += mask_stride << 1;
     663     1456430 :         dst += dst_stride;
     664     1456430 :         src0 += src0_stride;
     665     1456430 :         src1 += src1_stride;
     666             :     }
     667      118620 : }
     668             : 
     669       41488 : static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
     670             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     671             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     672             :     const uint8_t *mask, uint32_t mask_stride, int h, int w,
     673             :     const __m256i *round_offset, int shift)
     674             : {
     675       41488 :     const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     676       41488 :     const __m256i one_b = _mm256_set1_epi8(1);
     677       41488 :     const __m256i two_w = _mm256_set1_epi16(2);
     678      662682 :     for (int i = 0; i < h; ++i) {
     679     1242390 :         for (int j = 0; j < w; j += 32) {
     680      621193 :             const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
     681      621191 :             const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
     682      621185 :             const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
     683      621179 :             const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
     684             : 
     685      621175 :             const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
     686      621175 :             const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
     687      621175 :             const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
     688      621175 :             const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
     689     1242350 :             const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
     690      621175 :             const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
     691             : 
     692      621175 :             blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
     693             :                 round_offset, &v_maxval, shift);
     694             :         }
     695      621194 :         mask += mask_stride << 1;
     696      621194 :         dst += dst_stride;
     697      621194 :         src0 += src0_stride;
     698      621194 :         src1 += src1_stride;
     699             :     }
     700       41489 : }
     701             : 
     702           0 : static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
     703             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     704             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     705             :     const uint8_t *mask, uint32_t mask_stride, int h, int w,
     706             :     const __m256i *round_offset, int shift)
     707             : {
     708           0 :     const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     709           0 :     const __m256i one_b = _mm256_set1_epi8(1);
     710           0 :     const __m256i zeros = _mm256_setzero_si256();
     711           0 :     for (int i = 0; i < h; ++i) {
     712           0 :         for (int j = 0; j < w; j += 16) {
     713           0 :             const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
     714           0 :             const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
     715           0 :             const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
     716             : 
     717           0 :             blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
     718             :                 round_offset, &v_maxval, shift);
     719             :         }
     720           0 :         mask += mask_stride;
     721           0 :         dst += dst_stride;
     722           0 :         src0 += src0_stride;
     723           0 :         src1 += src1_stride;
     724             :     }
     725           0 : }
     726             : 
     727           0 : static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
     728             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     729             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     730             :     const uint8_t *mask, uint32_t mask_stride, int h, int w,
     731             :     const __m256i *round_offset, int shift)
     732             : {
     733           0 :     const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     734           0 :     const __m256i one_b = _mm256_set1_epi8(1);
     735           0 :     const __m256i zeros = _mm256_setzero_si256();
     736           0 :     for (int i = 0; i < h; ++i) {
     737           0 :         for (int j = 0; j < w; j += 32) {
     738           0 :             const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
     739           0 :             const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
     740           0 :             const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
     741           0 :             const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
     742           0 :             const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
     743           0 :             const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
     744             : 
     745           0 :             blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
     746             :                 round_offset, &v_maxval, shift);
     747             :         }
     748           0 :         mask += mask_stride;
     749           0 :         dst += dst_stride;
     750           0 :         src0 += src0_stride;
     751           0 :         src1 += src1_stride;
     752             :     }
     753           0 : }
     754             : 
     755           0 : static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
     756             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     757             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     758             :     const uint8_t *mask, uint32_t mask_stride, int h, int w,
     759             :     const __m256i *round_offset, int shift)
     760             : {
     761           0 :     const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     762           0 :     const __m128i zeros = _mm_setzero_si128();
     763           0 :     for (int i = 0; i < h; ++i) {
     764           0 :         for (int j = 0; j < w; j += 16) {
     765           0 :             const __m128i m_i00 = xx_loadu_128(mask + j);
     766           0 :             const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
     767             : 
     768           0 :             const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
     769           0 :             const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
     770             : 
     771           0 :             blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
     772             :                 round_offset, &v_maxval, shift);
     773             :         }
     774           0 :         mask += mask_stride << 1;
     775           0 :         dst += dst_stride;
     776           0 :         src0 += src0_stride;
     777           0 :         src1 += src1_stride;
     778             :     }
     779           0 : }
     780             : 
     781           0 : static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
     782             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     783             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     784             :     const uint8_t *mask, uint32_t mask_stride, int h, int w,
     785             :     const __m256i *round_offset, int shift)
     786             : {
     787           0 :     const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     788           0 :     const __m256i zeros = _mm256_setzero_si256();
     789           0 :     for (int i = 0; i < h; ++i) {
     790           0 :         for (int j = 0; j < w; j += 32) {
     791           0 :             const __m256i m_i00 = yy_loadu_256(mask + j);
     792           0 :             const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
     793             : 
     794             :             const __m256i m_ac =
     795           0 :                 _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
     796           0 :             const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
     797           0 :             const __m256i m1 =
     798           0 :                 _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
     799             : 
     800           0 :             blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
     801             :                 round_offset, &v_maxval, shift);
     802             :         }
     803           0 :         mask += mask_stride << 1;
     804           0 :         dst += dst_stride;
     805           0 :         src0 += src0_stride;
     806           0 :         src1 += src1_stride;
     807             :     }
     808           0 : }
     809             : 
     810    39849000 : void aom_lowbd_blend_a64_d16_mask_avx2(
     811             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     812             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     813             :     const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
     814             :     ConvolveParams *conv_params)
     815             : {
     816    39849000 :     const int bd = 8;
     817    39849000 :     const int round_bits =
     818    39849000 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
     819             : 
     820    39849000 :     const int round_offset =
     821    39849000 :         ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
     822    39849000 :         (1 << (round_bits - 1)))
     823             :         << AOM_BLEND_A64_ROUND_BITS;
     824             : 
     825    39849000 :     const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
     826    39849000 :     assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
     827    39849000 :     assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
     828             : 
     829    39849000 :     assert(h >= 4);
     830    39849000 :     assert(w >= 4);
     831    39849000 :     assert(IS_POWER_OF_TWO(h));
     832    39849000 :     assert(IS_POWER_OF_TWO(w));
     833    39849000 :     const __m128i v_round_offset = _mm_set1_epi32(round_offset);
     834    39849000 :     const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
     835             : 
     836    39849000 :     if (subw == 0 && subh == 0) {
     837    39438800 :         switch (w) {
     838           0 :         case 4:
     839           0 :             aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
     840             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     841             :                 mask_stride, h, &v_round_offset, shift);
     842           0 :             break;
     843    17434500 :         case 8:
     844    17434500 :             aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
     845             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     846             :                 mask_stride, h, &v_round_offset, shift);
     847    17435200 :             break;
     848    13568300 :         case 16:
     849    13568300 :             lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
     850             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     851             :                 mask_stride, h, &y_round_offset, shift);
     852    13569100 :             break;
     853     8436110 :         default:
     854     8436110 :             lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
     855             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     856             :                 mask_stride, h, w, &y_round_offset, shift);
     857     8458210 :             break;
     858             :         }
     859             :     }
     860      410126 :     else if (subw == 1 && subh == 1) {
     861      420727 :         switch (w) {
     862      112766 :         case 4:
     863      112766 :             aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
     864             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     865             :                 mask_stride, h, &v_round_offset, shift);
     866      112766 :             break;
     867      147856 :         case 8:
     868      147856 :             aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
     869             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     870             :                 mask_stride, h, &v_round_offset, shift);
     871      147856 :             break;
     872      118618 :         case 16:
     873      118618 :             lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
     874             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     875             :                 mask_stride, h, &y_round_offset, shift);
     876      118618 :             break;
     877       41487 :         default:
     878       41487 :             lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
     879             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     880             :                 mask_stride, h, w, &y_round_offset, shift);
     881       41488 :             break;
     882             :         }
     883             :     }
     884           0 :     else if (subw == 1 && subh == 0) {
     885           0 :         switch (w) {
     886           0 :         case 4:
     887           0 :             aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
     888             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     889             :                 mask_stride, h, &v_round_offset, shift);
     890           0 :             break;
     891           0 :         case 8:
     892           0 :             aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
     893             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     894             :                 mask_stride, h, &v_round_offset, shift);
     895           0 :             break;
     896           0 :         case 16:
     897           0 :             lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
     898             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     899             :                 mask_stride, h, w, &y_round_offset, shift);
     900           0 :             break;
     901           0 :         default:
     902           0 :             lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
     903             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     904             :                 mask_stride, h, w, &y_round_offset, shift);
     905           0 :             break;
     906             :         }
     907             :     }
     908             :     else {
     909           0 :         switch (w) {
     910           0 :         case 4:
     911           0 :             aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
     912             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     913             :                 mask_stride, h, &v_round_offset, shift);
     914           0 :             break;
     915           0 :         case 8:
     916           0 :             aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
     917             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     918             :                 mask_stride, h, &v_round_offset, shift);
     919           0 :             break;
     920           0 :         case 16:
     921           0 :             lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
     922             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     923             :                 mask_stride, h, w, &y_round_offset, shift);
     924           0 :             break;
     925           0 :         default:
     926           0 :             lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
     927             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     928             :                 mask_stride, h, w, &y_round_offset, shift);
     929           0 :             break;
     930             :         }
     931             :     }
     932    39883200 : }
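
For reference, the following is a minimal scalar sketch (not part of this file) of the per-pixel arithmetic that the w4/w8/w16/w32 kernels dispatched above vectorize. It assumes the usual AOM blend constants (AOM_BLEND_A64_MAX_ALPHA == 64, AOM_BLEND_A64_ROUND_BITS == 6) and a round_offset/round_bits pair computed as in the caller; the helper name is illustrative only.

    #include <stdint.h>

    /* Illustrative scalar model; rounding convention assumed to follow the C reference. */
    static uint8_t blend_a64_d16_pixel_sketch(uint16_t s0, uint16_t s1, uint8_t m,
                                              int32_t round_offset, int round_bits) {
        /* 64-point alpha blend of two pre-rounded (d16) convolve outputs. */
        int32_t res = ((int32_t)m * s0 + (64 - (int32_t)m) * s1) >> 6;
        res -= round_offset;                                 /* remove the convolve offset */
        res = (res + (1 << (round_bits - 1))) >> round_bits; /* round back to pixel precision */
        return (uint8_t)(res < 0 ? 0 : (res > 255 ? 255 : res));
    }

The AVX2 kernels fold the subtraction and both roundings into a single subtract-and-shift per vector, but the per-pixel result they aim for is the one modeled here.
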
     933             : 
     934             : //////////////////////////////////////////////////////////////////////////////
     935             : // aom_highbd_blend_a64_d16_mask_avx2()
     936             : //////////////////////////////////////////////////////////////////////////////
     937             : 
     938           0 : static INLINE void highbd_blend_a64_d16_mask_w4_avx2(
     939             :     uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
     940             :     const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0,
     941             :     const __m256i *round_offset, int shift, const __m256i *clip_low,
     942             :     const __m256i *clip_high, const __m256i *mask_max)
     943             : {
     944             :     // Load 4x u16 pixels from each of 4 rows from each source
     945           0 :     const __m256i s0 = _mm256_set_epi64x(*(uint64_t *)(src0 + 3 * src0_stride),
     946           0 :         *(uint64_t *)(src0 + 2 * src0_stride),
     947           0 :         *(uint64_t *)(src0 + 1 * src0_stride),
     948           0 :         *(uint64_t *)(src0 + 0 * src0_stride));
     949           0 :     const __m256i s1 = _mm256_set_epi64x(*(uint64_t *)(src1 + 3 * src1_stride),
     950           0 :         *(uint64_t *)(src1 + 2 * src1_stride),
     951           0 :         *(uint64_t *)(src1 + 1 * src1_stride),
     952           0 :         *(uint64_t *)(src1 + 0 * src1_stride));
     953             :     // Generate the inverse mask
     954           0 :     const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
     955             : 
     956             :     // Multiply each mask by the respective source
     957           0 :     const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0);
     958           0 :     const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0);
     959           0 :     const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs);
     960           0 :     const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs);
      961             :     // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
      962             :     // lanes. Later, packs does the same again, which cancels this out with no
      963             :     // need for a permute; the intermediate values being reordered makes no difference.
     964             : 
     965           0 :     const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1);
     966           0 :     const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1);
     967           0 :     const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs);
     968           0 :     const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs);
     969             : 
     970           0 :     const __m256i sumh = _mm256_add_epi32(mul0h, mul1h);
     971           0 :     const __m256i suml = _mm256_add_epi32(mul0l, mul1l);
     972             : 
     973             :     const __m256i roundh =
     974           0 :         _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift);
     975             :     const __m256i roundl =
     976           0 :         _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift);
     977             : 
     978           0 :     const __m256i pack = _mm256_packs_epi32(roundl, roundh);
     979             :     const __m256i clip =
     980           0 :         _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high);
     981             : 
      982             :     // _mm256_extract_epi64 isn't available on 32-bit x86 targets, so extract via the 128-bit halves instead:
     983           0 :     const __m128i cliph = _mm256_extracti128_si256(clip, 1);
     984           0 :     xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8));
     985           0 :     xx_storel_64(dst + 2 * dst_stride, cliph);
     986           0 :     const __m128i clipl = _mm256_castsi256_si128(clip);
     987           0 :     xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8));
     988           0 :     xx_storel_64(dst + 0 * dst_stride, clipl);
     989           0 : }
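
A brief aside on the widening-multiply idiom used in this kernel: _mm256_mullo_epi16 produces the low 16 bits of each 16x16-bit product and _mm256_mulhi_epu16 the high 16 bits, and interleaving the two with unpacklo/unpackhi_epi16 reassembles the full unsigned 32-bit products. The scalar sketch below (illustrative name, not from this file) shows the equivalence for a single element.

    #include <stdint.h>

    static uint32_t widening_mul_u16_sketch(uint16_t a, uint16_t b) {
        const uint16_t lo = (uint16_t)((uint32_t)a * b);         /* _mm256_mullo_epi16 */
        const uint16_t hi = (uint16_t)(((uint32_t)a * b) >> 16); /* _mm256_mulhi_epu16 */
        return ((uint32_t)hi << 16) | lo;                        /* unpack{lo,hi}_epi16(lo, hi) */
    }
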
     990             : 
     991           0 : static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
     992             :     uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     993             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     994             :     const uint8_t *mask, uint32_t mask_stride, int h,
     995             :     const __m256i *round_offset, int shift, const __m256i *clip_low,
     996             :     const __m256i *clip_high, const __m256i *mask_max)
     997             : {
     998             :     do {
      999             :         // Load 4x u8 pixels from each of 4 rows of the mask, pad each to u16
    1000           0 :         const __m128i mask08 = _mm_set_epi32(*(uint32_t *)(mask + 3 * mask_stride),
    1001           0 :             *(uint32_t *)(mask + 2 * mask_stride),
    1002           0 :             *(uint32_t *)(mask + 1 * mask_stride),
    1003           0 :             *(uint32_t *)(mask + 0 * mask_stride));
    1004           0 :         const __m256i mask0 = _mm256_cvtepu8_epi16(mask08);
    1005             : 
    1006           0 :         highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
    1007             :             src1_stride, &mask0, round_offset, shift,
    1008             :             clip_low, clip_high, mask_max);
    1009             : 
    1010           0 :         dst += dst_stride * 4;
    1011           0 :         src0 += src0_stride * 4;
    1012           0 :         src1 += src1_stride * 4;
    1013           0 :         mask += mask_stride * 4;
    1014           0 :     } while (h -= 4);
    1015           0 : }
    1016             : 
    1017           0 : static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
    1018             :     uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    1019             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    1020             :     const uint8_t *mask, uint32_t mask_stride, int h,
    1021             :     const __m256i *round_offset, int shift, const __m256i *clip_low,
    1022             :     const __m256i *clip_high, const __m256i *mask_max)
    1023             : {
    1024           0 :     const __m256i one_b = _mm256_set1_epi8(1);
    1025           0 :     const __m256i two_w = _mm256_set1_epi16(2);
    1026             :     do {
     1027             :         // Load 8 u8 pixels from each of 8 rows of the mask,
     1028             :         // (saturating) add rows together, then use madd to sum adjacent pixels.
     1029             :         // Finally, divide each value by 4 (with rounding).
    1030             :         const __m256i m0246 =
    1031           0 :             _mm256_set_epi64x(*(uint64_t *)(mask + 6 * mask_stride),
    1032           0 :                 *(uint64_t *)(mask + 4 * mask_stride),
    1033           0 :                 *(uint64_t *)(mask + 2 * mask_stride),
    1034           0 :                 *(uint64_t *)(mask + 0 * mask_stride));
    1035             :         const __m256i m1357 =
    1036           0 :             _mm256_set_epi64x(*(uint64_t *)(mask + 7 * mask_stride),
    1037           0 :                 *(uint64_t *)(mask + 5 * mask_stride),
    1038           0 :                 *(uint64_t *)(mask + 3 * mask_stride),
    1039           0 :                 *(uint64_t *)(mask + 1 * mask_stride));
    1040           0 :         const __m256i addrows = _mm256_adds_epu8(m0246, m1357);
    1041           0 :         const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b);
    1042           0 :         const __m256i mask0 =
    1043           0 :             _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2);
    1044             : 
    1045           0 :         highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
    1046             :             src1_stride, &mask0, round_offset, shift,
    1047             :             clip_low, clip_high, mask_max);
    1048             : 
    1049           0 :         dst += dst_stride * 4;
    1050           0 :         src0 += src0_stride * 4;
    1051           0 :         src1 += src1_stride * 4;
    1052           0 :         mask += mask_stride * 8;
    1053           0 :     } while (h -= 4);
    1054           0 : }
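
The mask handling above amounts to a rounded 2x2 average of the sub-sampled mask. A scalar sketch (illustrative only) of what the adds_epu8 + maddubs_epi16 + (x + 2) >> 2 sequence computes per output mask value:

    #include <stdint.h>

    /* Saturation in the vector code cannot trigger for valid A64 masks (values <= 64). */
    static uint16_t downsample_mask_2x2_sketch(uint8_t m00, uint8_t m01,
                                               uint8_t m10, uint8_t m11) {
        const uint16_t sum = (uint16_t)m00 + m01 + m10 + m11;
        return (uint16_t)((sum + 2) >> 2);
    }
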
    1055             : 
    1056           0 : static INLINE void highbd_blend_a64_d16_mask_w8_avx2(
    1057             :     uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
    1058             :     const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
    1059             :     const __m256i *mask0b, const __m256i *round_offset, int shift,
    1060             :     const __m256i *clip_low, const __m256i *clip_high,
    1061             :     const __m256i *mask_max)
    1062             : {
    1063             :     // Load 8x u16 pixels from each of 4 rows from each source
    1064             :     const __m256i s0a =
    1065           0 :         yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride);
    1066             :     const __m256i s0b =
    1067           0 :         yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
    1068             :     const __m256i s1a =
    1069           0 :         yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride);
    1070             :     const __m256i s1b =
    1071           0 :         yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
    1072             : 
    1073             :     // Generate inverse masks
    1074           0 :     const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
    1075           0 :     const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
    1076             : 
    1077             :     // Multiply sources by respective masks
    1078           0 :     const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
    1079           0 :     const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
    1080           0 :     const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
    1081           0 :     const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
     1082             :     // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
     1083             :     // lanes. Later, packs does the same again, which cancels this out with no
     1084             :     // need for a permute; the intermediate values being reordered makes no difference.
    1085             : 
    1086           0 :     const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
    1087           0 :     const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
    1088           0 :     const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
    1089           0 :     const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
    1090             : 
    1091           0 :     const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah);
    1092           0 :     const __m256i sumal = _mm256_add_epi32(mul0al, mul1al);
    1093             : 
    1094           0 :     const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
    1095           0 :     const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
    1096           0 :     const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
    1097           0 :     const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
    1098             : 
    1099           0 :     const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
    1100           0 :     const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
    1101           0 :     const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
    1102           0 :     const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
    1103             : 
    1104           0 :     const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh);
    1105           0 :     const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl);
    1106             : 
    1107             :     // Divide down each result, with rounding
    1108             :     const __m256i roundah =
    1109           0 :         _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift);
    1110             :     const __m256i roundal =
    1111           0 :         _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift);
    1112             :     const __m256i roundbh =
    1113           0 :         _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift);
    1114             :     const __m256i roundbl =
    1115           0 :         _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift);
    1116             : 
    1117             :     // Pack each i32 down to an i16 with saturation, then clip to valid range
    1118           0 :     const __m256i packa = _mm256_packs_epi32(roundal, roundah);
    1119             :     const __m256i clipa =
    1120           0 :         _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
    1121           0 :     const __m256i packb = _mm256_packs_epi32(roundbl, roundbh);
    1122             :     const __m256i clipb =
    1123           0 :         _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
    1124             : 
    1125             :     // Store 8x u16 pixels to each of 4 rows in the destination
    1126           0 :     yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa);
    1127           0 :     yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb);
    1128           0 : }
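
The tail of this kernel narrows and clamps each 32-bit result. A scalar sketch of that step (the bd value in any real call comes from the API; 10 below is only an example in the comment):

    #include <stdint.h>

    static uint16_t pack_and_clip_sketch(int32_t v, int bd) {
        if (v > INT16_MAX) v = INT16_MAX;   /* _mm256_packs_epi32 saturates to i16 */
        if (v < INT16_MIN) v = INT16_MIN;
        const int32_t hi = (1 << bd) - 1;   /* *clip_high, e.g. 1023 for bd == 10 */
        if (v < 0) v = 0;                   /* *clip_low */
        if (v > hi) v = hi;
        return (uint16_t)v;
    }
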
    1129             : 
    1130           0 : static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
    1131             :     uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
    1132             :     const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
    1133             :     int mask_stride, int h, const __m256i *round_offset, int shift,
    1134             :     const __m256i *clip_low, const __m256i *clip_high,
    1135             :     const __m256i *mask_max)
    1136             : {
    1137             :     do {
    1138             :         // Load 8x u8 pixels from each of 4 rows in the mask
    1139             :         const __m128i mask0a8 =
    1140           0 :             _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride));
    1141             :         const __m128i mask0b8 =
    1142           0 :             _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride),
    1143           0 :                 *(uint64_t *)(mask + 3 * mask_stride));
    1144           0 :         const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
    1145           0 :         const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
    1146             : 
    1147           0 :         highbd_blend_a64_d16_mask_w8_avx2(
    1148             :             dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
    1149             :             round_offset, shift, clip_low, clip_high, mask_max);
    1150             : 
    1151           0 :         dst += dst_stride * 4;
    1152           0 :         src0 += src0_stride * 4;
    1153           0 :         src1 += src1_stride * 4;
    1154           0 :         mask += mask_stride * 4;
    1155           0 :     } while (h -= 4);
    1156           0 : }
    1157             : 
    1158           0 : static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
    1159             :     uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
    1160             :     const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
    1161             :     int mask_stride, int h, const __m256i *round_offset, int shift,
    1162             :     const __m256i *clip_low, const __m256i *clip_high,
    1163             :     const __m256i *mask_max)
    1164             : {
    1165           0 :     const __m256i one_b = _mm256_set1_epi8(1);
    1166           0 :     const __m256i two_w = _mm256_set1_epi16(2);
    1167             :     do {
     1168             :         // Load 16x u8 pixels from each of 8 rows in the mask,
     1169             :         // (saturating) add rows together, then use madd to sum adjacent pixels.
     1170             :         // Finally, divide each value by 4 (with rounding).
    1171             :         const __m256i m02 =
    1172           0 :             yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride);
    1173             :         const __m256i m13 =
    1174           0 :             yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride);
    1175             :         const __m256i m0123 =
    1176           0 :             _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b);
    1177           0 :         const __m256i mask_0a =
    1178           0 :             _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2);
    1179             :         const __m256i m46 =
    1180           0 :             yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride);
    1181             :         const __m256i m57 =
    1182           0 :             yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride);
    1183             :         const __m256i m4567 =
    1184           0 :             _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b);
    1185           0 :         const __m256i mask_0b =
    1186           0 :             _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2);
    1187             : 
    1188           0 :         highbd_blend_a64_d16_mask_w8_avx2(
    1189             :             dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
    1190             :             &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
    1191             : 
    1192           0 :         dst += dst_stride * 4;
    1193           0 :         src0 += src0_stride * 4;
    1194           0 :         src1 += src1_stride * 4;
    1195           0 :         mask += mask_stride * 8;
    1196           0 :     } while (h -= 4);
    1197           0 : }
    1198             : 
    1199           0 : static INLINE void highbd_blend_a64_d16_mask_w16_avx2(
    1200             :     uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
    1201             :     const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
    1202             :     const __m256i *mask0b, const __m256i *round_offset, int shift,
    1203             :     const __m256i *clip_low, const __m256i *clip_high,
    1204             :     const __m256i *mask_max)
    1205             : {
    1206             :     // Load 16x pixels from each of 2 rows from each source
    1207           0 :     const __m256i s0a = yy_loadu_256(src0);
    1208           0 :     const __m256i s0b = yy_loadu_256(src0 + src0_stride);
    1209           0 :     const __m256i s1a = yy_loadu_256(src1);
    1210           0 :     const __m256i s1b = yy_loadu_256(src1 + src1_stride);
    1211             : 
    1212             :     // Calculate inverse masks
    1213           0 :     const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
    1214           0 :     const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
    1215             : 
    1216             :     // Multiply each source by appropriate mask
    1217           0 :     const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
    1218           0 :     const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
    1219           0 :     const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
    1220           0 :     const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
     1221             :     // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
     1222             :     // lanes. Later, packs does the same again, which cancels this out with no
     1223             :     // need for a permute; the intermediate values being reordered makes no difference.
    1224             : 
    1225           0 :     const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
    1226           0 :     const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
    1227           0 :     const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
    1228           0 :     const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
    1229             : 
    1230           0 :     const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah);
    1231           0 :     const __m256i mulal = _mm256_add_epi32(mul0al, mul1al);
    1232             : 
    1233           0 :     const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
    1234           0 :     const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
    1235           0 :     const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
    1236           0 :     const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
    1237             : 
    1238           0 :     const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
    1239           0 :     const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
    1240           0 :     const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
    1241           0 :     const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
    1242             : 
    1243           0 :     const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh);
    1244           0 :     const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl);
    1245             : 
    1246             :     const __m256i resah =
    1247           0 :         _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift);
    1248             :     const __m256i resal =
    1249           0 :         _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift);
    1250             :     const __m256i resbh =
    1251           0 :         _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift);
    1252             :     const __m256i resbl =
    1253           0 :         _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift);
    1254             : 
    1255             :     // Signed saturating pack from i32 to i16:
    1256           0 :     const __m256i packa = _mm256_packs_epi32(resal, resah);
    1257           0 :     const __m256i packb = _mm256_packs_epi32(resbl, resbh);
    1258             : 
    1259             :     // Clip the values to the valid range
    1260             :     const __m256i clipa =
    1261           0 :         _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
    1262             :     const __m256i clipb =
    1263           0 :         _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
    1264             : 
    1265             :     // Store 16 pixels
    1266           0 :     yy_storeu_256(dst, clipa);
    1267           0 :     yy_storeu_256(dst + dst_stride, clipb);
    1268           0 : }
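
To see why the unpack/packs pairing needs no cross-lane permute (as the comments above note), here is a small standalone test, assuming an AVX2-capable compiler; the values and the final shift are arbitrary and the program is illustrative only.

    #include <stdio.h>
    #include <stdint.h>
    #include <immintrin.h>

    int main(void) {
        uint16_t a[16], b[16], m[16], out[16];
        for (int i = 0; i < 16; i++) {
            a[i] = (uint16_t)(1000 + 37 * i);
            b[i] = (uint16_t)(4000 - 29 * i);
            m[i] = (uint16_t)(4 * i);                     /* mask values 0..60 */
        }
        const __m256i va   = _mm256_loadu_si256((const __m256i *)a);
        const __m256i vb   = _mm256_loadu_si256((const __m256i *)b);
        const __m256i vm   = _mm256_loadu_si256((const __m256i *)m);
        const __m256i vinv = _mm256_sub_epi16(_mm256_set1_epi16(64), vm);

        /* Widen both products to 32 bits via mullo/mulhi + unpack, blend, then narrow. */
        const __m256i p0l = _mm256_unpacklo_epi16(_mm256_mullo_epi16(vm, va), _mm256_mulhi_epu16(vm, va));
        const __m256i p0h = _mm256_unpackhi_epi16(_mm256_mullo_epi16(vm, va), _mm256_mulhi_epu16(vm, va));
        const __m256i p1l = _mm256_unpacklo_epi16(_mm256_mullo_epi16(vinv, vb), _mm256_mulhi_epu16(vinv, vb));
        const __m256i p1h = _mm256_unpackhi_epi16(_mm256_mullo_epi16(vinv, vb), _mm256_mulhi_epu16(vinv, vb));
        const __m256i suml = _mm256_srai_epi32(_mm256_add_epi32(p0l, p1l), 6);
        const __m256i sumh = _mm256_srai_epi32(_mm256_add_epi32(p0h, p1h), 6);
        const __m256i pack = _mm256_packs_epi32(suml, sumh); /* elements come back in input order */

        _mm256_storeu_si256((__m256i *)out, pack);
        int ok = 1;
        for (int i = 0; i < 16; i++) {
            const int32_t ref = ((int32_t)m[i] * a[i] + (int32_t)(64 - m[i]) * b[i]) >> 6;
            ok &= (out[i] == (uint16_t)ref);
        }
        printf("element order preserved: %s\n", ok ? "yes" : "no");
        return ok ? 0 : 1;
    }
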
    1269             : 
    1270           0 : static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
    1271             :     uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
    1272             :     const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
    1273             :     int mask_stride, int h, int w, const __m256i *round_offset, int shift,
    1274             :     const __m256i *clip_low, const __m256i *clip_high,
    1275             :     const __m256i *mask_max)
    1276             : {
    1277           0 :     for (int i = 0; i < h; i += 2) {
    1278           0 :         for (int j = 0; j < w; j += 16) {
    1279             :             // Load 16x u8 alpha-mask values from each of two rows and pad to u16
    1280           0 :             const __m128i masks_a8 = xx_loadu_128(mask + j);
    1281           0 :             const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j);
    1282           0 :             const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8);
    1283           0 :             const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8);
    1284             : 
    1285           0 :             highbd_blend_a64_d16_mask_w16_avx2(
    1286           0 :                 dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
    1287             :                 &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
    1288             :         }
    1289           0 :         dst += dst_stride * 2;
    1290           0 :         src0 += src0_stride * 2;
    1291           0 :         src1 += src1_stride * 2;
    1292           0 :         mask += mask_stride * 2;
    1293             :     }
    1294           0 : }
    1295             : 
    1296           0 : static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
    1297             :     uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
    1298             :     const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
    1299             :     int mask_stride, int h, int w, const __m256i *round_offset, int shift,
    1300             :     const __m256i *clip_low, const __m256i *clip_high,
    1301             :     const __m256i *mask_max)
    1302             : {
    1303           0 :     const __m256i one_b = _mm256_set1_epi8(1);
    1304           0 :     const __m256i two_w = _mm256_set1_epi16(2);
    1305           0 :     for (int i = 0; i < h; i += 2) {
    1306           0 :         for (int j = 0; j < w; j += 16) {
    1307             :             // Load 32x u8 alpha-mask values from each of four rows
    1308             :             // (saturating) add pairs of rows, then use madd to add adjacent values
    1309             :             // Finally, divide down each result with rounding
    1310           0 :             const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j);
    1311           0 :             const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j);
    1312           0 :             const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j);
    1313           0 :             const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j);
    1314             : 
    1315           0 :             const __m256i m01_8 = _mm256_adds_epu8(m0, m1);
    1316           0 :             const __m256i m23_8 = _mm256_adds_epu8(m2, m3);
    1317             : 
    1318           0 :             const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b);
    1319           0 :             const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b);
    1320             : 
    1321           0 :             const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2);
    1322           0 :             const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2);
    1323             : 
    1324           0 :             highbd_blend_a64_d16_mask_w16_avx2(
    1325           0 :                 dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
    1326             :                 &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
    1327             :         }
    1328           0 :         dst += dst_stride * 2;
    1329           0 :         src0 += src0_stride * 2;
    1330           0 :         src1 += src1_stride * 2;
    1331           0 :         mask += mask_stride * 4;
    1332             :     }
    1333           0 : }
    1334             : 
    1335           0 : void aom_highbd_blend_a64_d16_mask_avx2(
    1336             :     uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    1337             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    1338             :     const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
    1339             :     ConvolveParams *conv_params, const int bd)
    1340             : {
     1341           0 :     uint16_t *dst = (uint16_t *)(dst8);  // CONVERT_TO_SHORTPTR(dst8);
    1342           0 :     const int round_bits =
    1343           0 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    1344           0 :     const int32_t round_offset =
    1345           0 :         ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
    1346           0 :         (1 << (round_bits - 1)))
    1347             :         << AOM_BLEND_A64_ROUND_BITS;
    1348           0 :     const __m256i v_round_offset = _mm256_set1_epi32(round_offset);
    1349           0 :     const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
    1350             : 
    1351           0 :     const __m256i clip_low = _mm256_set1_epi16(0);
    1352           0 :     const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1);
    1353           0 :     const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    1354             : 
    1355           0 :     assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
    1356           0 :     assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
    1357             : 
    1358           0 :     assert(h >= 4);
    1359           0 :     assert(w >= 4);
    1360           0 :     assert(IS_POWER_OF_TWO(h));
    1361           0 :     assert(IS_POWER_OF_TWO(w));
    1362             : 
    1363           0 :     if (subw == 0 && subh == 0) {
    1364           0 :         switch (w) {
    1365           0 :         case 4:
    1366           0 :             highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
    1367             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    1368             :                 mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
    1369             :                 &mask_max);
    1370           0 :             break;
    1371           0 :         case 8:
    1372           0 :             highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
    1373             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    1374             :                 mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
    1375             :                 &mask_max);
    1376           0 :             break;
    1377           0 :         default:  // >= 16
    1378           0 :             highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
    1379             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    1380             :                 mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
    1381             :                 &mask_max);
    1382           0 :             break;
    1383             :         }
    1384             : 
    1385             :     }
    1386           0 :     else if (subw == 1 && subh == 1) {
    1387           0 :         switch (w) {
    1388           0 :         case 4:
    1389           0 :             highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
    1390             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    1391             :                 mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
    1392             :                 &mask_max);
    1393           0 :             break;
    1394           0 :         case 8:
    1395           0 :             highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
    1396             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    1397             :                 mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
    1398             :                 &mask_max);
    1399           0 :             break;
    1400           0 :         default:  // >= 16
    1401           0 :             highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
    1402             :                 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    1403             :                 mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
    1404             :                 &mask_max);
    1405           0 :             break;
    1406             :         }
    1407             :     }
    1408             :     else {
    1409             :         // Sub-sampling in only one axis doesn't seem to happen very much, so fall
    1410             :         // back to the vanilla C implementation instead of having all the optimised
    1411             :         // code for these.
    1412           0 :         aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
    1413             :             src1_stride, mask, mask_stride, w, h, subw,
    1414             :             subh, conv_params, bd);
    1415             :     }
    1416           0 : }
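
Finally, a worked sketch of the rounding constants computed at the top of this function. The conv_params rounding values and bit depth below are assumed purely for illustration (they depend on the encoder configuration); FILTER_BITS is the AV1 interpolation-filter precision of 7.

    #include <stdio.h>

    int main(void) {
        /* Local stand-ins for the library constants / parameters (illustrative values only). */
        const int FILTER_BITS = 7, AOM_BLEND_A64_ROUND_BITS = 6;
        const int bd = 10, round_0 = 3, round_1 = 9;   /* assumed example conv_params */

        const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
        const int round_offset =
            ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
             (1 << (round_bits - 1))) << AOM_BLEND_A64_ROUND_BITS;
        const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;

        /* Each 32-bit blended sum is then reduced with a single (sum - round_offset) >> shift,
         * folding the >> AOM_BLEND_A64_ROUND_BITS of the blend and the final convolve
         * rounding into one arithmetic shift. */
        printf("round_bits=%d round_offset=%d shift=%d\n", round_bits, round_offset, shift);
        return 0;
    }
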

Generated by: LCOV version 1.14