LCOV - code coverage report
Current view: top level - ASM_SSE4_1 - EbBlend_a64_mask_sse4.c (source / functions)
Test: coverage.info
Date: 2019-11-25 17:38:06
                  Hit   Total   Coverage
    Lines:        104   553     18.8 %
    Functions:    9     54      16.7 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
        3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : /*
       7             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       8             :  *
       9             :  * This source code is subject to the terms of the BSD 2 Clause License and
      10             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      11             :  * was not distributed with this source code in the LICENSE file, you can
      12             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      13             :  * Media Patent License 1.0 was not distributed with this source code in the
      14             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      15             :  */
      16             : 
      17             : #include <assert.h>
      18             : #include "smmintrin.h"
      19             : 
      20             : #include "EbDefinitions.h"
      21             : 
      22             : #include "EbBlend_sse4.h"
      23             : 
      24             : #include "aom_dsp_rtcd.h"
      25             : 
      26             :  //////////////////////////////////////////////////////////////////////////////
      27             :  // No sub-sampling
      28             :  //////////////////////////////////////////////////////////////////////////////
      29             : 
      30     6631380 : static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
      31             :     const uint8_t *src0, uint32_t src0_stride,
      32             :     const uint8_t *src1, uint32_t src1_stride,
      33             :     const uint8_t *mask, uint32_t mask_stride,
      34             :     int w, int h)
      35             : {
      36             :     (void)w;
      37     6631380 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
      38     6631380 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
      39             :     do {
      40    89012800 :         const __m128i v_m0_b = xx_loadl_32(mask);
      41    89003300 :         const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
      42    89003300 :         const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
      43    89018900 :         xx_storel_32(dst, v_res_b);
      44             : 
      45    89012300 :         dst += dst_stride;
      46    89012300 :         src0 += src0_stride;
      47    89012300 :         src1 += src1_stride;
      48    89012300 :         mask += mask_stride;
      49    89012300 :     } while (--h);
      50     6630880 : }
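Every 8-bit kernel in this file computes the same per-pixel weighted average; the code above vectorizes it four pixels per step through blend_4_u8. As a reference point, a scalar sketch of one w4 row is shown below, assuming the usual libaom constants AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6 (the helper name is illustrative, not part of this file):

#include <stdint.h>

/* Illustrative scalar equivalent of blend_a64_mask_w4_sse4_1 (a sketch). */
static void blend_a64_mask_w4_scalar(uint8_t *dst, uint32_t dst_stride,
    const uint8_t *src0, uint32_t src0_stride,
    const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h)
{
    do {
        for (int c = 0; c < 4; ++c) {
            const int m0 = mask[c];     /* mask weight in [0, 64] */
            const int m1 = 64 - m0;     /* complementary weight   */
            /* rounded weighted average: (m0 * s0 + m1 * s1 + 32) >> 6 */
            dst[c] = (uint8_t)((m0 * src0[c] + m1 * src1[c] + 32) >> 6);
        }
        dst += dst_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        mask += mask_stride;
    } while (--h);
}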
      51             : 
      52     4184930 : static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
      53             :     const uint8_t *src0, uint32_t src0_stride,
      54             :     const uint8_t *src1, uint32_t src1_stride,
      55             :     const uint8_t *mask, uint32_t mask_stride,
      56             :     int w, int h)
      57             : {
      58             :     (void)w;
      59     4184930 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
      60     4184930 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
      61             :     do {
      62    80632500 :         const __m128i v_m0_b = xx_loadl_64(mask);
      63    80624600 :         const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
      64    80624600 :         const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
      65    80647200 :         xx_storel_64(dst, v_res_b);
      66             : 
      67    80632500 :         dst += dst_stride;
      68    80632500 :         src0 += src0_stride;
      69    80632500 :         src1 += src1_stride;
      70    80632500 :         mask += mask_stride;
      71    80632500 :     } while (--h);
      72     4184880 : }
      73             : 
      74     3208660 : static void blend_a64_mask_w16n_sse4_1(
      75             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
      76             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
      77             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
      78             : {
      79     3208660 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
      80     3208660 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
      81             : 
      82             :     do {
      83             :         int c;
      84   158523000 :         for (c = 0; c < w; c += 16) {
      85    89210500 :             const __m128i v_m0_b = xx_loadu_128(mask + c);
      86    89180200 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
      87             : 
      88             :             const __m128i v_res_b =
      89    89180200 :                 blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
      90             : 
      91    89214500 :             xx_storeu_128(dst + c, v_res_b);
      92             :         }
      93    69312600 :         dst += dst_stride;
      94    69312600 :         src0 += src0_stride;
      95    69312600 :         src1 += src1_stride;
      96    69312600 :         mask += mask_stride;
      97    69312600 :     } while (--h);
      98     3211290 : }
      99             : 
     100             : //////////////////////////////////////////////////////////////////////////////
     101             : // Horizontal sub-sampling
     102             : //////////////////////////////////////////////////////////////////////////////
     103             : 
     104           0 : static void blend_a64_mask_sx_w4_sse4_1(
     105             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     106             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     107             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     108             : {
     109             :     (void)w;
     110             : 
     111           0 :     const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
     112           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     113           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     114             :     do {
     115           0 :         const __m128i v_r_b = xx_loadl_64(mask);
     116           0 :         const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
     117           0 :         const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
     118           0 :         const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
     119           0 :         const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
     120           0 :         const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     121             : 
     122           0 :         const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     123           0 :         xx_storel_32(dst, v_res_b);
     124             : 
     125           0 :         dst += dst_stride;
     126           0 :         src0 += src0_stride;
     127           0 :         src1 += src1_stride;
     128           0 :         mask += mask_stride;
     129           0 :     } while (--h);
     130           0 : }
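In the sx (horizontally sub-sampled) kernels the mask is twice as wide as the block, so the effective weight for each column is the rounded average of a horizontally adjacent pair of mask bytes; the shuffle above appears to de-interleave even and odd bytes so that _mm_avg_epu8 can form all the pair averages at once. A scalar sketch of that mask reduction (helper name illustrative):

/* Effective sx mask for block column c; _mm_avg_epu8 rounds half up. */
static inline int sx_mask_at(const uint8_t *mask, int c)
{
    return (mask[2 * c] + mask[2 * c + 1] + 1) >> 1;
}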
     131             : 
     132           0 : static void blend_a64_mask_sx_w8_sse4_1(
     133             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     134             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     135             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     136             : {
     137             :     (void)w;
     138             : 
     139           0 :     const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
     140           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     141           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     142             :     do {
     143           0 :         const __m128i v_r_b = xx_loadu_128(mask);
     144           0 :         const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
     145           0 :         const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
     146           0 :         const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
     147           0 :         const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
     148           0 :         const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     149             : 
     150           0 :         const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     151             : 
     152           0 :         xx_storel_64(dst, v_res_b);
     153             : 
     154           0 :         dst += dst_stride;
     155           0 :         src0 += src0_stride;
     156           0 :         src1 += src1_stride;
     157           0 :         mask += mask_stride;
     158           0 :     } while (--h);
     159           0 : }
     160             : 
     161           0 : static void blend_a64_mask_sx_w16n_sse4_1(
     162             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     163             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     164             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     165             : {
     166           0 :     const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
     167           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     168           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     169             : 
     170             :     do {
     171             :         int c;
     172           0 :         for (c = 0; c < w; c += 16) {
     173           0 :             const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
     174           0 :             const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
     175           0 :             const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
     176           0 :             const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
     177           0 :             const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
     178           0 :             const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
     179           0 :             const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
     180           0 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     181             : 
     182             :             const __m128i v_res_b =
     183           0 :                 blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
     184             : 
     185           0 :             xx_storeu_128(dst + c, v_res_b);
     186             :         }
     187           0 :         dst += dst_stride;
     188           0 :         src0 += src0_stride;
     189           0 :         src1 += src1_stride;
     190           0 :         mask += mask_stride;
     191           0 :     } while (--h);
     192           0 : }
     193             : 
     194             : //////////////////////////////////////////////////////////////////////////////
     195             : // Vertical sub-sampling
     196             : //////////////////////////////////////////////////////////////////////////////
     197             : 
     198           0 : static void blend_a64_mask_sy_w4_sse4_1(
     199             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     200             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     201             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     202             : {
     203             :     (void)w;
     204             : 
     205           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     206           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     207             : 
     208             :     do {
     209           0 :         const __m128i v_ra_b = xx_loadl_32(mask);
     210           0 :         const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
     211           0 :         const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
     212           0 :         const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     213             : 
     214           0 :         const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     215             : 
     216           0 :         xx_storel_32(dst, v_res_b);
     217             : 
     218           0 :         dst += dst_stride;
     219           0 :         src0 += src0_stride;
     220           0 :         src1 += src1_stride;
     221           0 :         mask += 2 * mask_stride;
     222           0 :     } while (--h);
     223           0 : }
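The sy (vertically sub-sampled) kernels consume two mask rows per output row, averaging them with _mm_avg_epu8 and stepping the mask pointer by 2 * mask_stride. A scalar sketch of the effective weight (helper name illustrative):

/* Effective sy mask for block column c: rounded average of two mask rows. */
static inline int sy_mask_at(const uint8_t *mask, uint32_t mask_stride, int c)
{
    return (mask[c] + mask[mask_stride + c] + 1) >> 1;
}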
     224             : 
     225           0 : static void blend_a64_mask_sy_w8_sse4_1(
     226             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     227             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     228             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     229             : {
     230             :     (void)w;
     231             : 
     232           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     233           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     234             :     do {
     235           0 :         const __m128i v_ra_b = xx_loadl_64(mask);
     236           0 :         const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
     237           0 :         const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
     238           0 :         const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     239           0 :         const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     240             : 
     241           0 :         xx_storel_64(dst, v_res_b);
     242             : 
     243           0 :         dst += dst_stride;
     244           0 :         src0 += src0_stride;
     245           0 :         src1 += src1_stride;
     246           0 :         mask += 2 * mask_stride;
     247           0 :     } while (--h);
     248           0 : }
     249             : 
     250           0 : static void blend_a64_mask_sy_w16n_sse4_1(
     251             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     252             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     253             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     254             : {
     255           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     256           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     257             :     do {
     258             :         int c;
     259           0 :         for (c = 0; c < w; c += 16) {
     260           0 :             const __m128i v_ra_b = xx_loadu_128(mask + c);
     261           0 :             const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
     262           0 :             const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
     263           0 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     264             : 
     265             :             const __m128i v_res_b =
     266           0 :                 blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
     267             : 
     268           0 :             xx_storeu_128(dst + c, v_res_b);
     269             :         }
     270           0 :         dst += dst_stride;
     271           0 :         src0 += src0_stride;
     272           0 :         src1 += src1_stride;
     273           0 :         mask += 2 * mask_stride;
     274           0 :     } while (--h);
     275           0 : }
     276             : 
     277             : //////////////////////////////////////////////////////////////////////////////
     278             : // Horizontal and Vertical sub-sampling
     279             : //////////////////////////////////////////////////////////////////////////////
     280             : 
     281           0 : static void blend_a64_mask_sx_sy_w4_sse4_1(
     282             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     283             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     284             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     285             : {
     286           0 :     const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
     287           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     288           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     289             :     (void)w;
     290             : 
     291             :     do {
     292           0 :         const __m128i v_ra_b = xx_loadl_64(mask);
     293           0 :         const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
     294           0 :         const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
     295           0 :         const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
     296           0 :         const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
     297           0 :         const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
     298           0 :         const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
     299           0 :         const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
     300           0 :         const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
     301           0 :         const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     302             : 
     303           0 :         const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     304             : 
     305           0 :         xx_storel_32(dst, v_res_b);
     306             : 
     307           0 :         dst += dst_stride;
     308           0 :         src0 += src0_stride;
     309           0 :         src1 += src1_stride;
     310           0 :         mask += 2 * mask_stride;
     311           0 :     } while (--h);
     312           0 : }
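With both directions sub-sampled, each output weight covers a 2x2 patch of mask samples. The code sums the vertical pair bytewise (safe, since two values of at most AOM_BLEND_A64_MAX_ALPHA = 64 sum to at most 128), widens to 16 bits, adds the horizontal pair, and rounds the four-sample sum by two bits with xx_roundn_epu16. A scalar sketch of that reduction (helper name illustrative):

/* Effective sx_sy mask for block column c: rounded mean of a 2x2 patch. */
static inline int sx_sy_mask_at(const uint8_t *mask, uint32_t mask_stride, int c)
{
    const int sum = mask[2 * c] + mask[2 * c + 1] +
                    mask[mask_stride + 2 * c] + mask[mask_stride + 2 * c + 1];
    return (sum + 2) >> 2;
}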
     313             : 
     314           0 : static void blend_a64_mask_sx_sy_w8_sse4_1(
     315             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     316             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     317             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     318             : {
     319           0 :     const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
     320           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     321           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     322             :     (void)w;
     323             : 
     324             :     do {
     325           0 :         const __m128i v_ra_b = xx_loadu_128(mask);
     326           0 :         const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
     327             : 
     328           0 :         const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
     329           0 :         const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
     330           0 :         const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
     331           0 :         const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
     332           0 :         const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
     333           0 :         const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
     334           0 :         const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
     335           0 :         const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     336             : 
     337           0 :         const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     338             : 
     339           0 :         xx_storel_64(dst, v_res_b);
     340             : 
     341           0 :         dst += dst_stride;
     342           0 :         src0 += src0_stride;
     343           0 :         src1 += src1_stride;
     344           0 :         mask += 2 * mask_stride;
     345           0 :     } while (--h);
     346           0 : }
     347             : 
     348           0 : static void blend_a64_mask_sx_sy_w16n_sse4_1(
     349             :     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     350             :     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     351             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     352             : {
     353           0 :     const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
     354             :         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
     355           0 :     const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
     356           0 :     const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
     357             :     do {
     358             :         int c;
     359           0 :         for (c = 0; c < w; c += 16) {
     360           0 :             const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
     361           0 :             const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
     362           0 :             const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
     363           0 :             const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
     364           0 :             const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
     365           0 :             const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
     366           0 :             const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
     367           0 :             const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
     368             :             const __m128i v_rvsbl_w =
     369           0 :                 _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
     370             :             const __m128i v_rvsbh_w =
     371           0 :                 _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
     372           0 :             const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
     373           0 :             const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
     374             : 
     375           0 :             const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
     376           0 :             const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
     377           0 :             const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
     378           0 :             const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
     379             : 
     380             :             const __m128i v_res_b =
     381           0 :                 blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
     382             : 
     383           0 :             xx_storeu_128(dst + c, v_res_b);
     384             :         }
     385           0 :         dst += dst_stride;
     386           0 :         src0 += src0_stride;
     387           0 :         src1 += src1_stride;
     388           0 :         mask += 2 * mask_stride;
     389           0 :     } while (--h);
     390           0 : }
     391             : 
     392             : //////////////////////////////////////////////////////////////////////////////
     393             : // Dispatch
     394             : //////////////////////////////////////////////////////////////////////////////
     395             : 
     396    14317000 : void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
     397             :     const uint8_t *src0, uint32_t src0_stride,
     398             :     const uint8_t *src1, uint32_t src1_stride,
     399             :     const uint8_t *mask, uint32_t mask_stride, int w,
     400             :     int h, int subx, int suby)
     401             : {
     402             :     typedef void(*blend_fn)(
     403             :         uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
     404             :         uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     405             :         const uint8_t *mask, uint32_t mask_stride, int w, int h);
     406             : 
     407             :     // Dimensions are: width_index X subx X suby
     408             :     static const blend_fn blend[3][2][2] = {
     409             :       { // w % 16 == 0
     410             :         { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
     411             :         { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
     412             :       { // w == 4
     413             :         { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
     414             :         { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
     415             :       { // w == 8
     416             :         { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
     417             :         { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
     418             :     };
     419             : 
     420    14317000 :     assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
     421    14317000 :     assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
     422             : 
     423    14317000 :     assert(h >= 1);
     424    14317000 :     assert(w >= 1);
     425    14317000 :     assert(IS_POWER_OF_TWO(h));
     426    14317000 :     assert(IS_POWER_OF_TWO(w));
     427             : 
     428    14317000 :     if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     429      295896 :         aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
     430             :             mask, mask_stride, w, h, subx, suby);
     431             :     }
     432             :     else {
     433    14021100 :         blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
     434             :             src0_stride, src1, src1_stride,
     435             :             mask, mask_stride, w, h);
     436             :     }
     437    14317200 : }
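The dispatch leans on w and h being powers of two: (h | w) & 3 is nonzero exactly when w or h is 1 or 2, which falls back to the C reference, and (w >> 2) & 3 maps the remaining widths onto the table rows. Worked out (illustrative comment, not part of the file):

/* (w >> 2) & 3 for power-of-two w >= 4:
 *   w = 4               -> 1   // "w == 4" row
 *   w = 8               -> 2   // "w == 8" row
 *   w = 16, 32, 64, ... -> 0   // "w % 16 == 0" row
 * (index 3 is unreachable for power-of-two widths) */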
     438             : 
     439             : //////////////////////////////////////////////////////////////////////////////
     440             : // No sub-sampling
     441             : //////////////////////////////////////////////////////////////////////////////
     442             : 
     443           0 : static INLINE void blend_a64_mask_bn_w4_sse4_1(
     444             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     445             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     446             :     const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend)
     447             : {
     448           0 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     449             : 
     450             :     do {
     451           0 :         const __m128i v_m0_b = xx_loadl_32(mask);
     452           0 :         const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
     453           0 :         const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     454             : 
     455           0 :         const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
     456             : 
     457           0 :         xx_storel_64(dst, v_res_w);
     458             : 
     459           0 :         dst += dst_stride;
     460           0 :         src0 += src0_stride;
     461           0 :         src1 += src1_stride;
     462           0 :         mask += mask_stride;
     463           0 :     } while (--h);
     464           0 : }
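The high-bit-depth kernels below share this INLINE worker and are specialized through the blend_unit_fn pointer (blend_4_b10 or blend_4_b12); once the worker is inlined into each thin wrapper, the indirect call can be folded away. Per pixel they compute the same weighted average as the 8-bit path, only on 16-bit samples; a scalar sketch, assuming AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6 (helper name illustrative):

/* Illustrative scalar form of the high-bit-depth blend (a sketch). */
static inline uint16_t blend_px_hbd(uint16_t s0, uint16_t s1, int m0)
{
    const int m1 = 64 - m0;
    return (uint16_t)((m0 * s0 + m1 * s1 + 32) >> 6);
}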
     465             : 
     466           0 : static void blend_a64_mask_b10_w4_sse4_1(
     467             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     468             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     469             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     470             : {
     471             :     (void)w;
     472           0 :     blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     473             :         src1_stride, mask, mask_stride, h, blend_4_b10);
     474           0 : }
     475             : 
     476           0 : static void blend_a64_mask_b12_w4_sse4_1(
     477             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     478             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     479             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     480             : {
     481             :     (void)w;
     482           0 :     blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     483             :         src1_stride, mask, mask_stride, h, blend_4_b12);
     484           0 : }
     485             : 
     486           0 : static INLINE void blend_a64_mask_bn_w8n_sse4_1(
     487             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     488             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     489             :     const uint8_t *mask, uint32_t mask_stride, int w, int h,
     490             :     blend_unit_fn blend)
     491             : {
     492           0 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     493             : 
     494             :     do {
     495             :         int c;
     496           0 :         for (c = 0; c < w; c += 8) {
     497           0 :             const __m128i v_m0_b = xx_loadl_64(mask + c);
     498           0 :             const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
     499           0 :             const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     500             : 
     501           0 :             const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
     502             : 
     503           0 :             xx_storeu_128(dst + c, v_res_w);
     504             :         }
     505           0 :         dst += dst_stride;
     506           0 :         src0 += src0_stride;
     507           0 :         src1 += src1_stride;
     508           0 :         mask += mask_stride;
     509           0 :     } while (--h);
     510           0 : }
     511             : 
     512           0 : static void blend_a64_mask_b10_w8n_sse4_1(
     513             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     514             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     515             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     516             : {
     517           0 :     blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     518             :         src1_stride, mask, mask_stride, w, h,
     519             :         blend_8_b10);
     520           0 : }
     521             : 
     522           0 : static void blend_a64_mask_b12_w8n_sse4_1(
     523             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     524             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     525             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     526             : {
     527           0 :     blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     528             :         src1_stride, mask, mask_stride, w, h,
     529             :         blend_8_b12);
     530           0 : }
     531             : 
     532             : //////////////////////////////////////////////////////////////////////////////
     533             : // Horizontal sub-sampling
     534             : //////////////////////////////////////////////////////////////////////////////
     535             : 
     536           0 : static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
     537             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     538             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     539             :     const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend)
     540             : {
     541           0 :     const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
     542             :         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
     543           0 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     544             : 
     545             :     do {
     546           0 :         const __m128i v_r_b = xx_loadl_64(mask);
     547           0 :         const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
     548             : 
     549           0 :         const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
     550           0 :         const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     551             : 
     552           0 :         const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
     553             : 
     554           0 :         xx_storel_64(dst, v_res_w);
     555             : 
     556           0 :         dst += dst_stride;
     557           0 :         src0 += src0_stride;
     558           0 :         src1 += src1_stride;
     559           0 :         mask += mask_stride;
     560           0 :     } while (--h);
     561           0 : }
     562             : 
     563           0 : static void blend_a64_mask_b10_sx_w4_sse4_1(
     564             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     565             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     566             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     567             : {
     568             :     (void)w;
     569           0 :     blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     570             :         src1_stride, mask, mask_stride, h,
     571             :         blend_4_b10);
     572           0 : }
     573             : 
     574           0 : static void blend_a64_mask_b12_sx_w4_sse4_1(
     575             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     576             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     577             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     578             : {
     579             :     (void)w;
     580           0 :     blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     581             :         src1_stride, mask, mask_stride, h,
     582             :         blend_4_b12);
     583           0 : }
     584             : 
     585           0 : static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
     586             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     587             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     588             :     const uint8_t *mask, uint32_t mask_stride, int w, int h,
     589             :     blend_unit_fn blend)
     590             : {
     591           0 :     const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
     592             :         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
     593           0 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     594             : 
     595             :     do {
     596             :         int c;
     597           0 :         for (c = 0; c < w; c += 8) {
     598           0 :             const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
     599           0 :             const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
     600             : 
     601           0 :             const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
     602           0 :             const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     603             : 
     604           0 :             const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
     605             : 
     606           0 :             xx_storeu_128(dst + c, v_res_w);
     607             :         }
     608           0 :         dst += dst_stride;
     609           0 :         src0 += src0_stride;
     610           0 :         src1 += src1_stride;
     611           0 :         mask += mask_stride;
     612           0 :     } while (--h);
     613           0 : }
     614             : 
     615           0 : static void blend_a64_mask_b10_sx_w8n_sse4_1(
     616             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     617             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     618             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     619             : {
     620           0 :     blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     621             :         src1_stride, mask, mask_stride, w, h,
     622             :         blend_8_b10);
     623           0 : }
     624             : 
     625           0 : static void blend_a64_mask_b12_sx_w8n_sse4_1(
     626             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     627             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     628             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     629             : {
     630           0 :     blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     631             :         src1_stride, mask, mask_stride, w, h,
     632             :         blend_8_b12);
     633           0 : }
     634             : 
     635             : //////////////////////////////////////////////////////////////////////////////
     636             : // Vertical sub-sampling
     637             : //////////////////////////////////////////////////////////////////////////////
     638             : 
     639           0 : static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
     640             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     641             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     642             :     const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend)
     643             : {
     644           0 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     645             : 
     646             :     do {
     647           0 :         const __m128i v_ra_b = xx_loadl_32(mask);
     648           0 :         const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
     649           0 :         const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
     650             : 
     651           0 :         const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
     652           0 :         const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     653             : 
     654           0 :         const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
     655             : 
     656           0 :         xx_storel_64(dst, v_res_w);
     657             : 
     658           0 :         dst += dst_stride;
     659           0 :         src0 += src0_stride;
     660           0 :         src1 += src1_stride;
     661           0 :         mask += 2 * mask_stride;
     662           0 :     } while (--h);
     663           0 : }
     664             : 
     665           0 : static void blend_a64_mask_b10_sy_w4_sse4_1(
     666             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     667             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     668             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     669             : {
     670             :     (void)w;
     671           0 :     blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     672             :         src1_stride, mask, mask_stride, h,
     673             :         blend_4_b10);
     674           0 : }
     675             : 
     676           0 : static void blend_a64_mask_b12_sy_w4_sse4_1(
     677             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     678             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     679             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     680             : {
     681             :     (void)w;
     682           0 :     blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     683             :         src1_stride, mask, mask_stride, h,
     684             :         blend_4_b12);
     685           0 : }
     686             : 
     687           0 : static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
     688             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     689             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     690             :     const uint8_t *mask, uint32_t mask_stride, int w, int h,
     691             :     blend_unit_fn blend)
     692             : {
     693           0 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     694             : 
     695             :     do {
     696             :         int c;
     697           0 :         for (c = 0; c < w; c += 8) {
     698           0 :             const __m128i v_ra_b = xx_loadl_64(mask + c);
     699           0 :             const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
     700           0 :             const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
     701             : 
     702           0 :             const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
     703           0 :             const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     704             : 
     705           0 :             const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
     706             : 
     707           0 :             xx_storeu_128(dst + c, v_res_w);
     708             :         }
     709           0 :         dst += dst_stride;
     710           0 :         src0 += src0_stride;
     711           0 :         src1 += src1_stride;
     712           0 :         mask += 2 * mask_stride;
     713           0 :     } while (--h);
     714           0 : }
     715             : 
     716           0 : static void blend_a64_mask_b10_sy_w8n_sse4_1(
     717             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     718             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     719             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     720             : {
     721           0 :     blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     722             :         src1_stride, mask, mask_stride, w, h,
     723             :         blend_8_b10);
     724           0 : }
     725             : 
     726           0 : static void blend_a64_mask_b12_sy_w8n_sse4_1(
     727             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     728             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     729             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     730             : {
     731           0 :     blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     732             :         src1_stride, mask, mask_stride, w, h,
     733             :         blend_8_b12);
     734           0 : }
     735             : 
     736             : //////////////////////////////////////////////////////////////////////////////
     737             : // Horizontal and Vertical sub-sampling
     738             : //////////////////////////////////////////////////////////////////////////////
     739             : 
     740           0 : static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
     741             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     742             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     743             :     const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend)
     744             : {
     745           0 :     const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
     746             :         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
     747           0 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     748             : 
     749             :     do {
     750           0 :         const __m128i v_ra_b = xx_loadl_64(mask);
     751           0 :         const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
     752           0 :         const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
     753           0 :         const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
     754             :         const __m128i v_rvsb_w =
     755           0 :             _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
     756           0 :         const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
     757             : 
     758           0 :         const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
     759           0 :         const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     760             : 
     761           0 :         const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
     762             : 
     763           0 :         xx_storel_64(dst, v_res_w);
     764             : 
     765           0 :         dst += dst_stride;
     766           0 :         src0 += src0_stride;
     767           0 :         src1 += src1_stride;
     768           0 :         mask += 2 * mask_stride;
     769           0 :     } while (--h);
     770           0 : }
     771             : 
     772           0 : static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
     773             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     774             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     775             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     776             : {
     777             :     (void)w;
     778           0 :     blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     779             :         src1_stride, mask, mask_stride, h,
     780             :         blend_4_b10);
     781           0 : }
     782             : 
     783           0 : static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
     784             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     785             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     786             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     787             : {
     788             :     (void)w;
     789           0 :     blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     790             :         src1_stride, mask, mask_stride, h,
     791             :         blend_4_b12);
     792           0 : }
     793             : 
     794           0 : static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
     795             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     796             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     797             :     const uint8_t *mask, uint32_t mask_stride, int w, int h,
     798             :     blend_unit_fn blend)
     799             : {
     800           0 :     const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
     801             :         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
     802           0 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     803             : 
     804             :     do {
     805             :         int c;
     806           0 :         for (c = 0; c < w; c += 8) {
     807           0 :             const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
     808           0 :             const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
     809           0 :             const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
     810           0 :             const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
     811             :             const __m128i v_rvsb_w =
     812           0 :                 _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
     813           0 :             const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
     814             : 
     815           0 :             const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
     816           0 :             const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     817             : 
     818           0 :             const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
     819             : 
     820           0 :             xx_storeu_128(dst + c, v_res_w);
     821             :         }
     822           0 :         dst += dst_stride;
     823           0 :         src0 += src0_stride;
     824           0 :         src1 += src1_stride;
     825           0 :         mask += 2 * mask_stride;
     826           0 :     } while (--h);
     827           0 : }
     828             : 
     829           0 : static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
     830             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     831             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     832             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     833             : {
     834           0 :     blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     835             :         src1_stride, mask, mask_stride, w, h,
     836             :         blend_8_b10);
     837           0 : }
     838             : 
     839           0 : static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
     840             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     841             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     842             :     const uint8_t *mask, uint32_t mask_stride, int w, int h)
     843             : {
     844           0 :     blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
     845             :         src1_stride, mask, mask_stride, w, h,
     846             :         blend_8_b12);
     847           0 : }
     848             : 
     849             : //////////////////////////////////////////////////////////////////////////////
     850             : // Dispatch
     851             : //////////////////////////////////////////////////////////////////////////////
     852             : 
     853           0 : void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
     854             :     const uint8_t *src0_8,
     855             :     uint32_t src0_stride,
     856             :     const uint8_t *src1_8,
     857             :     uint32_t src1_stride, const uint8_t *mask,
     858             :     uint32_t mask_stride, int w, int h,
     859             :     int subx, int suby, int bd)
     860             : {
     861             :     typedef void(*blend_fn)(
     862             :         uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
     863             :         uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
     864             :         const uint8_t *mask, uint32_t mask_stride, int w, int h);
     865             : 
     866             :     // Dimensions are: bd_index X width_index X subx X suby
     867             :     static const blend_fn blend[2][2][2][2] = {
     868             :       {   // bd == 8 or 10
     869             :         { // w % 8 == 0
     870             :           { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
     871             :           { blend_a64_mask_b10_sx_w8n_sse4_1,
     872             :             blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
     873             :         { // w == 4
     874             :           { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
     875             :           { blend_a64_mask_b10_sx_w4_sse4_1,
     876             :             blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
     877             :       {   // bd == 12
     878             :         { // w % 8 == 0
     879             :           { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
     880             :           { blend_a64_mask_b12_sx_w8n_sse4_1,
     881             :             blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
     882             :         { // w == 4
     883             :           { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
     884             :           { blend_a64_mask_b12_sx_w4_sse4_1,
     885             :             blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
     886             :     };
     887             : 
     888           0 :     assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
     889           0 :     assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
     890             : 
     891           0 :     assert(h >= 1);
     892           0 :     assert(w >= 1);
     893           0 :     assert(IS_POWER_OF_TWO(h));
     894           0 :     assert(IS_POWER_OF_TWO(w));
     895             : 
     896           0 :     assert(bd == 8 || bd == 10 || bd == 12);
     897           0 :     if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     898           0 :         aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
     899             :             src1_stride, mask, mask_stride, w, h, subx,
     900             :             suby, bd);
     901             :     }
     902             :     else {
     903           0 :         uint16_t *const dst = (uint16_t *)dst_8;
     904           0 :         const uint16_t *const src0 = (uint16_t *)src0_8;
     905           0 :         const uint16_t *const src1 = (uint16_t *)src1_8;
     906             : 
     907           0 :         blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
     908             :             dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
     909             :             mask_stride, w, h);
     910             :     }
     911           0 : }
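The high-bit-depth dispatch mirrors the 8-bit one: bd == 12 selects the b12 kernels, while 8- and 10-bit content shares the b10 kernels (presumably because 64 times a 10-bit sample still fits in the 16-bit lanes), and (w >> 2) & 1 distinguishes the two width rows, again assuming power-of-two block sizes. Worked out (illustrative comment):

/* (w >> 2) & 1 for power-of-two w >= 4:
 *   w = 4               -> 1   // "w == 4" row
 *   w = 8, 16, 32, ...  -> 0   // "w % 8 == 0" row */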
     912             : 
      913             : /* Vertical mask related blend functions */
     914       73094 : static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
     915             :     const uint8_t *src0, uint32_t src0_stride,
     916             :     const uint8_t *src1, uint32_t src1_stride,
     917             :     const uint8_t *mask, int w, int h)
     918             : {
     919       73094 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     920             : 
     921             :     (void)w;
     922             : 
     923             :     do {
     924      540062 :         const __m128i v_m0_w = _mm_set1_epi16(*mask);
     925      540062 :         const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     926             : 
     927      540062 :         const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
     928             : 
     929      540063 :         const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
     930             : 
     931      540063 :         xx_storel_32(dst, v_res_b);
     932             : 
     933      540063 :         dst += dst_stride;
     934      540063 :         src0 += src0_stride;
     935      540063 :         src1 += src1_stride;
     936      540063 :         mask += 1;
     937      540063 :     } while (--h);
     938       73095 : }
     939             : 
     940     6134670 : static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
     941             :     const uint8_t *src0, uint32_t src0_stride,
     942             :     const uint8_t *src1, uint32_t src1_stride,
     943             :     const uint8_t *mask, int w, int h)
     944             : {
     945     6134670 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     946             : 
     947             :     (void)w;
     948             : 
     949             :     do {
     950    46285100 :         const __m128i v_m0_w = _mm_set1_epi16(*mask);
     951    46285100 :         const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     952             : 
     953    46285100 :         const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
     954             : 
     955    46280700 :         const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
     956             : 
     957    46280700 :         xx_storel_64(dst, v_res_b);
     958             : 
     959    46278600 :         dst += dst_stride;
     960    46278600 :         src0 += src0_stride;
     961    46278600 :         src1 += src1_stride;
     962    46278600 :         mask += 1;
     963    46278600 :     } while (--h);
     964     6128140 : }
     965             : 
     966     7123800 : static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
     967             :     const uint8_t *src0,
     968             :     uint32_t src0_stride,
     969             :     const uint8_t *src1,
     970             :     uint32_t src1_stride,
     971             :     const uint8_t *mask, int w, int h)
     972             : {
     973     7123800 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     974             : 
     975             :     do {
     976             :         int c;
     977    74972400 :         const __m128i v_m0_w = _mm_set1_epi16(*mask);
     978    74972400 :         const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     979   198830000 :         for (c = 0; c < w; c += 16) {
     980   123929000 :             const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
     981             :             const __m128i v_resh_w =
     982   123946000 :                 blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
     983             : 
     984   123867000 :             const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
     985             : 
     986   123867000 :             xx_storeu_128(dst + c, v_res_b);
     987             :         }
     988    74900300 :         dst += dst_stride;
     989    74900300 :         src0 += src0_stride;
     990    74900300 :         src1 += src1_stride;
     991    74900300 :         mask += 1;
     992    74900300 :     } while (--h);
     993     7051650 : }
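All three vertical-mask kernels above vectorize the same per-pixel computation. Below is a scalar reference, assuming the usual aom_dsp constants AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6 (the helper name blend_a64_vmask_ref is illustrative only, not part of this file).

#include <stdint.h>

/* Scalar reference for the blend computed by the SSE4.1 kernels above,
 * assuming AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6.
 * A vertical mask contributes a single weight m in [0, 64] per row. */
static void blend_a64_vmask_ref(uint8_t *dst, uint32_t dst_stride,
                                const uint8_t *src0, uint32_t src0_stride,
                                const uint8_t *src1, uint32_t src1_stride,
                                const uint8_t *mask, int w, int h) {
    for (int r = 0; r < h; ++r) {
        const int m0 = mask[r];  /* weight for src0 on this row   */
        const int m1 = 64 - m0;  /* complementary weight for src1 */
        for (int c = 0; c < w; ++c) {
            /* rounded fixed-point average: (m0*p0 + m1*p1 + 32) >> 6 */
            dst[r * dst_stride + c] = (uint8_t)(
                (m0 * src0[r * src0_stride + c] +
                 m1 * src1[r * src1_stride + c] + 32) >> 6);
        }
    }
}

int main(void) {
    uint8_t d[4], s0[4] = { 200, 200, 200, 200 }, s1[4] = { 40, 40, 40, 40 };
    const uint8_t m[1] = { 16 };  /* 16/64 of src0, 48/64 of src1 */
    blend_a64_vmask_ref(d, 4, s0, 4, s1, 4, m, 4, 1);
    /* each output is (16*200 + 48*40 + 32) >> 6 = 80 */
    return d[0] == 80 ? 0 : 1;
}

The only difference between the w4, w8 and w16n variants is how many of these per-row multiplies are packed into each 128-bit register.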
     994             : 
     995             : //////////////////////////////////////////////////////////////////////////////
     996             : // Dispatch
     997             : //////////////////////////////////////////////////////////////////////////////
     998             : 
     999    13328400 : void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
    1000             :     const uint8_t *src0, uint32_t src0_stride,
    1001             :     const uint8_t *src1, uint32_t src1_stride,
    1002             :     const uint8_t *mask, int w, int h)
    1003             : {
    1004             : 
     1005             :     typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
    1006             :         const uint8_t *src0, uint32_t src0_stride,
    1007             :         const uint8_t *src1, uint32_t src1_stride,
    1008             :         const uint8_t *mask, int w, int h);
    1009             : 
    1010             :     // Dimension: width_index
    1011             :     static const blend_fn blend[9] = {
    1012             :       blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
    1013             :       aom_blend_a64_vmask_c,        // w == 1
    1014             :       aom_blend_a64_vmask_c,        // w == 2
    1015             :       NULL,                         // INVALID
    1016             :       blend_a64_vmask_w4_sse4_1,    // w == 4
    1017             :       NULL,                         // INVALID
    1018             :       NULL,                         // INVALID
    1019             :       NULL,                         // INVALID
    1020             :       blend_a64_vmask_w8_sse4_1,    // w == 8
    1021             :     };
    1022             : 
    1023    13328400 :     assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
    1024    13328400 :     assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
    1025             : 
    1026    13328400 :     assert(h >= 1);
    1027    13328400 :     assert(w >= 1);
    1028    13328400 :     assert(IS_POWER_OF_TWO(h));
    1029    13328400 :     assert(IS_POWER_OF_TWO(w));
    1030             : 
    1031    13328400 :     blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
    1032             :         h);
    1033    13328500 : }
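Because w is asserted to be a power of two, w & 0xf can only reach slots 0, 1, 2, 4 and 8 of the nine-entry table above; the NULL entries are genuinely unreachable. A small illustrative sketch (vmask_kernel_name is hypothetical) that reports the mapping instead of calling through it:

#include <stdio.h>

/* Hypothetical illustration of the w & 0xf indexing used by the dispatcher
 * above; for power-of-two widths the only reachable slots are 0 (w % 16 == 0),
 * 1 and 2 (C fallback), 4 and 8. */
static const char *vmask_kernel_name(int w) {
    switch (w & 0xf) {
        case 0:  return "blend_a64_vmask_w16n_sse4_1";  /* 16, 32, 64, ... */
        case 1:                                         /* w == 1 */
        case 2:  return "aom_blend_a64_vmask_c";        /* w == 2 */
        case 4:  return "blend_a64_vmask_w4_sse4_1";
        case 8:  return "blend_a64_vmask_w8_sse4_1";
        default: return "INVALID";  /* unreachable for power-of-two w */
    }
}

int main(void) {
    const int widths[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
    for (unsigned i = 0; i < sizeof(widths) / sizeof(widths[0]); ++i)
        printf("w=%3d -> %s\n", widths[i], vmask_kernel_name(widths[i]));
    return 0;
}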
    1034             : 
    1035             : //////////////////////////////////////////////////////////////////////////////
    1036             : // Implementation - No sub-sampling
    1037             : //////////////////////////////////////////////////////////////////////////////
    1038             : 
    1039           0 : static INLINE void blend_a64_vmask_bn_w4_sse4_1(
    1040             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    1041             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    1042             :     const uint8_t *mask, int h, blend_unit_fn blend)
    1043             : {
    1044           0 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    1045             : 
    1046             :     do {
    1047           0 :         const __m128i v_m0_w = _mm_set1_epi16(*mask);
    1048           0 :         const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
    1049             : 
    1050           0 :         const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
    1051             : 
    1052           0 :         xx_storel_64(dst, v_res_w);
    1053             : 
    1054           0 :         dst += dst_stride;
    1055           0 :         src0 += src0_stride;
    1056           0 :         src1 += src1_stride;
    1057           0 :         mask += 1;
    1058           0 :     } while (--h);
    1059           0 : }
    1060             : 
    1061           0 : static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
    1062             :     const uint16_t *src0,
    1063             :     uint32_t src0_stride,
    1064             :     const uint16_t *src1,
    1065             :     uint32_t src1_stride,
    1066             :     const uint8_t *mask, int w, int h)
    1067             : {
    1068             :     (void)w;
    1069           0 :     blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
    1070             :         src1_stride, mask, h, blend_4_b10);
    1071           0 : }
    1072             : 
    1073           0 : static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
    1074             :     const uint16_t *src0,
    1075             :     uint32_t src0_stride,
    1076             :     const uint16_t *src1,
    1077             :     uint32_t src1_stride,
    1078             :     const uint8_t *mask, int w, int h)
    1079             : {
    1080             :     (void)w;
    1081           0 :     blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
    1082             :         src1_stride, mask, h, blend_4_b12);
    1083           0 : }
    1084             : 
    1085           0 : static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
    1086             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    1087             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    1088             :     const uint8_t *mask, int w, int h, blend_unit_fn blend)
    1089             : {
    1090           0 :     const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    1091             : 
    1092             :     do {
    1093             :         int c;
    1094           0 :         const __m128i v_m0_w = _mm_set1_epi16(*mask);
    1095           0 :         const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
    1096           0 :         for (c = 0; c < w; c += 8) {
    1097           0 :             const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
    1098             : 
    1099           0 :             xx_storeu_128(dst + c, v_res_w);
    1100             :         }
    1101           0 :         dst += dst_stride;
    1102           0 :         src0 += src0_stride;
    1103           0 :         src1 += src1_stride;
    1104           0 :         mask += 1;
    1105           0 :     } while (--h);
    1106           0 : }
    1107             : 
    1108           0 : static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
    1109             :     const uint16_t *src0,
    1110             :     uint32_t src0_stride,
    1111             :     const uint16_t *src1,
    1112             :     uint32_t src1_stride,
    1113             :     const uint8_t *mask, int w, int h)
    1114             : {
    1115           0 :     blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
    1116             :         src1_stride, mask, w, h, blend_8_b10);
    1117           0 : }
    1118             : 
    1119           0 : static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
    1120             :     const uint16_t *src0,
    1121             :     uint32_t src0_stride,
    1122             :     const uint16_t *src1,
    1123             :     uint32_t src1_stride,
    1124             :     const uint8_t *mask, int w, int h)
    1125             : {
    1126           0 :     blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
    1127             :         src1_stride, mask, w, h, blend_8_b12);
    1128           0 : }
    1129             : 
    1130             : //////////////////////////////////////////////////////////////////////////////
    1131             : // Dispatch
    1132             : //////////////////////////////////////////////////////////////////////////////
    1133             : 
    1134           0 : void aom_highbd_blend_a64_vmask_sse4_1(
    1135             :     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
    1136             :     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
    1137             :     const uint8_t *mask, int w, int h, int bd)
    1138             : {
    1139             : 
     1140             :     typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
    1141             :         const uint16_t *src0, uint32_t src0_stride,
    1142             :         const uint16_t *src1, uint32_t src1_stride,
    1143             :         const uint8_t *mask, int w, int h);
    1144             : 
    1145             :     // Dimensions are: bd_index X width_index
    1146             :     static const blend_fn blend[2][2] = {
     1147             :         {
    1148             :             // bd == 8 or 10
    1149             :             blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
    1150             :             blend_a64_vmask_b10_w4_sse4_1,   // w == 4
    1151             :         },
    1152             :         {
    1153             :             // bd == 12
    1154             :             blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
    1155             :             blend_a64_vmask_b12_w4_sse4_1,   // w == 4
    1156             :         }
    1157             :     };
    1158             : 
    1159           0 :     assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
    1160           0 :     assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
    1161             : 
    1162           0 :     assert(h >= 1);
    1163           0 :     assert(w >= 1);
    1164           0 :     assert(IS_POWER_OF_TWO(h));
    1165           0 :     assert(IS_POWER_OF_TWO(w));
    1166             : 
    1167           0 :     assert(bd == 8 || bd == 10 || bd == 12);
    1168             : 
    1169           0 :     if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    1170           0 :         aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
    1171             :             src1_stride, mask, w, h, bd);
    1172             :     }
    1173             :     else {
     1174           0 :         uint16_t *const dst = (uint16_t *)(dst_8); // CONVERT_TO_SHORTPTR(dst_8);
     1175           0 :         const uint16_t *const src0 = (uint16_t *)(src0_8); // CONVERT_TO_SHORTPTR(src0_8);
     1176           0 :         const uint16_t *const src1 = (uint16_t *)(src1_8); // CONVERT_TO_SHORTPTR(src1_8);
    1177             : 
    1178           0 :         blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
    1179             :             src1_stride, mask, w, h);
    1180             :     }
    1181           0 : }
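The UNLIKELY((h | w) & 3) guard used by the high-bit-depth dispatchers is a compact spelling of w <= 2 || h <= 2 once both dimensions are known to be powers of two. A quick self-check of that equivalence, assuming only the power-of-two precondition asserted above:

#include <assert.h>

/* For power-of-two dimensions, (h | w) & 3 is non-zero exactly when either
 * dimension is 1 or 2, i.e. too narrow for the 4- and 8-wide SSE4.1 kernels,
 * so the C fallback is taken instead. */
static int needs_c_fallback(int w, int h) { return ((h | w) & 3) != 0; }

int main(void) {
    for (int w = 1; w <= 64; w <<= 1)
        for (int h = 1; h <= 64; h <<= 1)
            assert(needs_c_fallback(w, h) == (w <= 2 || h <= 2));
    return 0;
}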
    1182             : 
     1183             : /* Horizontal mask related blend functions */
    1184             : 
    1185             : // To start out, just dispatch to the function using the 2D mask and
     1186             : // pass mask stride as 0. This can be improved upon if necessary.
    1187             : 
    1188    14316700 : void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
    1189             :     const uint8_t *src0, uint32_t src0_stride,
    1190             :     const uint8_t *src1, uint32_t src1_stride,
    1191             :     const uint8_t *mask, int w, int h)
    1192             : {
    1193    14316700 :     aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
    1194             :         src1_stride, mask, 0, w, h, 0, 0);
    1195    14317100 : }
    1196             : 
    1197           0 : void aom_highbd_blend_a64_hmask_sse4_1(
    1198             :     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
    1199             :     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
    1200             :     const uint8_t *mask, int w, int h, int bd)
    1201             : {
    1202           0 :     aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
    1203             :         src1_8, src1_stride, mask, 0, w, h, 0, 0,
    1204             :         bd);
    1205           0 : }
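Passing mask_stride = 0 makes the 2-D mask kernels read the same mask row for every output row, which is exactly the horizontal-mask behaviour the two wrappers above need. A scalar sketch of that equivalence, assuming AOM_BLEND_A64_MAX_ALPHA == 64 (blend_a64_hmask_ref is a reference loop, not part of this file):

#include <stdint.h>

/* Scalar view of what mask_stride == 0 buys the hmask wrappers above:
 * mask[r * 0 + c] == mask[c], so one row of weights is applied to every
 * row of the block. */
void blend_a64_hmask_ref(uint8_t *dst, uint32_t dst_stride,
                         const uint8_t *src0, uint32_t src0_stride,
                         const uint8_t *src1, uint32_t src1_stride,
                         const uint8_t *mask, int w, int h) {
    const uint32_t mask_stride = 0;  /* same trick as the wrappers above */
    for (int r = 0; r < h; ++r) {
        for (int c = 0; c < w; ++c) {
            const int m0 = mask[r * mask_stride + c];  /* == mask[c] */
            const int m1 = 64 - m0;
            dst[r * dst_stride + c] = (uint8_t)(
                (m0 * src0[r * src0_stride + c] +
                 m1 * src1[r * src1_stride + c] + 32) >> 6);
        }
    }
}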
    1206             : 
    1207           0 : void eb_aom_highbd_blend_a64_mask_sse4_1(uint16_t *dst, uint32_t dst_stride,
    1208             :                                       const uint16_t *src0,
    1209             :                                       uint32_t src0_stride,
    1210             :                                       const uint16_t *src1,
    1211             :                                       uint32_t src1_stride, const uint8_t *mask,
    1212             :                                       uint32_t mask_stride, int w, int h,
    1213             :                                       int subw, int subh, int bd) {
    1214             :   typedef void (*blend_fn)(
     1215             :       uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    1216             :       uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    1217             :       const uint8_t *mask, uint32_t mask_stride, int w, int h);
    1218             : 
    1219             :   // Dimensions are: bd_index X width_index X subw X subh
    1220             :   static const blend_fn blend[2][2][2][2] = {
    1221             :     {   // bd == 8 or 10
    1222             :       { // w % 8 == 0
    1223             :         { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
    1224             :         { blend_a64_mask_b10_sx_w8n_sse4_1,
    1225             :           blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
    1226             :       { // w == 4
    1227             :         { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
    1228             :         { blend_a64_mask_b10_sx_w4_sse4_1,
    1229             :           blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
    1230             :     {   // bd == 12
    1231             :       { // w % 8 == 0
    1232             :         { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
    1233             :         { blend_a64_mask_b12_sx_w8n_sse4_1,
    1234             :           blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
    1235             :       { // w == 4
    1236             :         { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
    1237             :         { blend_a64_mask_b12_sx_w4_sse4_1,
    1238             :           blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
    1239             :   };
    1240             : 
    1241           0 :   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
    1242           0 :   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
    1243             : 
    1244           0 :   assert(h >= 1);
    1245           0 :   assert(w >= 1);
    1246           0 :   assert(IS_POWER_OF_TWO(h));
    1247           0 :   assert(IS_POWER_OF_TWO(w));
    1248             : 
    1249           0 :   assert(bd == 8 || bd == 10 || bd == 12);
    1250           0 :   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     1251           0 :     aom_highbd_blend_a64_mask_c((uint8_t *)dst, dst_stride, (uint8_t *)src0, src0_stride, (uint8_t *)src1,
    1252             :                                 src1_stride, mask, mask_stride, w, h, subw,
    1253             :                                 subh, bd);
    1254             :   } else {
    1255             :     //uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
    1256             :     //const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
    1257             :     //const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
    1258             : 
    1259           0 :     blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
    1260             :         dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    1261             :         mask_stride, w, h);
    1262             :   }
    1263           0 : }
    1264           0 : void eb_aom_highbd_blend_a64_hmask_sse4_1(
    1265             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    1266             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    1267             :     const uint8_t *mask, int w, int h, int bd) {
    1268           0 :   eb_aom_highbd_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride,
    1269             :                                    src1, src1_stride, mask, 0, w, h, 0, 0,
    1270             :                                    bd);
    1271           0 : }
    1272             : 
    1273           0 : void eb_aom_highbd_blend_a64_vmask_sse4_1(
    1274             :     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    1275             :     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    1276             :     const uint8_t *mask, int w, int h, int bd) {
     1277             :   typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
    1278             :                            const uint16_t *src0, uint32_t src0_stride,
    1279             :                            const uint16_t *src1, uint32_t src1_stride,
    1280             :                            const uint8_t *mask, int w, int h);
    1281             : 
    1282             :   // Dimensions are: bd_index X width_index
    1283             :   static const blend_fn blend[2][2] = {
    1284             :     {
    1285             :         // bd == 8 or 10
    1286             :         blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
    1287             :         blend_a64_vmask_b10_w4_sse4_1,   // w == 4
    1288             :     },
    1289             :     {
    1290             :         // bd == 12
    1291             :         blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
    1292             :         blend_a64_vmask_b12_w4_sse4_1,   // w == 4
    1293             :     }
    1294             :   };
    1295             : 
    1296           0 :   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
    1297           0 :   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
    1298             : 
    1299           0 :   assert(h >= 1);
    1300           0 :   assert(w >= 1);
    1301           0 :   assert(IS_POWER_OF_TWO(h));
    1302           0 :   assert(IS_POWER_OF_TWO(w));
    1303             : 
    1304           0 :   assert(bd == 8 || bd == 10 || bd == 12);
    1305             : 
    1306           0 :   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    1307           0 :     eb_aom_highbd_blend_a64_vmask_c(dst, dst_stride, src0, src0_stride, src1,
    1308             :                                  src1_stride, mask, w, h, bd);
    1309             :   } else {
    1310             :     //uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
    1311             :     //const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
    1312             :     //const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
    1313             : 
    1314           0 :     blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
    1315             :                                   src1_stride, mask, w, h);
    1316             :   }
    1317           0 : }
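A minimal caller sketch for the vertical-mask entry point above, assuming (as elsewhere in this file) that aom_dsp_rtcd.h provides the prototype and that the target supports SSE4.1; the 8x8 block size, the 10-bit sample values and the per-row weights are illustrative only.

#include <stdint.h>
#include "aom_dsp_rtcd.h"  /* assumed to declare eb_aom_highbd_blend_a64_vmask_sse4_1 */

/* Illustrative 8x8, 10-bit blend: each row r of dst becomes
 * (mask[r]*src0 + (64-mask[r])*src1 + 32) >> 6, with all samples below 1024.
 * Strides are in uint16_t samples, as expected by the high-bit-depth kernels. */
void example_highbd_vmask_blend(void) {
    enum { W = 8, H = 8, STRIDE = 8 };
    uint16_t dst[H * STRIDE], src0[H * STRIDE], src1[H * STRIDE];
    uint8_t  mask[H] = { 0, 8, 16, 24, 40, 48, 56, 64 };  /* per-row weights in [0, 64] */

    for (int i = 0; i < H * STRIDE; ++i) {
        src0[i] = 1023;  /* peak 10-bit sample */
        src1[i] = 0;
    }

    eb_aom_highbd_blend_a64_vmask_sse4_1(dst, STRIDE, src0, STRIDE,
                                         src1, STRIDE, mask, W, H, 10);
    /* row 0 comes out all zeros, row 7 all 1023, with a ramp in between */
    (void)dst;
}

With w = 8, h = 8 and bd = 10, the dispatcher above selects blend_a64_vmask_b10_w8n_sse4_1, since bd != 12 and (w >> 2) & 1 == 0.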

Generated by: LCOV version 1.14