LCOV - code coverage report
Current view: top level - ASM_SSE4_1 - EbBlend_sse4.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 116 223 52.0 %
Date: 2019-11-25 17:38:06 Functions: 11 20 55.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : /*
       7             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       8             :  *
       9             :  * This source code is subject to the terms of the BSD 2 Clause License and
      10             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      11             :  * was not distributed with this source code in the LICENSE file, you can
      12             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      13             :  * Media Patent License 1.0 was not distributed with this source code in the
      14             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      15             :  */
      16             : 
      17             : #ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
      18             : #define AOM_AOM_DSP_X86_BLEND_SSE4_H_
      19             : 
      20             : #include <assert.h>
      21             : 
      22             : #include "EbDefinitions.h"
      23             : #include "smmintrin.h"
      24             : #include "synonyms.h"
      25             : 
      26             : static const uint8_t g_blend_a64_mask_shuffle[32] = {
                         // Two identical 16-entry halves: the even indices 0..14 followed by
                         // the odd indices 1..15. Nothing in this listing reads the table;
                         // presumably it is a byte-shuffle control that separates even and odd
                         // bytes within each 16-byte lane -- TODO confirm at the use site.
      27             :   0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
      28             :   0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
      29             : };
      30             : 
      31             : //////////////////////////////////////////////////////////////////////////////
      32             : // Common kernels
      33             : //////////////////////////////////////////////////////////////////////////////
      34             : 
      35             : // convolve_av2.c TODO: Harmonize
      36   268912680 : static INLINE __m128i xx_loadl_32(const void *a) {
                           // Loads sizeof(int) bytes from `a` into the low 32 bits of a vector.
                           // The bytes go through memcpy into a local int, so `a` needs no
                           // particular alignment and is never dereferenced through a cast.
                           // _mm_cvtsi32_si128 zeroes the remaining 96 bits of the result.
      37             :     int val;
      38   268912680 :     memcpy(&val, a, sizeof(val));
      39   537824360 :     return _mm_cvtsi32_si128(val);
      40             : }
      41             : 
      42      540060 : static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
      43             :     const __m128i *v_m0_w, const __m128i *v_m1_w)
      44             : {
                           // Weighted blend of 4 bytes from src0 with 4 bytes from src1.
                           //   src0, src1     : 8-bit inputs; 4 bytes are read from each.
                           //   v_m0_w, v_m1_w : per-lane 16-bit weights for src0 and src1.
                           // Returns 16-bit lanes holding
                           //   xx_roundn_epu16(s0 * m0 + s1 * m1, AOM_BLEND_A64_ROUND_BITS);
                           // only the low four lanes derive from loaded pixels.
                           // xx_roundn_epu16 is defined outside this listing; presumably a
                           // rounding right shift by that many bits -- TODO confirm.
      45      540060 :     const __m128i v_s0_b = xx_loadl_32(src0);
      46      540063 :     const __m128i v_s1_b = xx_loadl_32(src1);
                           // Zero-extend the bytes to 16-bit lanes so the products have headroom.
      47      540061 :     const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
      48      540061 :     const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
      49             : 
                           // _mm_mullo_epi16 keeps only the low 16 bits of each product.
      50      540061 :     const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
      51     1080120 :     const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
      52      540061 :     const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
      53      540061 :     const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
      54             : 
      55      540063 :     return v_res_w;
      56             : }
      57             : 
      58   292976000 : static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
      59             :     const __m128i *v_m0_w, const __m128i *v_m1_w)
      60             : {
                           // Weighted blend of 8-bit pixels from src0 and src1, eight 16-bit
                           // results per call.
                           //   src0, src1     : 8-bit inputs, read with xx_loadl_64 (defined
                           //                    outside this listing; presumably loads the low
                           //                    8 bytes -- TODO confirm).
                           //   v_m0_w, v_m1_w : per-lane 16-bit weights for src0 and src1.
                           // Returns xx_roundn_epu16(s0 * m0 + s1 * m1, AOM_BLEND_A64_ROUND_BITS)
                           // in 16-bit lanes; the result is NOT packed back to bytes here.
      61   292976000 :     const __m128i v_s0_b = xx_loadl_64(src0);
      62   292800000 :     const __m128i v_s1_b = xx_loadl_64(src1);
                           // Zero-extend the low eight bytes to 16-bit lanes.
      63   292687000 :     const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
      64   292687000 :     const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
      65             : 
                           // _mm_mullo_epi16 keeps only the low 16 bits of each product.
      66   292687000 :     const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
      67   585373000 :     const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
      68             : 
      69   292687000 :     const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
      70             : 
      71   292687000 :     const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
      72             : 
      73   293074000 :     return v_res_w;
      74             : }
      75             : 
      76    89463544 : static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
      77             :     const __m128i *v_m0_b, const __m128i *v_m1_b,
      78             :     const __m128i *rounding)
      79             : {
                           // Blends 4 bytes from src0 with 4 bytes from src1 using byte weights
                           // and returns the result already packed to unsigned 8-bit.
                           //   v_m0_b, v_m1_b : byte weights for src0 / src1.
                           //   rounding       : 16-bit multiplier fed to _mm_mulhrs_epi16, which
                           //                    computes (((a * b) >> 14) + 1) >> 1 per lane.
                           //                    Presumably 1 << (15 - AOM_BLEND_A64_ROUND_BITS)
                           //                    so that it acts as a rounding shift -- confirm
                           //                    at the call site.
                           // The packed bytes appear in both 64-bit halves of the return value
                           // because the same vector is passed twice to _mm_packus_epi16.
      80    89463544 :     const __m128i v_s0_b = xx_loadl_32(src0);
      81    89450347 :     const __m128i v_s1_b = xx_loadl_32(src1);
      82             : 
                           // Interleave (s0, s1) and (m0, m1) bytes so that _mm_maddubs_epi16
                           // yields s0 * m0 + s1 * m1 per 16-bit lane. The intrinsic treats its
                           // first operand as unsigned bytes and its second as signed bytes.
      83   268285530 :     const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
      84             :         _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
      85             : 
      86   178856688 :     const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
      87    89428444 :     const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
      88    89428444 :     return v_res;
      89             : }
      90             : 
      91   249172300 : static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
      92             :     const __m128i *v_m0_b, const __m128i *v_m1_b,
      93             :     const __m128i *rounding)
      94             : {
                           // Blends 8-bit pixels from src0 and src1 using byte weights and
                           // returns the result packed to unsigned 8-bit.
                           //   src0, src1     : read with xx_loadl_64 (defined outside this
                           //                    listing; presumably the low 8 bytes -- confirm).
                           //   v_m0_b, v_m1_b : byte weights for src0 / src1.
                           //   rounding       : 16-bit multiplier for _mm_mulhrs_epi16
                           //                    ((((a * b) >> 14) + 1) >> 1 per lane); presumably
                           //                    chosen to act as a rounding shift -- confirm.
                           // The packed bytes appear in both 64-bit halves of the return value.
      95   249172300 :     const __m128i v_s0_b = xx_loadl_64(src0);
      96   249142200 :     const __m128i v_s1_b = xx_loadl_64(src1);
      97             : 
                           // Byte-interleaved operands make _mm_maddubs_epi16 produce
                           // s0 * m0 + s1 * m1 per 16-bit lane (first operand unsigned, second
                           // signed).
      98   747263000 :     const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
      99             :         _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
     100             : 
     101   498176000 :     const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
     102   249087400 :     const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
     103   249087400 :     return v_res;
     104             : }
     105             : 
     106   261838900 : static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
     107             :     const __m128i *v_m0_b, const __m128i *v_m1_b,
     108             :     const __m128i *rounding)
     109             : {
                           // Blends 8-bit pixels from src0 and src1 using byte weights and
                           // returns sixteen packed unsigned 8-bit results.
                           //   src0, src1     : read with xx_loadu_128 (defined outside this
                           //                    listing; presumably an unaligned 16-byte load
                           //                    -- confirm).
                           //   v_m0_b, v_m1_b : byte weights for src0 / src1.
                           //   rounding       : 16-bit multiplier for _mm_mulhrs_epi16
                           //                    ((((a * b) >> 14) + 1) >> 1 per lane).
     110   261838900 :     const __m128i v_s0_b = xx_loadu_128(src0);
     111   261710800 :     const __m128i v_s1_b = xx_loadu_128(src1);
     112             : 
                           // Low and high eight bytes are processed separately: interleaving
                           // (s0, s1) with (m0, m1) lets _mm_maddubs_epi16 form
                           // s0 * m0 + s1 * m1 per 16-bit lane.
     113   784939000 :     const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
     114             :         _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
     115   784939000 :     const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
     116             :         _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
     117             : 
     118   261646800 :     const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
     119   523294000 :     const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
                           // Saturating pack restores the original byte order (low half first).
     120   261646800 :     const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
     121   261646800 :     return v_res;
     122             : }
     123             : 
     124             : typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
     125             :     const __m128i v_m0_w, const __m128i v_m1_w);
                       // Pointer type for a blend kernel that takes two 16-bit sample pointers and
                       // two weight vectors by value and returns a vector. The blend_*_b10 and
                       // blend_*_b12 functions in this file have this signature.
     126             : 
     127           0 : static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
     128             :     const __m128i v_m0_w, const __m128i v_m1_w)
     129             : {
                           // Weighted blend of 16-bit samples done entirely in 16-bit lanes.
                           //   src0, src1     : read with xx_loadl_64 (defined outside this
                           //                    listing; presumably the low 8 bytes, i.e. four
                           //                    samples -- confirm).
                           //   v_m0_w, v_m1_w : 16-bit weights for src0 / src1.
                           // Returns xx_roundn_epu16(s0 * m0 + s1 * m1, AOM_BLEND_A64_ROUND_BITS).
                           // NOTE(review): _mm_mullo_epi16 keeps only the low 16 bits of each
                           // product, so the result is exact only while sample * weight fits in
                           // 16 bits; the "_b10" suffix suggests inputs of at most 10 bits --
                           // confirm against callers.
     130           0 :     const __m128i v_s0_w = xx_loadl_64(src0);
     131           0 :     const __m128i v_s1_w = xx_loadl_64(src1);
     132             : 
     133           0 :     const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
     134           0 :     const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
     135             : 
     136           0 :     const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
     137             : 
     138           0 :     const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
     139             : 
     140           0 :     return v_res_w;
     141             : }
     142             : 
     143           0 : static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
     144             :     const __m128i v_m0_w, const __m128i v_m1_w)
     145             : {
                           // Weighted blend of 16-bit samples done entirely in 16-bit lanes.
                           //   src0, src1     : read with xx_loadu_128 (defined outside this
                           //                    listing; presumably an unaligned 16-byte load,
                           //                    i.e. eight samples -- confirm).
                           //   v_m0_w, v_m1_w : 16-bit weights for src0 / src1.
                           // Returns xx_roundn_epu16(s0 * m0 + s1 * m1, AOM_BLEND_A64_ROUND_BITS).
                           // NOTE(review): _mm_mullo_epi16 keeps only the low 16 bits of each
                           // product, so the result is exact only while sample * weight fits in
                           // 16 bits; the "_b10" suffix suggests inputs of at most 10 bits --
                           // confirm against callers.
     146           0 :     const __m128i v_s0_w = xx_loadu_128(src0);
     147           0 :     const __m128i v_s1_w = xx_loadu_128(src1);
     148             : 
     149           0 :     const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
     150           0 :     const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
     151             : 
     152           0 :     const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
     153             : 
     154           0 :     const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
     155             : 
     156           0 :     return v_res_w;
     157             : }
     158             : 
     159           0 : static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
     160             :     const __m128i v_m0_w, const __m128i v_m1_w)
     161             : {
                           // Weighted blend of 16-bit samples using 32-bit intermediate sums,
                           // unlike the 16-bit-only arithmetic of blend_4_b10().
                           //   src0, src1     : read with xx_loadl_64 (defined outside this
                           //                    listing; presumably four samples -- confirm).
                           //   v_m0_w, v_m1_w : 16-bit weights for src0 / src1.
                           // Returns 16-bit lanes; both 64-bit halves carry the same four
                           // results because one vector feeds both inputs of _mm_packs_epi32.
     162           0 :     const __m128i v_s0_w = xx_loadl_64(src0);
     163           0 :     const __m128i v_s1_w = xx_loadl_64(src1);
     164             : 
     165             :     // Interleave
     166           0 :     const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
     167           0 :     const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
     168             : 
     169             :     // Multiply-Add
                           // _mm_madd_epi16 multiplies signed 16-bit pairs and adds neighbours,
                           // giving s0 * m0 + s1 * m1 in each 32-bit lane.
     170           0 :     const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
     171             : 
     172             :     // Scale
                           // Shift by one bit less than the full round amount; the last bit is
                           // left for xx_round_epu16 below.
     173             :     const __m128i v_ssum_d =
     174           0 :         _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
     175             : 
     176             :     // Pack
     177           0 :     const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
     178             : 
     179             :     // Round
                           // xx_round_epu16 is defined outside this listing; presumably it
                           // performs the final rounding halving -- TODO confirm.
     180           0 :     const __m128i v_res_w = xx_round_epu16(v_pssum_d);
     181             : 
     182           0 :     return v_res_w;
     183             : }
     184             : 
     185           0 : static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
     186             :     const __m128i v_m0_w, const __m128i v_m1_w)
     187             : {
                           // Weighted blend of 16-bit samples using 32-bit intermediate sums,
                           // unlike the 16-bit-only arithmetic of blend_8_b10().
                           //   src0, src1     : read with xx_loadu_128 (defined outside this
                           //                    listing; presumably eight samples -- confirm).
                           //   v_m0_w, v_m1_w : 16-bit weights for src0 / src1.
                           // Returns eight 16-bit results in source order.
     188           0 :     const __m128i v_s0_w = xx_loadu_128(src0);
     189           0 :     const __m128i v_s1_w = xx_loadu_128(src1);
     190             : 
     191             :     // Interleave
     192           0 :     const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
     193           0 :     const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
     194           0 :     const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
     195           0 :     const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
     196             : 
     197             :     // Multiply-Add
                           // Each 32-bit lane becomes s0 * m0 + s1 * m1 for one sample position.
     198           0 :     const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
     199           0 :     const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
     200             : 
     201             :     // Scale
                           // Shift by one bit less than the full round amount; the last bit is
                           // left for xx_round_epu16 below.
     202             :     const __m128i v_ssuml_d =
     203           0 :         _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
     204             :     const __m128i v_ssumh_d =
     205           0 :         _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
     206             : 
     207             :     // Pack
     208           0 :     const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
     209             : 
     210             :     // Round
                           // xx_round_epu16 is defined outside this listing; presumably it
                           // performs the final rounding halving -- TODO confirm.
     211           0 :     const __m128i v_res_w = xx_round_epu16(v_pssum_d);
     212             : 
     213           0 :     return v_res_w;
     214             : }
     215             : 
     216             : 
     217             : /*Functions from convolve_avx2.c*/
     218      987804 : static INLINE void blend_a64_d16_mask_w4_sse41(
     219             :     uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
     220             :     const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
     221             :     int shift)
     222             : {
                           // Blends one 4-pixel row of intermediate samples into 8-bit output.
                           //   dst            : destination; written through xx_storel_32
                           //                    (defined outside this listing; presumably a
                           //                    4-byte store -- confirm).
                           //   src0, src1     : CONV_BUF_TYPE inputs, handled here as 16-bit
                           //                    lanes.
                           //   m              : 16-bit mask weights applied to src0; only the
                           //                    low four lanes are used.
                           //   v_round_offset : 32-bit value subtracted before the shift.
                           //   v_maxval       : 16-bit maximum weight; src1 gets (maxval - m).
                           //   shift          : arithmetic right-shift applied to the 32-bit sum.
                           // Per pixel: sat_u8((s0 * m + s1 * (maxval - m) - round_offset) >> shift).
     223      987804 :     const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
     224      987804 :     const __m128i s0 = xx_loadl_64(src0);
     225      987801 :     const __m128i s1 = xx_loadl_64(src1);
                           // Interleave samples and weights so one _mm_madd_epi16 forms
                           // s0 * m + s1 * (maxval - m) in each 32-bit lane.
     226      987798 :     const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
     227     1975600 :     const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
     228      987798 :     const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
     229     1975600 :     const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
     230      987798 :     const __m128i res_d = _mm_srai_epi32(res_c, shift);
                           // Narrow 32 -> 16 (signed saturation) -> 8 (unsigned saturation).
     231      987798 :     const __m128i res_e = _mm_packs_epi32(res_d, res_d);
     232      987798 :     const __m128i res = _mm_packus_epi16(res_e, res_e);
     233             : 
     234      987798 :     xx_storel_32(dst, res);
     235      987800 : }
     236             : 
     237   242866000 : static INLINE void blend_a64_d16_mask_w8_sse41(
     238             :     uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
     239             :     const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
     240             :     int shift)
     241             : {
                           // Blends one 8-pixel row of intermediate samples into 8-bit output.
                           //   dst            : destination; 8 bytes are written with
                           //                    _mm_storel_epi64.
                           //   src0, src1     : CONV_BUF_TYPE inputs, handled here as eight
                           //                    16-bit lanes each.
                           //   m              : eight 16-bit mask weights applied to src0.
                           //   v_round_offset : 32-bit value subtracted before the shift.
                           //   v_maxval       : 16-bit maximum weight; src1 gets (maxval - m).
                           //   shift          : arithmetic right-shift applied to the 32-bit sum.
                           // Per pixel: sat_u8((s0 * m + s1 * (maxval - m) - round_offset) >> shift).
     242   242866000 :     const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
     243   242866000 :     const __m128i s0 = xx_loadu_128(src0);
     244   242761000 :     const __m128i s1 = xx_loadu_128(src1);
                           // Low and high four pixels are widened to 32-bit sums separately.
     245   727798000 :     __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
     246             :         _mm_unpacklo_epi16(*m, max_minus_m));
     247   727798000 :     __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
     248             :         _mm_unpackhi_epi16(*m, max_minus_m));
     249   485199000 :     res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
     250   727798000 :     res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
                           // Narrow 32 -> 16 (signed saturation) -> 8 (unsigned saturation).
     251   242599000 :     const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
     252   242599000 :     const __m128i res = _mm_packus_epi16(res_e, res_e);
     253             : 
     254   242599000 :     _mm_storel_epi64((__m128i *)(dst), res);
     255   242599000 : }
     256             : 
     257           0 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
     258             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     259             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     260             :     const uint8_t *mask, uint32_t mask_stride, int h,
     261             :     const __m128i *round_offset, int shift)
     262             : {
                           // Blends h rows of 4 pixels; the mask is used as-is, one mask byte
                           // per output pixel and one mask row per output row.
                           //   dst / src0 / src1 / mask : row pointers, each advanced by its own
                           //                              stride (in elements) after every row.
                           //   h                        : number of rows to process.
                           //   round_offset, shift      : forwarded unchanged to
                           //                              blend_a64_d16_mask_w4_sse41().
     263           0 :     const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     264           0 :     for (int i = 0; i < h; ++i) {
                               // Four mask bytes, zero-extended to 16-bit weights.
     265           0 :         const __m128i m0 = xx_loadl_32(mask);
     266           0 :         const __m128i m = _mm_cvtepu8_epi16(m0);
     267             : 
     268           0 :         blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
     269             :             shift);
     270           0 :         mask += mask_stride;
     271           0 :         dst += dst_stride;
     272           0 :         src0 += src0_stride;
     273           0 :         src1 += src1_stride;
     274             :     }
     275           0 : }
     276             : 
     277    17431900 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
     278             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     279             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     280             :     const uint8_t *mask, uint32_t mask_stride, int h,
     281             :     const __m128i *round_offset, int shift)
     282             : {
                           // Blends h rows of 8 pixels; the mask is used as-is, one mask byte
                           // per output pixel and one mask row per output row.
                           //   dst / src0 / src1 / mask : row pointers, each advanced by its own
                           //                              stride (in elements) after every row.
                           //   h                        : number of rows to process.
                           //   round_offset, shift      : forwarded unchanged to
                           //                              blend_a64_d16_mask_w8_sse41().
     283    17431900 :     const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     284   258511000 :     for (int i = 0; i < h; ++i) {
                               // Mask bytes (xx_loadl_64; presumably 8 bytes -- confirm),
                               // zero-extended to 16-bit weights.
     285   241108000 :         const __m128i m0 = xx_loadl_64(mask);
     286   241118000 :         const __m128i m = _mm_cvtepu8_epi16(m0);
     287   241118000 :         blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset,
     288             :             &v_maxval, shift);
     289   241079000 :         mask += mask_stride;
     290   241079000 :         dst += dst_stride;
     291   241079000 :         src0 += src0_stride;
     292   241079000 :         src1 += src1_stride;
     293             :     }
     294    17403700 : }
     295             : 
     296      112766 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
     297             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     298             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     299             :     const uint8_t *mask, uint32_t mask_stride, int h,
     300             :     const __m128i *round_offset, int shift)
     301             : {
                           // Blends h rows of 4 pixels with a mask that is down-sampled 2x in
                           // both directions: each weight is the rounded mean of a 2x2 group of
                           // mask bytes, and two mask rows are consumed per output row.
                           //   dst / src0 / src1   : advanced by one stride per output row.
                           //   mask                : advanced by two strides per output row.
                           //   round_offset, shift : forwarded unchanged to
                           //                         blend_a64_d16_mask_w4_sse41().
     302      112766 :     const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     303      112766 :     const __m128i one_b = _mm_set1_epi8(1);
     304      112766 :     const __m128i two_w = _mm_set1_epi16(2);
     305     1100570 :     for (int i = 0; i < h; ++i) {
     306      987801 :         const __m128i m_i0 = xx_loadl_64(mask);
     307      987799 :         const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
                               // Vertical sum of the two mask rows (unsigned saturating bytes).
     308      987801 :         const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
                               // Multiplying by 1 and adding neighbours sums horizontal byte
                               // pairs, giving the 2x2 total in each 16-bit lane.
     309      987801 :         const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
                               // (sum + 2) >> 2: rounded average of the four mask values.
     310      987801 :         const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
     311      987801 :         const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
     312             : 
     313      987801 :         blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
     314             :             shift);
     315      987800 :         mask += mask_stride << 1;
     316      987800 :         dst += dst_stride;
     317      987800 :         src0 += src0_stride;
     318      987800 :         src1 += src1_stride;
     319             :     }
     320      112765 : }
     321             : 
     322      147856 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
     323             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     324             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     325             :     const uint8_t *mask, uint32_t mask_stride, int h,
     326             :     const __m128i *round_offset, int shift)
     327             : {
                           // Blends h rows of 8 pixels with a mask that is down-sampled 2x in
                           // both directions: each weight is the rounded mean of a 2x2 group of
                           // mask bytes, and two mask rows are consumed per output row.
                           //   dst / src0 / src1   : advanced by one stride per output row.
                           //   mask                : advanced by two strides per output row.
                           //   round_offset, shift : forwarded unchanged to
                           //                         blend_a64_d16_mask_w8_sse41().
     328      147856 :     const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     329      147856 :     const __m128i one_b = _mm_set1_epi8(1);
     330      147856 :     const __m128i two_w = _mm_set1_epi16(2);
     331     1982490 :     for (int i = 0; i < h; ++i) {
     332     1834640 :         const __m128i m_i0 = xx_loadu_128(mask);
     333     1834630 :         const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
                               // Vertical sum of the two mask rows (unsigned saturating bytes).
     334     1834620 :         const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
                               // Multiplying by 1 and adding neighbours sums horizontal byte
                               // pairs, giving the 2x2 total in each 16-bit lane.
     335     1834620 :         const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
                               // (sum + 2) >> 2: rounded average of the four mask values.
     336     1834620 :         const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
     337     1834620 :         const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
     338             : 
     339     1834620 :         blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset,
     340             :             &v_maxval, shift);
     341     1834640 :         mask += mask_stride << 1;
     342     1834640 :         dst += dst_stride;
     343     1834640 :         src0 += src0_stride;
     344     1834640 :         src1 += src1_stride;
     345             :     }
     346      147855 : }
     347             : 
     348           0 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
     349             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     350             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     351             :     const uint8_t *mask, uint32_t mask_stride, int h,
     352             :     const __m128i *round_offset, int shift)
     353             : {
                           // Blends h rows of 4 pixels with a mask that is down-sampled 2x
                           // horizontally only: each weight is the rounded mean of two adjacent
                           // mask bytes, and one mask row is consumed per output row.
                           //   dst / src0 / src1 / mask : each advanced by its own stride per row.
                           //   round_offset, shift      : forwarded unchanged to
                           //                              blend_a64_d16_mask_w4_sse41().
     354           0 :     const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     355           0 :     const __m128i one_b = _mm_set1_epi8(1);
     356           0 :     const __m128i zeros = _mm_setzero_si128();
     357           0 :     for (int i = 0; i < h; ++i) {
     358           0 :         const __m128i m_i0 = xx_loadl_64(mask);
                               // Multiplying by 1 and adding neighbours sums horizontal byte
                               // pairs into 16-bit lanes.
     359           0 :         const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
                               // _mm_avg_epu16(x, 0) evaluates (x + 1) >> 1, i.e. the rounded
                               // half of the pair sum.
     360           0 :         const __m128i m = _mm_avg_epu16(m_ac, zeros);
     361             : 
     362           0 :         blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset,
     363             :             &v_maxval, shift);
     364           0 :         mask += mask_stride;
     365           0 :         dst += dst_stride;
     366           0 :         src0 += src0_stride;
     367           0 :         src1 += src1_stride;
     368             :     }
     369           0 : }
     370             : 
     371           0 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
     372             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     373             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     374             :     const uint8_t *mask, uint32_t mask_stride, int h,
     375             :     const __m128i *round_offset, int shift)
     376             : {
                           // Blends h rows of 8 pixels with a mask that is down-sampled 2x
                           // horizontally only: each weight is the rounded mean of two adjacent
                           // mask bytes, and one mask row is consumed per output row.
                           //   dst / src0 / src1 / mask : each advanced by its own stride per row.
                           //   round_offset, shift      : forwarded unchanged to
                           //                              blend_a64_d16_mask_w8_sse41().
     377           0 :     const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     378           0 :     const __m128i one_b = _mm_set1_epi8(1);
     379           0 :     const __m128i zeros = _mm_setzero_si128();
     380           0 :     for (int i = 0; i < h; ++i) {
     381           0 :         const __m128i m_i0 = xx_loadu_128(mask);
                               // Multiplying by 1 and adding neighbours sums horizontal byte
                               // pairs into 16-bit lanes.
     382           0 :         const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
                               // _mm_avg_epu16(x, 0) evaluates (x + 1) >> 1, i.e. the rounded
                               // half of the pair sum.
     383           0 :         const __m128i m = _mm_avg_epu16(m_ac, zeros);
     384             : 
     385           0 :         blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset,
     386             :             &v_maxval, shift);
     387           0 :         mask += mask_stride;
     388           0 :         dst += dst_stride;
     389           0 :         src0 += src0_stride;
     390           0 :         src1 += src1_stride;
     391             :     }
     392           0 : }
     393             : 
     394           0 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
     395             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     396             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     397             :     const uint8_t *mask, uint32_t mask_stride, int h,
     398             :     const __m128i *round_offset, int shift)
     399             : {
                           // Blends h rows of 4 pixels with a mask that is down-sampled 2x
                           // vertically only: each weight is the rounded mean of the mask bytes
                           // at the same column in two consecutive mask rows.
                           //   dst / src0 / src1   : advanced by one stride per output row.
                           //   mask                : advanced by two strides per output row.
                           //   round_offset, shift : forwarded unchanged to
                           //                         blend_a64_d16_mask_w4_sse41().
                           // NOTE(review): the rows are read with xx_loadl_64 although the w4
                           // kernel only uses the low four weight lanes; if xx_loadl_64 reads
                           // 8 bytes, confirm the mask buffer tolerates the extra 4 bytes.
     400           0 :     const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     401           0 :     const __m128i zeros = _mm_setzero_si128();
     402           0 :     for (int i = 0; i < h; ++i) {
     403           0 :         const __m128i m_i0 = xx_loadl_64(mask);
     404           0 :         const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
                               // Vertical sum of the two mask rows (unsigned saturating bytes).
     405           0 :         const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
                               // _mm_avg_epu8(x, 0) evaluates (x + 1) >> 1; the bytes are then
                               // zero-extended to 16-bit weights.
     406           0 :         const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
     407             : 
     408           0 :         blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset,
     409             :             &v_maxval, shift);
     410           0 :         mask += mask_stride << 1;
     411           0 :         dst += dst_stride;
     412           0 :         src0 += src0_stride;
     413           0 :         src1 += src1_stride;
     414             :     }
     415           0 : }
     416             : 
     417           0 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
     418             :     uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     419             :     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     420             :     const uint8_t *mask, uint32_t mask_stride, int h,
     421             :     const __m128i *round_offset, int shift)
     422             : {
                           // Blends h rows of 8 pixels with a mask that is down-sampled 2x
                           // vertically only: each weight is the rounded mean of the mask bytes
                           // at the same column in two consecutive mask rows.
                           //   dst / src0 / src1   : advanced by one stride per output row.
                           //   mask                : advanced by two strides per output row.
                           //   round_offset, shift : forwarded unchanged to
                           //                         blend_a64_d16_mask_w8_sse41().
     423           0 :     const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     424           0 :     const __m128i zeros = _mm_setzero_si128();
     425           0 :     for (int i = 0; i < h; ++i) {
     426           0 :         const __m128i m_i0 = xx_loadl_64(mask);
     427           0 :         const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
                               // Vertical sum of the two mask rows (unsigned saturating bytes).
     428           0 :         const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
                               // _mm_avg_epu8(x, 0) evaluates (x + 1) >> 1; the bytes are then
                               // zero-extended to 16-bit weights.
     429           0 :         const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
     430             : 
     431           0 :         blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset,
     432             :             &v_maxval, shift);
     433           0 :         mask += mask_stride << 1;
     434           0 :         dst += dst_stride;
     435           0 :         src0 += src0_stride;
     436           0 :         src1 += src1_stride;
     437             :     }
     438           0 : }
     439             : #endif  // AOM_AOM_DSP_X86_BLEND_SSE4_H_

Generated by: LCOV version 1.14