Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
4 : */
5 :
6 : /*
7 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
8 : *
9 : * This source code is subject to the terms of the BSD 2 Clause License and
10 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
11 : * was not distributed with this source code in the LICENSE file, you can
12 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
13 : * Media Patent License 1.0 was not distributed with this source code in the
14 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
15 : */
16 :
17 : #ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
18 : #define AOM_AOM_DSP_X86_BLEND_SSE4_H_
19 :
20 : #include <assert.h>
21 :
22 : #include "EbDefinitions.h"
23 : #include "smmintrin.h"
24 : #include "synonyms.h"
25 :
/*
 * Byte index table holding the same 16-entry pattern twice (32 entries).
 * Within each 16-entry half the even indices 0, 2, ..., 14 come first and
 * the odd indices 1, 3, ..., 15 follow.
 * NOTE(review): presumably consumed as a byte-shuffle control (e.g. with
 * _mm_shuffle_epi8) to de-interleave even/odd bytes; no user is visible in
 * this header - confirm against callers.
 */
static const uint8_t g_blend_a64_mask_shuffle[32] = {
    /* entries 0..15 */
    0, 2, 4, 6, 8, 10, 12, 14, /* even byte indices */
    1, 3, 5, 7, 9, 11, 13, 15, /* odd byte indices  */
    /* entries 16..31: same pattern again */
    0, 2, 4, 6, 8, 10, 12, 14,
    1, 3, 5, 7, 9, 11, 13, 15,
};
30 :
31 : //////////////////////////////////////////////////////////////////////////////
32 : // Common kernels
33 : //////////////////////////////////////////////////////////////////////////////
34 :
35 : // convolve_avx2.c TODO: Harmonize
36 268912680 : static INLINE __m128i xx_loadl_32(const void *a) {
37 : int val;
38 268912680 : memcpy(&val, a, sizeof(val));
39 537824360 : return _mm_cvtsi32_si128(val);
40 : }
41 :
42 540060 : static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
43 : const __m128i *v_m0_w, const __m128i *v_m1_w)
44 : {
45 540060 : const __m128i v_s0_b = xx_loadl_32(src0);
46 540063 : const __m128i v_s1_b = xx_loadl_32(src1);
47 540061 : const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
48 540061 : const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
49 :
50 540061 : const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
51 1080120 : const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
52 540061 : const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
53 540061 : const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
54 :
55 540063 : return v_res_w;
56 : }
57 :
58 292976000 : static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
59 : const __m128i *v_m0_w, const __m128i *v_m1_w)
60 : {
61 292976000 : const __m128i v_s0_b = xx_loadl_64(src0);
62 292800000 : const __m128i v_s1_b = xx_loadl_64(src1);
63 292687000 : const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
64 292687000 : const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
65 :
66 292687000 : const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
67 585373000 : const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
68 :
69 292687000 : const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
70 :
71 292687000 : const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
72 :
73 293074000 : return v_res_w;
74 : }
75 :
76 89463544 : static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
77 : const __m128i *v_m0_b, const __m128i *v_m1_b,
78 : const __m128i *rounding)
79 : {
80 89463544 : const __m128i v_s0_b = xx_loadl_32(src0);
81 89450347 : const __m128i v_s1_b = xx_loadl_32(src1);
82 :
83 268285530 : const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
84 : _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
85 :
86 178856688 : const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
87 89428444 : const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
88 89428444 : return v_res;
89 : }
90 :
91 249172300 : static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
92 : const __m128i *v_m0_b, const __m128i *v_m1_b,
93 : const __m128i *rounding)
94 : {
95 249172300 : const __m128i v_s0_b = xx_loadl_64(src0);
96 249142200 : const __m128i v_s1_b = xx_loadl_64(src1);
97 :
98 747263000 : const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
99 : _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
100 :
101 498176000 : const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
102 249087400 : const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
103 249087400 : return v_res;
104 : }
105 :
106 261838900 : static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
107 : const __m128i *v_m0_b, const __m128i *v_m1_b,
108 : const __m128i *rounding)
109 : {
110 261838900 : const __m128i v_s0_b = xx_loadu_128(src0);
111 261710800 : const __m128i v_s1_b = xx_loadu_128(src1);
112 :
113 784939000 : const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
114 : _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
115 784939000 : const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
116 : _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
117 :
118 261646800 : const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
119 523294000 : const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
120 261646800 : const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
121 261646800 : return v_res;
122 : }
123 :
/*
 * Pointer to a high-bit-depth blend kernel: takes two 16-bit sample pointers
 * and two weight vectors (by value) and returns the blended 16-bit lanes.
 * blend_4_b10, blend_8_b10, blend_4_b12 and blend_8_b12 in this header all
 * have this signature.
 */
typedef __m128i (*blend_unit_fn)(const uint16_t *src0,
                                 const uint16_t *src1,
                                 const __m128i   v_m0_w,
                                 const __m128i   v_m1_w);
126 :
127 0 : static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
128 : const __m128i v_m0_w, const __m128i v_m1_w)
129 : {
130 0 : const __m128i v_s0_w = xx_loadl_64(src0);
131 0 : const __m128i v_s1_w = xx_loadl_64(src1);
132 :
133 0 : const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
134 0 : const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
135 :
136 0 : const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
137 :
138 0 : const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
139 :
140 0 : return v_res_w;
141 : }
142 :
143 0 : static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
144 : const __m128i v_m0_w, const __m128i v_m1_w)
145 : {
146 0 : const __m128i v_s0_w = xx_loadu_128(src0);
147 0 : const __m128i v_s1_w = xx_loadu_128(src1);
148 :
149 0 : const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
150 0 : const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
151 :
152 0 : const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
153 :
154 0 : const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
155 :
156 0 : return v_res_w;
157 : }
158 :
159 0 : static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
160 : const __m128i v_m0_w, const __m128i v_m1_w)
161 : {
162 0 : const __m128i v_s0_w = xx_loadl_64(src0);
163 0 : const __m128i v_s1_w = xx_loadl_64(src1);
164 :
165 : // Interleave
166 0 : const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
167 0 : const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
168 :
169 : // Multiply-Add
170 0 : const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
171 :
172 : // Scale
173 : const __m128i v_ssum_d =
174 0 : _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
175 :
176 : // Pack
177 0 : const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
178 :
179 : // Round
180 0 : const __m128i v_res_w = xx_round_epu16(v_pssum_d);
181 :
182 0 : return v_res_w;
183 : }
184 :
185 0 : static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
186 : const __m128i v_m0_w, const __m128i v_m1_w)
187 : {
188 0 : const __m128i v_s0_w = xx_loadu_128(src0);
189 0 : const __m128i v_s1_w = xx_loadu_128(src1);
190 :
191 : // Interleave
192 0 : const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
193 0 : const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
194 0 : const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
195 0 : const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
196 :
197 : // Multiply-Add
198 0 : const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
199 0 : const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
200 :
201 : // Scale
202 : const __m128i v_ssuml_d =
203 0 : _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
204 : const __m128i v_ssumh_d =
205 0 : _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
206 :
207 : // Pack
208 0 : const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
209 :
210 : // Round
211 0 : const __m128i v_res_w = xx_round_epu16(v_pssum_d);
212 :
213 0 : return v_res_w;
214 : }
215 :
216 :
217 : /*Functions from convolve_avx2.c*/
218 987804 : static INLINE void blend_a64_d16_mask_w4_sse41(
219 : uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
220 : const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
221 : int shift)
222 : {
223 987804 : const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
224 987804 : const __m128i s0 = xx_loadl_64(src0);
225 987801 : const __m128i s1 = xx_loadl_64(src1);
226 987798 : const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
227 1975600 : const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
228 987798 : const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
229 1975600 : const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
230 987798 : const __m128i res_d = _mm_srai_epi32(res_c, shift);
231 987798 : const __m128i res_e = _mm_packs_epi32(res_d, res_d);
232 987798 : const __m128i res = _mm_packus_epi16(res_e, res_e);
233 :
234 987798 : xx_storel_32(dst, res);
235 987800 : }
236 :
237 242866000 : static INLINE void blend_a64_d16_mask_w8_sse41(
238 : uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
239 : const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
240 : int shift)
241 : {
242 242866000 : const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
243 242866000 : const __m128i s0 = xx_loadu_128(src0);
244 242761000 : const __m128i s1 = xx_loadu_128(src1);
245 727798000 : __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
246 : _mm_unpacklo_epi16(*m, max_minus_m));
247 727798000 : __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
248 : _mm_unpackhi_epi16(*m, max_minus_m));
249 485199000 : res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
250 727798000 : res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
251 242599000 : const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
252 242599000 : const __m128i res = _mm_packus_epi16(res_e, res_e);
253 :
254 242599000 : _mm_storel_epi64((__m128i *)(dst), res);
255 242599000 : }
256 :
257 0 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
258 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
259 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
260 : const uint8_t *mask, uint32_t mask_stride, int h,
261 : const __m128i *round_offset, int shift)
262 : {
263 0 : const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
264 0 : for (int i = 0; i < h; ++i) {
265 0 : const __m128i m0 = xx_loadl_32(mask);
266 0 : const __m128i m = _mm_cvtepu8_epi16(m0);
267 :
268 0 : blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
269 : shift);
270 0 : mask += mask_stride;
271 0 : dst += dst_stride;
272 0 : src0 += src0_stride;
273 0 : src1 += src1_stride;
274 : }
275 0 : }
276 :
277 17431900 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
278 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
279 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
280 : const uint8_t *mask, uint32_t mask_stride, int h,
281 : const __m128i *round_offset, int shift)
282 : {
283 17431900 : const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
284 258511000 : for (int i = 0; i < h; ++i) {
285 241108000 : const __m128i m0 = xx_loadl_64(mask);
286 241118000 : const __m128i m = _mm_cvtepu8_epi16(m0);
287 241118000 : blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset,
288 : &v_maxval, shift);
289 241079000 : mask += mask_stride;
290 241079000 : dst += dst_stride;
291 241079000 : src0 += src0_stride;
292 241079000 : src1 += src1_stride;
293 : }
294 17403700 : }
295 :
296 112766 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
297 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
298 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
299 : const uint8_t *mask, uint32_t mask_stride, int h,
300 : const __m128i *round_offset, int shift)
301 : {
302 112766 : const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
303 112766 : const __m128i one_b = _mm_set1_epi8(1);
304 112766 : const __m128i two_w = _mm_set1_epi16(2);
305 1100570 : for (int i = 0; i < h; ++i) {
306 987801 : const __m128i m_i0 = xx_loadl_64(mask);
307 987799 : const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
308 987801 : const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
309 987801 : const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
310 987801 : const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
311 987801 : const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
312 :
313 987801 : blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
314 : shift);
315 987800 : mask += mask_stride << 1;
316 987800 : dst += dst_stride;
317 987800 : src0 += src0_stride;
318 987800 : src1 += src1_stride;
319 : }
320 112765 : }
321 :
322 147856 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
323 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
324 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
325 : const uint8_t *mask, uint32_t mask_stride, int h,
326 : const __m128i *round_offset, int shift)
327 : {
328 147856 : const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
329 147856 : const __m128i one_b = _mm_set1_epi8(1);
330 147856 : const __m128i two_w = _mm_set1_epi16(2);
331 1982490 : for (int i = 0; i < h; ++i) {
332 1834640 : const __m128i m_i0 = xx_loadu_128(mask);
333 1834630 : const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
334 1834620 : const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
335 1834620 : const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
336 1834620 : const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
337 1834620 : const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
338 :
339 1834620 : blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset,
340 : &v_maxval, shift);
341 1834640 : mask += mask_stride << 1;
342 1834640 : dst += dst_stride;
343 1834640 : src0 += src0_stride;
344 1834640 : src1 += src1_stride;
345 : }
346 147855 : }
347 :
348 0 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
349 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
350 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
351 : const uint8_t *mask, uint32_t mask_stride, int h,
352 : const __m128i *round_offset, int shift)
353 : {
354 0 : const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
355 0 : const __m128i one_b = _mm_set1_epi8(1);
356 0 : const __m128i zeros = _mm_setzero_si128();
357 0 : for (int i = 0; i < h; ++i) {
358 0 : const __m128i m_i0 = xx_loadl_64(mask);
359 0 : const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
360 0 : const __m128i m = _mm_avg_epu16(m_ac, zeros);
361 :
362 0 : blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset,
363 : &v_maxval, shift);
364 0 : mask += mask_stride;
365 0 : dst += dst_stride;
366 0 : src0 += src0_stride;
367 0 : src1 += src1_stride;
368 : }
369 0 : }
370 :
371 0 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
372 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
373 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
374 : const uint8_t *mask, uint32_t mask_stride, int h,
375 : const __m128i *round_offset, int shift)
376 : {
377 0 : const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
378 0 : const __m128i one_b = _mm_set1_epi8(1);
379 0 : const __m128i zeros = _mm_setzero_si128();
380 0 : for (int i = 0; i < h; ++i) {
381 0 : const __m128i m_i0 = xx_loadu_128(mask);
382 0 : const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
383 0 : const __m128i m = _mm_avg_epu16(m_ac, zeros);
384 :
385 0 : blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset,
386 : &v_maxval, shift);
387 0 : mask += mask_stride;
388 0 : dst += dst_stride;
389 0 : src0 += src0_stride;
390 0 : src1 += src1_stride;
391 : }
392 0 : }
393 :
394 0 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
395 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
396 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
397 : const uint8_t *mask, uint32_t mask_stride, int h,
398 : const __m128i *round_offset, int shift)
399 : {
400 0 : const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
401 0 : const __m128i zeros = _mm_setzero_si128();
402 0 : for (int i = 0; i < h; ++i) {
403 0 : const __m128i m_i0 = xx_loadl_64(mask);
404 0 : const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
405 0 : const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
406 0 : const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
407 :
408 0 : blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset,
409 : &v_maxval, shift);
410 0 : mask += mask_stride << 1;
411 0 : dst += dst_stride;
412 0 : src0 += src0_stride;
413 0 : src1 += src1_stride;
414 : }
415 0 : }
416 :
417 0 : static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
418 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
419 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
420 : const uint8_t *mask, uint32_t mask_stride, int h,
421 : const __m128i *round_offset, int shift)
422 : {
423 0 : const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
424 0 : const __m128i zeros = _mm_setzero_si128();
425 0 : for (int i = 0; i < h; ++i) {
426 0 : const __m128i m_i0 = xx_loadl_64(mask);
427 0 : const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
428 0 : const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
429 0 : const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
430 :
431 0 : blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset,
432 : &v_maxval, shift);
433 0 : mask += mask_stride << 1;
434 0 : dst += dst_stride;
435 0 : src0 += src0_stride;
436 0 : src1 += src1_stride;
437 : }
438 0 : }
439 : #endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_
|