Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : /*
7 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
8 : *
9 : * This source code is subject to the terms of the BSD 2 Clause License and
10 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
11 : * was not distributed with this source code in the LICENSE file, you can
12 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
13 : * Media Patent License 1.0 was not distributed with this source code in the
14 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
15 : */
16 :
17 : #include <assert.h>
18 : #include "immintrin.h"
19 :
20 : #include "EbDefinitions.h"
21 :
22 : #include "synonyms.h"
23 : #include "EbMemory_AVX2.h"
24 : #include "synonyms_avx2.h"
25 : #include "convolve_avx2.h"
26 : #include "EbBlend_sse4.h"
27 :
28 : #include "aom_dsp_rtcd.h"
29 :
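// Blends one row of 16 u8 pixels: src0/src1 bytes are interleaved with the
// matching mask pair so a single maddubs produces m0*s0 + m1*s1 per pixel,
// which is then rounded by `bits` and packed back down to u8.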
30 109072 : static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0,
31 : const uint8_t *src1,
32 : const __m256i *v_m0_b,
33 : const __m256i *v_m1_b,
34 : const int32_t bits)
35 : {
36 109072 : const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
37 109072 : const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
38 109072 : const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
39 109072 : const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
40 :
41 : const __m256i v_p0_w =
42 327216 : _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
43 : _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
44 :
45 109072 : const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
46 109072 : const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
47 109072 : const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
48 109072 : return v_res;
49 : }
50 :
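// Same scheme for a full 32-pixel row: unpacklo/unpackhi split the 32 bytes
// into two interleaved halves, each half is blended with maddubs and rounded,
// and the halves are re-packed into one 256-bit result.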
51 85973600 : static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0,
52 : const uint8_t *src1,
53 : const __m256i *v_m0_b,
54 : const __m256i *v_m1_b,
55 : const int32_t bits)
56 : {
57 85973600 : const __m256i v_s0_b = yy_loadu_256(src0);
58 85929700 : const __m256i v_s1_b = yy_loadu_256(src1);
59 :
60 : const __m256i v_p0_w =
61 257806000 : _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
62 : _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
63 : const __m256i v_p1_w =
64 257806000 : _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
65 : _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
66 :
67 85935400 : const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
68 85990900 : const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
69 85992000 : const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
70 85992000 : return v_res;
71 : }
72 :
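// Mask sub-sampled 2:1 in both directions: the two mask rows are added, the
// even and odd byte positions of the sum are combined, and yy_roundn_epu16
// by 2 yields the rounded average of each 2x2 block of mask values.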
73 9016 : static INLINE void blend_a64_mask_sx_sy_w16_avx2(
74 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
75 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
76 : const uint8_t *mask, uint32_t mask_stride, int h)
77 : {
78 9016 : const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
79 9016 : const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
80 : do {
81 109072 : const __m256i v_ral_b = yy_loadu_256(mask);
82 109072 : const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
83 109072 : const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
84 109072 : const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
85 : const __m256i v_rvsbl_w =
86 218144 : _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
87 109072 : const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
88 :
89 109072 : const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
90 109072 : const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
91 109072 : const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
92 :
93 109072 : const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
94 : AOM_BLEND_A64_ROUND_BITS);
95 :
96 109072 : xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
97 109072 : dst += dst_stride;
98 109072 : src0 += src0_stride;
99 109072 : src1 += src1_stride;
100 109072 : mask += 2 * mask_stride;
101 109072 : } while (--h);
102 9016 : }
103 :
104 0 : static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
105 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
106 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
107 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
108 : {
109 0 : const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
110 0 : const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
111 : do {
112 : int c;
113 0 : for (c = 0; c < w; c += 32) {
114 0 : const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
115 0 : const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
116 0 : const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
117 0 : const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
118 0 : const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
119 0 : const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
120 0 : const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
121 0 : const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
122 : const __m256i v_rvsbl_w =
123 0 : _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
124 : const __m256i v_rvsbh_w =
125 0 : _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
126 0 : const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
127 0 : const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
128 :
129 0 : const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
130 0 : const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
131 0 : const __m256i v_m0_b =
132 0 : _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
133 0 : const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
134 :
135 0 : const __m256i v_res_b = blend_32_u8_avx2(
136 : src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
137 :
138 0 : yy_storeu_256(dst + c, v_res_b);
139 : }
140 0 : dst += dst_stride;
141 0 : src0 += src0_stride;
142 0 : src1 += src1_stride;
143 0 : mask += 2 * mask_stride;
144 0 : } while (--h);
145 0 : }
146 :
147 71390 : static INLINE void blend_a64_mask_sx_sy_avx2(
148 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
149 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
150 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
151 : {
152 71390 : const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
153 71390 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
154 71390 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
155 71390 : switch (w) {
156 193848 : case 4:
157 : do {
158 193848 : const __m128i v_ra_b = xx_loadl_64(mask);
159 193848 : const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
160 193848 : const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
161 193848 : const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
162 193848 : const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
163 387696 : const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
164 193848 : const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
165 193848 : const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
166 193848 : const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
167 193848 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
168 :
169 193848 : const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
170 :
171 193848 : xx_storel_32(dst, v_res_b);
172 :
173 193848 : dst += dst_stride;
174 193848 : src0 += src0_stride;
175 193848 : src1 += src1_stride;
176 193848 : mask += 2 * mask_stride;
177 193848 : } while (--h);
178 36928 : break;
179 169434 : case 8:
180 : do {
181 194880 : const __m128i v_ra_b = xx_loadu_128(mask);
182 194880 : const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
183 194880 : const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
184 194880 : const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
185 194880 : const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
186 389760 : const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
187 194880 : const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
188 194880 : const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
189 194880 : const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
190 194880 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
191 :
192 194880 : const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
193 :
194 194880 : xx_storel_64(dst, v_res_b);
195 :
196 194880 : dst += dst_stride;
197 194880 : src0 += src0_stride;
198 194880 : src1 += src1_stride;
199 194880 : mask += 2 * mask_stride;
200 194880 : } while (--h);
201 25446 : break;
202 9016 : case 16:
203 9016 : blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
204 : src1_stride, mask, mask_stride, h);
205 9016 : break;
206 0 : default:
207 0 : blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
208 : src1_stride, mask, mask_stride, w, h);
209 0 : break;
210 : }
211 71390 : }
212 :
213 0 : static INLINE void blend_a64_mask_sx_w16_avx2(
214 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
215 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
216 : const uint8_t *mask, uint32_t mask_stride, int h)
217 : {
218 0 : const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
219 0 : const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
220 : do {
221 0 : const __m256i v_rl_b = yy_loadu_256(mask);
222 : const __m256i v_al_b =
223 0 : _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
224 :
225 0 : const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
226 0 : const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
227 0 : const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
228 :
229 0 : const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
230 : AOM_BLEND_A64_ROUND_BITS);
231 :
232 0 : xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
233 0 : dst += dst_stride;
234 0 : src0 += src0_stride;
235 0 : src1 += src1_stride;
236 0 : mask += mask_stride;
237 0 : } while (--h);
238 0 : }
239 :
240 0 : static INLINE void blend_a64_mask_sx_w32n_avx2(
241 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
242 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
243 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
244 : {
245 0 : const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
246 0 : const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
247 : do {
248 : int c;
249 0 : for (c = 0; c < w; c += 32) {
250 0 : const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
251 0 : const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
252 0 : const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
253 0 : const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
254 : const __m256i v_al_b =
255 0 : _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
256 : const __m256i v_ah_b =
257 0 : _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
258 :
259 0 : const __m256i v_m0_b =
260 0 : _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
261 0 : const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
262 :
263 0 : const __m256i v_res_b = blend_32_u8_avx2(
264 : src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
265 :
266 0 : yy_storeu_256(dst + c, v_res_b);
267 : }
268 0 : dst += dst_stride;
269 0 : src0 += src0_stride;
270 0 : src1 += src1_stride;
271 0 : mask += mask_stride;
272 0 : } while (--h);
273 0 : }
274 :
275 0 : static INLINE void blend_a64_mask_sx_avx2(
276 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
277 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
278 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
279 : {
280 0 : const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
281 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
282 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
283 0 : switch (w) {
284 0 : case 4:
285 : do {
286 0 : const __m128i v_r_b = xx_loadl_64(mask);
287 0 : const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
288 0 : const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
289 0 : const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
290 0 : const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
291 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
292 :
293 0 : const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
294 :
295 0 : xx_storel_32(dst, v_res_b);
296 :
297 0 : dst += dst_stride;
298 0 : src0 += src0_stride;
299 0 : src1 += src1_stride;
300 0 : mask += mask_stride;
301 0 : } while (--h);
302 0 : break;
303 0 : case 8:
304 : do {
305 0 : const __m128i v_r_b = xx_loadu_128(mask);
306 0 : const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
307 0 : const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
308 0 : const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
309 0 : const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
310 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
311 :
312 0 : const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
313 :
314 0 : xx_storel_64(dst, v_res_b);
315 :
316 0 : dst += dst_stride;
317 0 : src0 += src0_stride;
318 0 : src1 += src1_stride;
319 0 : mask += mask_stride;
320 0 : } while (--h);
321 0 : break;
322 0 : case 16:
323 0 : blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
324 : src1_stride, mask, mask_stride, h);
325 0 : break;
326 0 : default:
327 0 : blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
328 : src1_stride, mask, mask_stride, w, h);
329 0 : break;
330 : }
331 0 : }
332 :
333 0 : static INLINE void blend_a64_mask_sy_w16_avx2(
334 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
335 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
336 : const uint8_t *mask, uint32_t mask_stride, int h) {
337 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
338 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
339 : do {
340 0 : const __m128i v_ra_b = xx_loadu_128(mask);
341 0 : const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
342 0 : const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
343 :
344 0 : const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b);
345 0 : const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
346 :
347 0 : xx_storeu_128(dst, v_res_b);
348 0 : dst += dst_stride;
349 0 : src0 += src0_stride;
350 0 : src1 += src1_stride;
351 0 : mask += 2 * mask_stride;
352 0 : } while (--h);
353 0 : }
354 :
355 0 : static INLINE void blend_a64_mask_sy_w32n_avx2(
356 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
357 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
358 : const uint8_t *mask, uint32_t mask_stride, int w, int h) {
359 0 : const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
360 : do {
361 : int c;
362 0 : for (c = 0; c < w; c += 32) {
363 0 : const __m256i v_ra_b = yy_loadu_256(mask + c);
364 0 : const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
365 0 : const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
366 0 : const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
367 0 : const __m256i v_res_b = blend_32_u8_avx2(
368 : src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
369 :
370 0 : yy_storeu_256(dst + c, v_res_b);
371 : }
372 0 : dst += dst_stride;
373 0 : src0 += src0_stride;
374 0 : src1 += src1_stride;
375 0 : mask += 2 * mask_stride;
376 0 : } while (--h);
377 0 : }
378 :
379 0 : static INLINE void blend_a64_mask_sy_avx2(
380 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
381 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
382 : const uint8_t *mask, uint32_t mask_stride, int w, int h) {
383 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
384 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
385 0 : switch (w) {
386 0 : case 4:
387 : do {
388 0 : const __m128i v_ra_b = xx_loadl_32(mask);
389 0 : const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
390 0 : const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
391 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
392 0 : const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
393 :
394 0 : xx_storel_32(dst, v_res_b);
395 :
396 0 : dst += dst_stride;
397 0 : src0 += src0_stride;
398 0 : src1 += src1_stride;
399 0 : mask += 2 * mask_stride;
400 0 : } while (--h);
401 0 : break;
402 0 : case 8:
403 : do {
404 0 : const __m128i v_ra_b = xx_loadl_64(mask);
405 0 : const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
406 0 : const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
407 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
408 0 : const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
409 :
410 0 : xx_storel_64(dst, v_res_b);
411 :
412 0 : dst += dst_stride;
413 0 : src0 += src0_stride;
414 0 : src1 += src1_stride;
415 0 : mask += 2 * mask_stride;
416 0 : } while (--h);
417 0 : break;
418 0 : case 16:
419 0 : blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
420 : src1_stride, mask, mask_stride, h);
421 0 : break;
422 0 : default:
423 0 : blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
424 : src1_stride, mask, mask_stride, w, h);
425 : }
426 0 : }
427 :
428 3715530 : static INLINE void blend_a64_mask_w32n_avx2(
429 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
430 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
431 : const uint8_t *mask, uint32_t mask_stride, int w, int h) {
432 3715530 : const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
433 : do {
434 : int c;
435 171989000 : for (c = 0; c < w; c += 32) {
436 86004800 : const __m256i v_m0_b = yy_loadu_256(mask + c);
437 85989600 : const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
438 :
439 85989600 : const __m256i v_res_b = blend_32_u8_avx2(
440 : src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
441 :
442 85988500 : yy_storeu_256(dst + c, v_res_b);
443 : }
444 85983900 : dst += dst_stride;
445 85983900 : src0 += src0_stride;
446 85983900 : src1 += src1_stride;
447 85983900 : mask += mask_stride;
448 85983900 : } while (--h);
449 3701270 : }
450 :
451 29600200 : static INLINE void blend_a64_mask_avx2(
452 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
453 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
454 : const uint8_t *mask, uint32_t mask_stride, int w, int h) {
455 29600200 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
456 29600200 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
457 29600200 : switch (w) {
458 243998 : case 4:
459 : do {
460 243998 : const __m128i v_m0_b = xx_loadl_32(mask);
461 243998 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
462 243998 : const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
463 :
464 243997 : xx_storel_32(dst, v_res_b);
465 :
466 243998 : dst += dst_stride;
467 243998 : src0 += src0_stride;
468 243998 : src1 += src1_stride;
469 243998 : mask += mask_stride;
470 243998 : } while (--h);
471 46548 : break;
472 153878000 : case 8:
473 : do {
474 168454000 : const __m128i v_m0_b = xx_loadl_64(mask);
475 168312000 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
476 168312000 : const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
477 :
478 168342000 : xx_storel_64(dst, v_res_b);
479 :
480 168452000 : dst += dst_stride;
481 168452000 : src0 += src0_stride;
482 168452000 : src1 += src1_stride;
483 168452000 : mask += mask_stride;
484 168452000 : } while (--h);
485 14573700 : break;
486 161487000 : case 16:
487 : do {
488 172749000 : const __m128i v_m0_b = xx_loadu_128(mask);
489 172671000 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
490 172671000 : const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
491 :
492 172729000 : xx_storeu_128(dst, v_res_b);
493 172764000 : dst += dst_stride;
494 172764000 : src0 += src0_stride;
495 172764000 : src1 += src1_stride;
496 172764000 : mask += mask_stride;
497 172764000 : } while (--h);
498 11276900 : break;
499 3715590 : default:
500 3715590 : blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
501 : src1_stride, mask, mask_stride, w, h);
502 : }
503 29612700 : }
504 :
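// For reference, a minimal scalar sketch of the per-pixel blend that the SIMD
// kernels above vectorize. Illustration only -- blend_a64_pixel_example is not
// part of the covered file; it assumes AOM_BLEND_A64_MAX_ALPHA == 64 and
// AOM_BLEND_A64_ROUND_BITS == 6, so mask values lie in [0, 64].
static INLINE uint8_t blend_a64_pixel_example(uint8_t s0, uint8_t s1,
                                              uint8_t m) {
    // dst = round((m * s0 + (64 - m) * s1) / 64)
    return (uint8_t)((m * s0 + (64 - m) * s1 + 32) >> 6);
}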
505 29666300 : void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
506 : const uint8_t *src0, uint32_t src0_stride,
507 : const uint8_t *src1, uint32_t src1_stride,
508 : const uint8_t *mask, uint32_t mask_stride, int w,
509 : int h, int subx, int suby)
510 : {
511 29666300 : assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
512 29666300 : assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
513 :
514 29666300 : assert(h >= 1);
515 29666300 : assert(w >= 1);
516 29666300 : assert(IS_POWER_OF_TWO(h));
517 29666300 : assert(IS_POWER_OF_TWO(w));
518 :
519 29666300 : if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
520 0 : aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
521 : mask, mask_stride, w, h, subx, suby);
522 : }
523 : else {
524 29666300 : if (subx & suby) {
525 71390 : blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
526 : src1_stride, mask, mask_stride, w, h);
527 : }
528 29594900 : else if (subx) {
529 0 : blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
530 : src1_stride, mask, mask_stride, w, h);
531 : }
532 29594900 : else if (suby) {
533 0 : blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
534 : src1_stride, mask, mask_stride, w, h);
535 : }
536 : else {
537 29594900 : blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
538 : mask, mask_stride, w, h);
539 : }
540 : }
541 29674000 : }
542 :
543 : /* Functions from convolve_avx2.c */
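// The d16 kernels below blend 16-bit intermediate convolve outputs
// (CONV_BUF_TYPE) instead of u8 pixels: madd forms m*src0 + (64 - m)*src1 per
// pixel, the pre-computed round_offset (which folds in the compound offset
// and the rounding term) is subtracted, the sum is shifted right by `shift`,
// and the saturating packs clamp the result to u8.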
544 251250000 : static INLINE void blend_a64_d16_mask_w16_avx2(
545 : uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
546 : const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
547 : int shift)
548 : {
549 251250000 : const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
550 251250000 : const __m256i s0_0 = yy_loadu_256(src0);
551 251176000 : const __m256i s1_0 = yy_loadu_256(src1);
552 752532000 : __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
553 : _mm256_unpacklo_epi16(*m0, max_minus_m0));
554 752532000 : __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
555 : _mm256_unpackhi_epi16(*m0, max_minus_m0));
556 : res0_lo =
557 501688000 : _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
558 : res0_hi =
559 752532000 : _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
560 250844000 : const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
561 250844000 : __m256i res = _mm256_packus_epi16(res0, res0);
562 250844000 : res = _mm256_permute4x64_epi64(res, 0xd8);
563 250844000 : _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
564 250844000 : }
565 :
566 259858000 : static INLINE void blend_a64_d16_mask_w32_avx2(
567 : uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
568 : const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
569 : const __m256i *v_maxval, int shift)
570 : {
571 259858000 : const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
572 259858000 : const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
573 259858000 : const __m256i s0_0 = yy_loadu_256(src0);
574 259623000 : const __m256i s0_1 = yy_loadu_256(src0 + 16);
575 259079000 : const __m256i s1_0 = yy_loadu_256(src1);
576 258594000 : const __m256i s1_1 = yy_loadu_256(src1 + 16);
577 775074000 : __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
578 : _mm256_unpacklo_epi16(*m0, max_minus_m0));
579 775074000 : __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
580 : _mm256_unpackhi_epi16(*m0, max_minus_m0));
581 775074000 : __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
582 : _mm256_unpacklo_epi16(*m1, max_minus_m1));
583 775074000 : __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
584 : _mm256_unpackhi_epi16(*m1, max_minus_m1));
585 : res0_lo =
586 516716000 : _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
587 : res0_hi =
588 516716000 : _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
589 : res1_lo =
590 516716000 : _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
591 : res1_hi =
592 775074000 : _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
593 258358000 : const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
594 258358000 : const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
595 258358000 : __m256i res = _mm256_packus_epi16(res0, res1);
596 258358000 : res = _mm256_permute4x64_epi64(res, 0xd8);
597 : _mm256_storeu_si256((__m256i *)(dst), res);
598 258358000 : }
599 :
600 13567500 : static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
601 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
602 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
603 : const uint8_t *mask, uint32_t mask_stride, int h,
604 : const __m256i *round_offset, int shift)
605 : {
606 13567500 : const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
607 263514000 : for (int i = 0; i < h; ++i) {
608 250036000 : const __m128i m = xx_loadu_128(mask);
609 249866000 : const __m256i m0 = _mm256_cvtepu8_epi16(m);
610 :
611 249866000 : blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
612 : shift);
613 249946000 : mask += mask_stride;
614 249946000 : dst += dst_stride;
615 249946000 : src0 += src0_stride;
616 249946000 : src1 += src1_stride;
617 : }
618 13478100 : }
619 :
620 8457080 : static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
621 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
622 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
623 : const uint8_t *mask, uint32_t mask_stride, int h, int w,
624 : const __m256i *round_offset, int shift)
625 : {
626 8457080 : const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
627 214734000 : for (int i = 0; i < h; ++i) {
628 466100000 : for (int j = 0; j < w; j += 32) {
629 259823000 : const __m256i m = yy_loadu_256(mask + j);
630 259652000 : const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
631 259652000 : const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
632 :
633 259652000 : blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
634 : round_offset, &v_maxval, shift);
635 : }
636 206277000 : mask += mask_stride;
637 206277000 : dst += dst_stride;
638 206277000 : src0 += src0_stride;
639 206277000 : src1 += src1_stride;
640 : }
641 8483450 : }
642 :
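// Mask sub-sampled 2:1 in both directions for the d16 path: the two mask rows
// are added with unsigned saturation, maddubs against a vector of ones sums
// horizontally adjacent bytes, and (x + 2) >> 2 gives the rounded average of
// each 2x2 block of mask values.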
643 118618 : static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
644 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
645 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
646 : const uint8_t *mask, uint32_t mask_stride, int h,
647 : const __m256i *round_offset, int shift)
648 : {
649 118618 : const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
650 118618 : const __m256i one_b = _mm256_set1_epi8(1);
651 118618 : const __m256i two_w = _mm256_set1_epi16(2);
652 1575040 : for (int i = 0; i < h; ++i) {
653 1456420 : const __m256i m_i00 = yy_loadu_256(mask);
654 1456420 : const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
655 :
656 1456420 : const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
657 1456420 : const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
658 1456420 : const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
659 :
660 1456420 : blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
661 : shift);
662 1456430 : mask += mask_stride << 1;
663 1456430 : dst += dst_stride;
664 1456430 : src0 += src0_stride;
665 1456430 : src1 += src1_stride;
666 : }
667 118620 : }
668 :
669 41488 : static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
670 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
671 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
672 : const uint8_t *mask, uint32_t mask_stride, int h, int w,
673 : const __m256i *round_offset, int shift)
674 : {
675 41488 : const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
676 41488 : const __m256i one_b = _mm256_set1_epi8(1);
677 41488 : const __m256i two_w = _mm256_set1_epi16(2);
678 662682 : for (int i = 0; i < h; ++i) {
679 1242390 : for (int j = 0; j < w; j += 32) {
680 621193 : const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
681 621191 : const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
682 621185 : const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
683 621179 : const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
684 :
685 621175 : const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
686 621175 : const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
687 621175 : const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
688 621175 : const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
689 1242350 : const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
690 621175 : const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
691 :
692 621175 : blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
693 : round_offset, &v_maxval, shift);
694 : }
695 621194 : mask += mask_stride << 1;
696 621194 : dst += dst_stride;
697 621194 : src0 += src0_stride;
698 621194 : src1 += src1_stride;
699 : }
700 41489 : }
701 :
702 0 : static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
703 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
704 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
705 : const uint8_t *mask, uint32_t mask_stride, int h, int w,
706 : const __m256i *round_offset, int shift)
707 : {
708 0 : const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
709 0 : const __m256i one_b = _mm256_set1_epi8(1);
710 0 : const __m256i zeros = _mm256_setzero_si256();
711 0 : for (int i = 0; i < h; ++i) {
712 0 : for (int j = 0; j < w; j += 16) {
713 0 : const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
714 0 : const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
715 0 : const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
716 :
717 0 : blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
718 : round_offset, &v_maxval, shift);
719 : }
720 0 : mask += mask_stride;
721 0 : dst += dst_stride;
722 0 : src0 += src0_stride;
723 0 : src1 += src1_stride;
724 : }
725 0 : }
726 :
727 0 : static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
728 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
729 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
730 : const uint8_t *mask, uint32_t mask_stride, int h, int w,
731 : const __m256i *round_offset, int shift)
732 : {
733 0 : const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
734 0 : const __m256i one_b = _mm256_set1_epi8(1);
735 0 : const __m256i zeros = _mm256_setzero_si256();
736 0 : for (int i = 0; i < h; ++i) {
737 0 : for (int j = 0; j < w; j += 32) {
738 0 : const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
739 0 : const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
740 0 : const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
741 0 : const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
742 0 : const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
743 0 : const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
744 :
745 0 : blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
746 : round_offset, &v_maxval, shift);
747 : }
748 0 : mask += mask_stride;
749 0 : dst += dst_stride;
750 0 : src0 += src0_stride;
751 0 : src1 += src1_stride;
752 : }
753 0 : }
754 :
755 0 : static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
756 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
757 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
758 : const uint8_t *mask, uint32_t mask_stride, int h, int w,
759 : const __m256i *round_offset, int shift)
760 : {
761 0 : const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
762 0 : const __m128i zeros = _mm_setzero_si128();
763 0 : for (int i = 0; i < h; ++i) {
764 0 : for (int j = 0; j < w; j += 16) {
765 0 : const __m128i m_i00 = xx_loadu_128(mask + j);
766 0 : const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
767 :
768 0 : const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
769 0 : const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
770 :
771 0 : blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
772 : round_offset, &v_maxval, shift);
773 : }
774 0 : mask += mask_stride << 1;
775 0 : dst += dst_stride;
776 0 : src0 += src0_stride;
777 0 : src1 += src1_stride;
778 : }
779 0 : }
780 :
781 0 : static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
782 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
783 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
784 : const uint8_t *mask, uint32_t mask_stride, int h, int w,
785 : const __m256i *round_offset, int shift)
786 : {
787 0 : const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
788 0 : const __m256i zeros = _mm256_setzero_si256();
789 0 : for (int i = 0; i < h; ++i) {
790 0 : for (int j = 0; j < w; j += 32) {
791 0 : const __m256i m_i00 = yy_loadu_256(mask + j);
792 0 : const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
793 :
794 : const __m256i m_ac =
795 0 : _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
796 0 : const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
797 0 : const __m256i m1 =
798 0 : _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
799 :
800 0 : blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
801 : round_offset, &v_maxval, shift);
802 : }
803 0 : mask += mask_stride << 1;
804 0 : dst += dst_stride;
805 0 : src0 += src0_stride;
806 0 : src1 += src1_stride;
807 : }
808 0 : }
809 :
810 39849000 : void aom_lowbd_blend_a64_d16_mask_avx2(
811 : uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
812 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
813 : const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
814 : ConvolveParams *conv_params)
815 : {
816 39849000 : const int bd = 8;
817 39849000 : const int round_bits =
818 39849000 : 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
819 :
820 39849000 : const int round_offset =
821 39849000 : ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
822 39849000 : (1 << (round_bits - 1)))
823 : << AOM_BLEND_A64_ROUND_BITS;
824 :
825 39849000 : const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
826 39849000 : assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
827 39849000 : assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
828 :
829 39849000 : assert(h >= 4);
830 39849000 : assert(w >= 4);
831 39849000 : assert(IS_POWER_OF_TWO(h));
832 39849000 : assert(IS_POWER_OF_TWO(w));
833 39849000 : const __m128i v_round_offset = _mm_set1_epi32(round_offset);
834 39849000 : const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
835 :
836 39849000 : if (subw == 0 && subh == 0) {
837 39438800 : switch (w) {
838 0 : case 4:
839 0 : aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
840 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
841 : mask_stride, h, &v_round_offset, shift);
842 0 : break;
843 17434500 : case 8:
844 17434500 : aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
845 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
846 : mask_stride, h, &v_round_offset, shift);
847 17435200 : break;
848 13568300 : case 16:
849 13568300 : lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
850 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
851 : mask_stride, h, &y_round_offset, shift);
852 13569100 : break;
853 8436110 : default:
854 8436110 : lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
855 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
856 : mask_stride, h, w, &y_round_offset, shift);
857 8458210 : break;
858 : }
859 : }
860 410126 : else if (subw == 1 && subh == 1) {
861 420727 : switch (w) {
862 112766 : case 4:
863 112766 : aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
864 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
865 : mask_stride, h, &v_round_offset, shift);
866 112766 : break;
867 147856 : case 8:
868 147856 : aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
869 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
870 : mask_stride, h, &v_round_offset, shift);
871 147856 : break;
872 118618 : case 16:
873 118618 : lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
874 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
875 : mask_stride, h, &y_round_offset, shift);
876 118618 : break;
877 41487 : default:
878 41487 : lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
879 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
880 : mask_stride, h, w, &y_round_offset, shift);
881 41488 : break;
882 : }
883 : }
884 0 : else if (subw == 1 && subh == 0) {
885 0 : switch (w) {
886 0 : case 4:
887 0 : aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
888 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
889 : mask_stride, h, &v_round_offset, shift);
890 0 : break;
891 0 : case 8:
892 0 : aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
893 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
894 : mask_stride, h, &v_round_offset, shift);
895 0 : break;
896 0 : case 16:
897 0 : lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
898 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
899 : mask_stride, h, w, &y_round_offset, shift);
900 0 : break;
901 0 : default:
902 0 : lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
903 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
904 : mask_stride, h, w, &y_round_offset, shift);
905 0 : break;
906 : }
907 : }
908 : else {
909 0 : switch (w) {
910 0 : case 4:
911 0 : aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
912 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
913 : mask_stride, h, &v_round_offset, shift);
914 0 : break;
915 0 : case 8:
916 0 : aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
917 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
918 : mask_stride, h, &v_round_offset, shift);
919 0 : break;
920 0 : case 16:
921 0 : lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
922 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
923 : mask_stride, h, w, &y_round_offset, shift);
924 0 : break;
925 0 : default:
926 0 : lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
927 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
928 : mask_stride, h, w, &y_round_offset, shift);
929 0 : break;
930 : }
931 : }
932 39883200 : }
933 :
934 : //////////////////////////////////////////////////////////////////////////////
935 : // aom_highbd_blend_a64_d16_mask_avx2()
936 : //////////////////////////////////////////////////////////////////////////////
937 :
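// The high-bitdepth d16 kernels need full 32-bit products, so each
// mask * source multiply is done as a mulhi/mullo pair whose halves are
// re-interleaved with unpack. After subtracting the round offset and
// shifting, the results are packed with signed saturation and clipped to
// [0, (1 << bd) - 1].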
938 0 : static INLINE void highbd_blend_a64_d16_mask_w4_avx2(
939 : uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
940 : const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0,
941 : const __m256i *round_offset, int shift, const __m256i *clip_low,
942 : const __m256i *clip_high, const __m256i *mask_max)
943 : {
944 : // Load 4x u16 pixels from each of 4 rows from each source
945 0 : const __m256i s0 = _mm256_set_epi64x(*(uint64_t *)(src0 + 3 * src0_stride),
946 0 : *(uint64_t *)(src0 + 2 * src0_stride),
947 0 : *(uint64_t *)(src0 + 1 * src0_stride),
948 0 : *(uint64_t *)(src0 + 0 * src0_stride));
949 0 : const __m256i s1 = _mm256_set_epi64x(*(uint64_t *)(src1 + 3 * src1_stride),
950 0 : *(uint64_t *)(src1 + 2 * src1_stride),
951 0 : *(uint64_t *)(src1 + 1 * src1_stride),
952 0 : *(uint64_t *)(src1 + 0 * src1_stride));
953 : // Generate the inverse mask
954 0 : const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
955 :
956 : // Multiply each mask by the respective source
957 0 : const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0);
958 0 : const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0);
959 0 : const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs);
960 0 : const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs);
961 : // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
962 : // lanes. Later, packs does the same again, which cancels this out with no need
963 : // for a permute. The intermediate values being reordered makes no difference.
964 :
965 0 : const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1);
966 0 : const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1);
967 0 : const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs);
968 0 : const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs);
969 :
970 0 : const __m256i sumh = _mm256_add_epi32(mul0h, mul1h);
971 0 : const __m256i suml = _mm256_add_epi32(mul0l, mul1l);
972 :
973 : const __m256i roundh =
974 0 : _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift);
975 : const __m256i roundl =
976 0 : _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift);
977 :
978 0 : const __m256i pack = _mm256_packs_epi32(roundl, roundh);
979 : const __m256i clip =
980 0 : _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high);
981 :
982 : // _mm256_extract_epi64 is not available on 32-bit x86, so do it the old-fashioned way:
983 0 : const __m128i cliph = _mm256_extracti128_si256(clip, 1);
984 0 : xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8));
985 0 : xx_storel_64(dst + 2 * dst_stride, cliph);
986 0 : const __m128i clipl = _mm256_castsi256_si128(clip);
987 0 : xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8));
988 0 : xx_storel_64(dst + 0 * dst_stride, clipl);
989 0 : }
990 :
991 0 : static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
992 : uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
993 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
994 : const uint8_t *mask, uint32_t mask_stride, int h,
995 : const __m256i *round_offset, int shift, const __m256i *clip_low,
996 : const __m256i *clip_high, const __m256i *mask_max)
997 : {
998 : do {
999 : // Load 4x u8 pixels from each of 4 rows of the mask, pad each to u16
1000 0 : const __m128i mask08 = _mm_set_epi32(*(uint32_t *)(mask + 3 * mask_stride),
1001 0 : *(uint32_t *)(mask + 2 * mask_stride),
1002 0 : *(uint32_t *)(mask + 1 * mask_stride),
1003 0 : *(uint32_t *)(mask + 0 * mask_stride));
1004 0 : const __m256i mask0 = _mm256_cvtepu8_epi16(mask08);
1005 :
1006 0 : highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
1007 : src1_stride, &mask0, round_offset, shift,
1008 : clip_low, clip_high, mask_max);
1009 :
1010 0 : dst += dst_stride * 4;
1011 0 : src0 += src0_stride * 4;
1012 0 : src1 += src1_stride * 4;
1013 0 : mask += mask_stride * 4;
1014 0 : } while (h -= 4);
1015 0 : }
1016 :
1017 0 : static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
1018 : uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
1019 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
1020 : const uint8_t *mask, uint32_t mask_stride, int h,
1021 : const __m256i *round_offset, int shift, const __m256i *clip_low,
1022 : const __m256i *clip_high, const __m256i *mask_max)
1023 : {
1024 0 : const __m256i one_b = _mm256_set1_epi8(1);
1025 0 : const __m256i two_w = _mm256_set1_epi16(2);
1026 : do {
1027 : // Load 8 pixels from each of 8 rows of mask,
1028 : // (saturating) add together rows then use madd to add adjacent pixels
1029 : // Finally, divide each value by 4 (with rounding)
1030 : const __m256i m0246 =
1031 0 : _mm256_set_epi64x(*(uint64_t *)(mask + 6 * mask_stride),
1032 0 : *(uint64_t *)(mask + 4 * mask_stride),
1033 0 : *(uint64_t *)(mask + 2 * mask_stride),
1034 0 : *(uint64_t *)(mask + 0 * mask_stride));
1035 : const __m256i m1357 =
1036 0 : _mm256_set_epi64x(*(uint64_t *)(mask + 7 * mask_stride),
1037 0 : *(uint64_t *)(mask + 5 * mask_stride),
1038 0 : *(uint64_t *)(mask + 3 * mask_stride),
1039 0 : *(uint64_t *)(mask + 1 * mask_stride));
1040 0 : const __m256i addrows = _mm256_adds_epu8(m0246, m1357);
1041 0 : const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b);
1042 0 : const __m256i mask0 =
1043 0 : _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2);
1044 :
1045 0 : highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
1046 : src1_stride, &mask0, round_offset, shift,
1047 : clip_low, clip_high, mask_max);
1048 :
1049 0 : dst += dst_stride * 4;
1050 0 : src0 += src0_stride * 4;
1051 0 : src1 += src1_stride * 4;
1052 0 : mask += mask_stride * 8;
1053 0 : } while (h -= 4);
1054 0 : }
1055 :
1056 0 : static INLINE void highbd_blend_a64_d16_mask_w8_avx2(
1057 : uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
1058 : const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
1059 : const __m256i *mask0b, const __m256i *round_offset, int shift,
1060 : const __m256i *clip_low, const __m256i *clip_high,
1061 : const __m256i *mask_max)
1062 : {
1063 : // Load 8x u16 pixels from each of 4 rows from each source
1064 : const __m256i s0a =
1065 0 : yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride);
1066 : const __m256i s0b =
1067 0 : yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
1068 : const __m256i s1a =
1069 0 : yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride);
1070 : const __m256i s1b =
1071 0 : yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
1072 :
1073 : // Generate inverse masks
1074 0 : const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
1075 0 : const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
1076 :
1077 : // Multiply sources by respective masks
1078 0 : const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
1079 0 : const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
1080 0 : const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
1081 0 : const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
1082 : // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
1083 : // lanes. Later, packs does the same again, which cancels this out with no need
1084 : // for a permute. The intermediate values being reordered makes no difference.
1085 :
1086 0 : const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
1087 0 : const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
1088 0 : const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
1089 0 : const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
1090 :
1091 0 : const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah);
1092 0 : const __m256i sumal = _mm256_add_epi32(mul0al, mul1al);
1093 :
1094 0 : const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
1095 0 : const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
1096 0 : const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
1097 0 : const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
1098 :
1099 0 : const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
1100 0 : const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
1101 0 : const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
1102 0 : const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
1103 :
1104 0 : const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh);
1105 0 : const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl);
1106 :
1107 : // Divide down each result, with rounding
1108 : const __m256i roundah =
1109 0 : _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift);
1110 : const __m256i roundal =
1111 0 : _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift);
1112 : const __m256i roundbh =
1113 0 : _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift);
1114 : const __m256i roundbl =
1115 0 : _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift);
1116 :
1117 : // Pack each i32 down to an i16 with saturation, then clip to valid range
1118 0 : const __m256i packa = _mm256_packs_epi32(roundal, roundah);
1119 : const __m256i clipa =
1120 0 : _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
1121 0 : const __m256i packb = _mm256_packs_epi32(roundbl, roundbh);
1122 : const __m256i clipb =
1123 0 : _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
1124 :
1125 : // Store 8x u16 pixels to each of 4 rows in the destination
1126 0 : yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa);
1127 0 : yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb);
1128 0 : }
1129 :
1130 0 : static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
1131 : uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
1132 : const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
1133 : int mask_stride, int h, const __m256i *round_offset, int shift,
1134 : const __m256i *clip_low, const __m256i *clip_high,
1135 : const __m256i *mask_max)
1136 : {
1137 : do {
1138 : // Load 8x u8 pixels from each of 4 rows in the mask
1139 : const __m128i mask0a8 =
1140 0 : _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride));
1141 : const __m128i mask0b8 =
1142 0 : _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride),
1143 0 : *(uint64_t *)(mask + 3 * mask_stride));
1144 0 : const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
1145 0 : const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
1146 :
1147 0 : highbd_blend_a64_d16_mask_w8_avx2(
1148 : dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
1149 : round_offset, shift, clip_low, clip_high, mask_max);
1150 :
1151 0 : dst += dst_stride * 4;
1152 0 : src0 += src0_stride * 4;
1153 0 : src1 += src1_stride * 4;
1154 0 : mask += mask_stride * 4;
1155 0 : } while (h -= 4);
1156 0 : }
1157 :
1158 0 : static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
1159 : uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
1160 : const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
1161 : int mask_stride, int h, const __m256i *round_offset, int shift,
1162 : const __m256i *clip_low, const __m256i *clip_high,
1163 : const __m256i *mask_max)
1164 : {
1165 0 : const __m256i one_b = _mm256_set1_epi8(1);
1166 0 : const __m256i two_w = _mm256_set1_epi16(2);
1167 : do {
1168 : // Load 16x u8 pixels from each of 8 rows in the mask,
1169 : // (saturating) add together rows then use madd to add adjacent pixels
1170 : // Finally, divide each value by 4 (with rounding)
1171 : const __m256i m02 =
1172 0 : yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride);
1173 : const __m256i m13 =
1174 0 : yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride);
1175 : const __m256i m0123 =
1176 0 : _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b);
1177 0 : const __m256i mask_0a =
1178 0 : _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2);
1179 : const __m256i m46 =
1180 0 : yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride);
1181 : const __m256i m57 =
1182 0 : yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride);
1183 : const __m256i m4567 =
1184 0 : _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b);
1185 0 : const __m256i mask_0b =
1186 0 : _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2);
1187 :
1188 0 : highbd_blend_a64_d16_mask_w8_avx2(
1189 : dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
1190 : &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
1191 :
1192 0 : dst += dst_stride * 4;
1193 0 : src0 += src0_stride * 4;
1194 0 : src1 += src1_stride * 4;
1195 0 : mask += mask_stride * 8;
1196 0 : } while (h -= 4);
1197 0 : }
1198 :
1199 0 : static INLINE void highbd_blend_a64_d16_mask_w16_avx2(
1200 : uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
1201 : const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
1202 : const __m256i *mask0b, const __m256i *round_offset, int shift,
1203 : const __m256i *clip_low, const __m256i *clip_high,
1204 : const __m256i *mask_max)
1205 : {
1206 : // Load 16x pixels from each of 2 rows from each source
1207 0 : const __m256i s0a = yy_loadu_256(src0);
1208 0 : const __m256i s0b = yy_loadu_256(src0 + src0_stride);
1209 0 : const __m256i s1a = yy_loadu_256(src1);
1210 0 : const __m256i s1b = yy_loadu_256(src1 + src1_stride);
1211 :
1212 : // Calculate inverse masks
1213 0 : const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
1214 0 : const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
1215 :
1216 : // Multiply each source by appropriate mask
1217 0 : const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
1218 0 : const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
1219 0 : const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
1220 0 : const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
1221 : // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
1222 : // lanes. Later, packs does the same again, which cancels this out with no need
1223 : // for a permute. The intermediate values being reordered makes no difference.
1224 :
1225 0 : const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
1226 0 : const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
1227 0 : const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
1228 0 : const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
1229 :
1230 0 : const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah);
1231 0 : const __m256i mulal = _mm256_add_epi32(mul0al, mul1al);
1232 :
1233 0 : const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
1234 0 : const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
1235 0 : const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
1236 0 : const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
1237 :
1238 0 : const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
1239 0 : const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
1240 0 : const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
1241 0 : const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
1242 :
1243 0 : const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh);
1244 0 : const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl);
1245 :
1246 : const __m256i resah =
1247 0 : _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift);
1248 : const __m256i resal =
1249 0 : _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift);
1250 : const __m256i resbh =
1251 0 : _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift);
1252 : const __m256i resbl =
1253 0 : _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift);
1254 :
1255 : // Signed saturating pack from i32 to i16:
1256 0 : const __m256i packa = _mm256_packs_epi32(resal, resah);
1257 0 : const __m256i packb = _mm256_packs_epi32(resbl, resbh);
1258 :
1259 : // Clip the values to the valid range
1260 : const __m256i clipa =
1261 0 : _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
1262 : const __m256i clipb =
1263 0 : _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
1264 :
1265 : // Store 16 pixels
1266 0 : yy_storeu_256(dst, clipa);
1267 0 : yy_storeu_256(dst + dst_stride, clipb);
1268 0 : }
1269 :
1270 0 : static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
1271 : uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
1272 : const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
1273 : int mask_stride, int h, int w, const __m256i *round_offset, int shift,
1274 : const __m256i *clip_low, const __m256i *clip_high,
1275 : const __m256i *mask_max)
1276 : {
1277 0 : for (int i = 0; i < h; i += 2) {
1278 0 : for (int j = 0; j < w; j += 16) {
1279 : // Load 16x u8 alpha-mask values from each of two rows and pad to u16
1280 0 : const __m128i masks_a8 = xx_loadu_128(mask + j);
1281 0 : const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j);
1282 0 : const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8);
1283 0 : const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8);
1284 :
1285 0 : highbd_blend_a64_d16_mask_w16_avx2(
1286 0 : dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
1287 : &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
1288 : }
1289 0 : dst += dst_stride * 2;
1290 0 : src0 += src0_stride * 2;
1291 0 : src1 += src1_stride * 2;
1292 0 : mask += mask_stride * 2;
1293 : }
1294 0 : }
1295 :
1296 0 : static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
1297 : uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
1298 : const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
1299 : int mask_stride, int h, int w, const __m256i *round_offset, int shift,
1300 : const __m256i *clip_low, const __m256i *clip_high,
1301 : const __m256i *mask_max)
1302 : {
1303 0 : const __m256i one_b = _mm256_set1_epi8(1);
1304 0 : const __m256i two_w = _mm256_set1_epi16(2);
1305 0 : for (int i = 0; i < h; i += 2) {
1306 0 : for (int j = 0; j < w; j += 16) {
1307 : // Load 32x u8 alpha-mask values from each of four rows
1308 : // (saturating) add pairs of rows, then use madd to add adjacent values
1309 : // Finally, divide down each result with rounding
1310 0 : const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j);
1311 0 : const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j);
1312 0 : const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j);
1313 0 : const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j);
1314 :
1315 0 : const __m256i m01_8 = _mm256_adds_epu8(m0, m1);
1316 0 : const __m256i m23_8 = _mm256_adds_epu8(m2, m3);
1317 :
1318 0 : const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b);
1319 0 : const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b);
1320 :
1321 0 : const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2);
1322 0 : const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2);
1323 :
1324 0 : highbd_blend_a64_d16_mask_w16_avx2(
1325 0 : dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
1326 : &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
1327 : }
1328 0 : dst += dst_stride * 2;
1329 0 : src0 += src0_stride * 2;
1330 0 : src1 += src1_stride * 2;
1331 0 : mask += mask_stride * 4;
1332 : }
1333 0 : }
1334 :
1335 0 : void aom_highbd_blend_a64_d16_mask_avx2(
1336 : uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
1337 : uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
1338 : const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
1339 : ConvolveParams *conv_params, const int bd)
1340 : {
1341 0 : uint16_t *dst = (uint16_t *)(dst8); // CONVERT_TO_SHORTPTR(dst8);
1342 0 : const int round_bits =
1343 0 : 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
1344 0 : const int32_t round_offset =
1345 0 : ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
1346 0 : (1 << (round_bits - 1)))
1347 : << AOM_BLEND_A64_ROUND_BITS;
1348 0 : const __m256i v_round_offset = _mm256_set1_epi32(round_offset);
1349 0 : const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
1350 :
1351 0 : const __m256i clip_low = _mm256_set1_epi16(0);
1352 0 : const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1);
1353 0 : const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
1354 :
1355 0 : assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
1356 0 : assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
1357 :
1358 0 : assert(h >= 4);
1359 0 : assert(w >= 4);
1360 0 : assert(IS_POWER_OF_TWO(h));
1361 0 : assert(IS_POWER_OF_TWO(w));
1362 :
1363 0 : if (subw == 0 && subh == 0) {
1364 0 : switch (w) {
1365 0 : case 4:
1366 0 : highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
1367 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1368 : mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
1369 : &mask_max);
1370 0 : break;
1371 0 : case 8:
1372 0 : highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
1373 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1374 : mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
1375 : &mask_max);
1376 0 : break;
1377 0 : default: // >= 16
1378 0 : highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
1379 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1380 : mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
1381 : &mask_max);
1382 0 : break;
1383 : }
1384 :
1385 : }
1386 0 : else if (subw == 1 && subh == 1) {
1387 0 : switch (w) {
1388 0 : case 4:
1389 0 : highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
1390 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1391 : mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
1392 : &mask_max);
1393 0 : break;
1394 0 : case 8:
1395 0 : highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
1396 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1397 : mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
1398 : &mask_max);
1399 0 : break;
1400 0 : default: // >= 16
1401 0 : highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
1402 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1403 : mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
1404 : &mask_max);
1405 0 : break;
1406 : }
1407 : }
1408 : else {
1409 : // Sub-sampling in only one axis doesn't seem to happen very much, so fall
1410 : // back to the vanilla C implementation instead of having all the optimised
1411 : // code for these.
1412 0 : aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
1413 : src1_stride, mask, mask_stride, w, h, subw,
1414 : subh, conv_params, bd);
1415 : }
1416 0 : }
|