Line data Source code
1 : /*
2 : * Copyright (c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : /*
7 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
8 : *
9 : * This source code is subject to the terms of the BSD 2 Clause License and
10 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
11 : * was not distributed with this source code in the LICENSE file, you can
12 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
13 : * Media Patent License 1.0 was not distributed with this source code in the
14 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
15 : */
16 :
17 : #include <assert.h>
18 : #include <smmintrin.h>
19 :
20 : #include "EbDefinitions.h"
21 :
22 : #include "EbBlend_sse4.h"
23 :
24 : #include "aom_dsp_rtcd.h"
25 :
26 : //////////////////////////////////////////////////////////////////////////////
27 : // No sub-sampling
28 : //////////////////////////////////////////////////////////////////////////////
29 :
30 6631380 : static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
31 : const uint8_t *src0, uint32_t src0_stride,
32 : const uint8_t *src1, uint32_t src1_stride,
33 : const uint8_t *mask, uint32_t mask_stride,
34 : int w, int h)
35 : {
36 : (void)w;
37 6631380 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
38 6631380 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
39 : do {
40 89012800 : const __m128i v_m0_b = xx_loadl_32(mask);
41 89003300 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
42 89003300 : const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
43 89018900 : xx_storel_32(dst, v_res_b);
44 :
45 89012300 : dst += dst_stride;
46 89012300 : src0 += src0_stride;
47 89012300 : src1 += src1_stride;
48 89012300 : mask += mask_stride;
49 89012300 : } while (--h);
50 6630880 : }
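
/*
 * For reference, a scalar sketch (not part of the library) of the per-pixel
 * arithmetic the kernels in this file implement. It assumes the libaom
 * constants AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6.
 * The _r constant of 1 << (15 - AOM_BLEND_A64_ROUND_BITS) above matches
 * _mm_mulhrs_epi16-style rounding: mulhrs by 2^9 computes
 * (x * 2^10 + 2^15) >> 16 == (x + 32) >> 6, i.e. the rounding below.
 */
static inline uint8_t blend_a64_px(uint8_t m, uint8_t s0, uint8_t s1) {
    /* Convex combination with 6-bit alpha, 0 <= m <= 64. */
    return (uint8_t)((m * s0 + (64 - m) * s1 + 32) >> 6);
}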
51 :
52 4184930 : static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
53 : const uint8_t *src0, uint32_t src0_stride,
54 : const uint8_t *src1, uint32_t src1_stride,
55 : const uint8_t *mask, uint32_t mask_stride,
56 : int w, int h)
57 : {
58 : (void)w;
59 4184930 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
60 4184930 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
61 : do {
62 80632500 : const __m128i v_m0_b = xx_loadl_64(mask);
63 80624600 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
64 80624600 : const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
65 80647200 : xx_storel_64(dst, v_res_b);
66 :
67 80632500 : dst += dst_stride;
68 80632500 : src0 += src0_stride;
69 80632500 : src1 += src1_stride;
70 80632500 : mask += mask_stride;
71 80632500 : } while (--h);
72 4184880 : }
73 :
74 3208660 : static void blend_a64_mask_w16n_sse4_1(
75 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
76 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
77 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
78 : {
79 3208660 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
80 3208660 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
81 :
82 : do {
83 : int c;
84 158523000 : for (c = 0; c < w; c += 16) {
85 89210500 : const __m128i v_m0_b = xx_loadu_128(mask + c);
86 89180200 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
87 :
88 : const __m128i v_res_b =
89 89180200 : blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
90 :
91 89214500 : xx_storeu_128(dst + c, v_res_b);
92 : }
93 69312600 : dst += dst_stride;
94 69312600 : src0 += src0_stride;
95 69312600 : src1 += src1_stride;
96 69312600 : mask += mask_stride;
97 69312600 : } while (--h);
98 3211290 : }
99 :
100 : //////////////////////////////////////////////////////////////////////////////
101 : // Horizontal sub-sampling
102 : //////////////////////////////////////////////////////////////////////////////
103 :
104 0 : static void blend_a64_mask_sx_w4_sse4_1(
105 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
106 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
107 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
108 : {
109 : (void)w;
110 :
111 0 : const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
112 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
113 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
114 : do {
115 0 : const __m128i v_r_b = xx_loadl_64(mask);
116 0 : const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
117 0 : const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
118 0 : const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
119 0 : const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
120 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
121 :
122 0 : const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
123 0 : xx_storel_32(dst, v_res_b);
124 :
125 0 : dst += dst_stride;
126 0 : src0 += src0_stride;
127 0 : src1 += src1_stride;
128 0 : mask += mask_stride;
129 0 : } while (--h);
130 0 : }
131 :
132 0 : static void blend_a64_mask_sx_w8_sse4_1(
133 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
134 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
135 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
136 : {
137 : (void)w;
138 :
139 0 : const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
140 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
141 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
142 : do {
143 0 : const __m128i v_r_b = xx_loadu_128(mask);
144 0 : const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
145 0 : const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
146 0 : const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
147 0 : const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
148 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
149 :
150 0 : const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
151 :
152 0 : xx_storel_64(dst, v_res_b);
153 :
154 0 : dst += dst_stride;
155 0 : src0 += src0_stride;
156 0 : src1 += src1_stride;
157 0 : mask += mask_stride;
158 0 : } while (--h);
159 0 : }
160 :
161 0 : static void blend_a64_mask_sx_w16n_sse4_1(
162 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
163 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
164 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
165 : {
166 0 : const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
167 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
168 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
169 :
170 : do {
171 : int c;
172 0 : for (c = 0; c < w; c += 16) {
173 0 : const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
174 0 : const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
175 0 : const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
176 0 : const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
177 0 : const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
178 0 : const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
179 0 : const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
180 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
181 :
182 : const __m128i v_res_b =
183 0 : blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
184 :
185 0 : xx_storeu_128(dst + c, v_res_b);
186 : }
187 0 : dst += dst_stride;
188 0 : src0 += src0_stride;
189 0 : src1 += src1_stride;
190 0 : mask += mask_stride;
191 0 : } while (--h);
192 0 : }
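
/*
 * The sx kernels above consume a mask that is twice the output width and
 * average each pair of horizontally adjacent bytes (the shuffle/unpack
 * sequence gathers even and odd bytes so _mm_avg_epu8 can pair them). A
 * scalar sketch of the down-sampling, using the same round-to-nearest rule
 * as _mm_avg_epu8:
 */
static inline uint8_t blend_mask_sx(const uint8_t *mask, int c) {
    return (uint8_t)((mask[2 * c] + mask[2 * c + 1] + 1) >> 1);
}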
193 :
194 : //////////////////////////////////////////////////////////////////////////////
195 : // Vertical sub-sampling
196 : //////////////////////////////////////////////////////////////////////////////
197 :
198 0 : static void blend_a64_mask_sy_w4_sse4_1(
199 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
200 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
201 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
202 : {
203 : (void)w;
204 :
205 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
206 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
207 :
208 : do {
209 0 : const __m128i v_ra_b = xx_loadl_32(mask);
210 0 : const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
211 0 : const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
212 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
213 :
214 0 : const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
215 :
216 0 : xx_storel_32(dst, v_res_b);
217 :
218 0 : dst += dst_stride;
219 0 : src0 += src0_stride;
220 0 : src1 += src1_stride;
221 0 : mask += 2 * mask_stride;
222 0 : } while (--h);
223 0 : }
224 :
225 0 : static void blend_a64_mask_sy_w8_sse4_1(
226 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
227 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
228 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
229 : {
230 : (void)w;
231 :
232 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
233 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
234 : do {
235 0 : const __m128i v_ra_b = xx_loadl_64(mask);
236 0 : const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
237 0 : const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
238 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
239 0 : const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
240 :
241 0 : xx_storel_64(dst, v_res_b);
242 :
243 0 : dst += dst_stride;
244 0 : src0 += src0_stride;
245 0 : src1 += src1_stride;
246 0 : mask += 2 * mask_stride;
247 0 : } while (--h);
248 0 : }
249 :
250 0 : static void blend_a64_mask_sy_w16n_sse4_1(
251 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
252 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
253 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
254 : {
255 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
256 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
257 : do {
258 : int c;
259 0 : for (c = 0; c < w; c += 16) {
260 0 : const __m128i v_ra_b = xx_loadu_128(mask + c);
261 0 : const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
262 0 : const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
263 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
264 :
265 : const __m128i v_res_b =
266 0 : blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
267 :
268 0 : xx_storeu_128(dst + c, v_res_b);
269 : }
270 0 : dst += dst_stride;
271 0 : src0 += src0_stride;
272 0 : src1 += src1_stride;
273 0 : mask += 2 * mask_stride;
274 0 : } while (--h);
275 0 : }
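
/*
 * The sy kernels read two mask rows per output row (hence the
 * mask += 2 * mask_stride step) and average them vertically. Scalar sketch,
 * same rounding as _mm_avg_epu8:
 */
static inline uint8_t blend_mask_sy(const uint8_t *mask, uint32_t mask_stride,
                                    int c) {
    return (uint8_t)((mask[c] + mask[mask_stride + c] + 1) >> 1);
}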
276 :
277 : //////////////////////////////////////////////////////////////////////////////
278 : // Horizontal and Vertical sub-sampling
279 : //////////////////////////////////////////////////////////////////////////////
280 :
281 0 : static void blend_a64_mask_sx_sy_w4_sse4_1(
282 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
283 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
284 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
285 : {
286 0 : const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
287 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
288 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
289 : (void)w;
290 :
291 : do {
292 0 : const __m128i v_ra_b = xx_loadl_64(mask);
293 0 : const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
294 0 : const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
295 0 : const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
296 0 : const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
297 0 : const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
298 0 : const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
299 0 : const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
300 0 : const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
301 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
302 :
303 0 : const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
304 :
305 0 : xx_storel_32(dst, v_res_b);
306 :
307 0 : dst += dst_stride;
308 0 : src0 += src0_stride;
309 0 : src1 += src1_stride;
310 0 : mask += 2 * mask_stride;
311 0 : } while (--h);
312 0 : }
313 :
314 0 : static void blend_a64_mask_sx_sy_w8_sse4_1(
315 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
316 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
317 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
318 : {
319 0 : const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
320 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
321 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
322 : (void)w;
323 :
324 : do {
325 0 : const __m128i v_ra_b = xx_loadu_128(mask);
326 0 : const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
327 :
328 0 : const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
329 0 : const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
330 0 : const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
331 0 : const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
332 0 : const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
333 0 : const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
334 0 : const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
335 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
336 :
337 0 : const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
338 :
339 0 : xx_storel_64(dst, v_res_b);
340 :
341 0 : dst += dst_stride;
342 0 : src0 += src0_stride;
343 0 : src1 += src1_stride;
344 0 : mask += 2 * mask_stride;
345 0 : } while (--h);
346 0 : }
347 :
348 0 : static void blend_a64_mask_sx_sy_w16n_sse4_1(
349 : uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
350 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
351 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
352 : {
353 0 : const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
354 : 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
355 0 : const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
356 0 : const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
357 : do {
358 : int c;
359 0 : for (c = 0; c < w; c += 16) {
360 0 : const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
361 0 : const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
362 0 : const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
363 0 : const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
364 0 : const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
365 0 : const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
366 0 : const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
367 0 : const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
368 : const __m128i v_rvsbl_w =
369 0 : _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
370 : const __m128i v_rvsbh_w =
371 0 : _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
372 0 : const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
373 0 : const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
374 :
375 0 : const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
376 0 : const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
377 0 : const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
378 0 : const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
379 :
380 : const __m128i v_res_b =
381 0 : blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
382 :
383 0 : xx_storeu_128(dst + c, v_res_b);
384 : }
385 0 : dst += dst_stride;
386 0 : src0 += src0_stride;
387 0 : src1 += src1_stride;
388 0 : mask += 2 * mask_stride;
389 0 : } while (--h);
390 0 : }
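
/*
 * Combined sub-sampling averages a 2x2 block of mask values with rounding,
 * i.e. (a + b + c + d + 2) >> 2, which is what the add/shuffle/
 * xx_roundn_epu16(., 2) sequence above computes. The byte-wise _mm_add_epi8
 * of the two rows cannot wrap: valid mask values do not exceed
 * AOM_BLEND_A64_MAX_ALPHA (64), so a vertical pair sums to at most 128.
 * Scalar sketch:
 */
static inline uint8_t blend_mask_sx_sy(const uint8_t *mask,
                                       uint32_t mask_stride, int c) {
    const int sum = mask[2 * c] + mask[2 * c + 1] +
                    mask[mask_stride + 2 * c] + mask[mask_stride + 2 * c + 1];
    return (uint8_t)((sum + 2) >> 2);
}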
391 :
392 : //////////////////////////////////////////////////////////////////////////////
393 : // Dispatch
394 : //////////////////////////////////////////////////////////////////////////////
395 :
396 14317000 : void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
397 : const uint8_t *src0, uint32_t src0_stride,
398 : const uint8_t *src1, uint32_t src1_stride,
399 : const uint8_t *mask, uint32_t mask_stride, int w,
400 : int h, int subx, int suby)
401 : {
402 : typedef void (*blend_fn)(
403 : uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
404 : uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
405 : const uint8_t *mask, uint32_t mask_stride, int w, int h);
406 :
407 : // Dimensions are: width_index X subx X suby
408 : static const blend_fn blend[3][2][2] = {
409 : { // w % 16 == 0
410 : { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
411 : { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
412 : { // w == 4
413 : { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
414 : { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
415 : { // w == 8
416 : { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
417 : { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
418 : };
419 :
420 14317000 : assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
421 14317000 : assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
422 :
423 14317000 : assert(h >= 1);
424 14317000 : assert(w >= 1);
425 14317000 : assert(IS_POWER_OF_TWO(h));
426 14317000 : assert(IS_POWER_OF_TWO(w));
427 :
428 14317000 : if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
429 295896 : aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
430 : mask, mask_stride, w, h, subx, suby);
431 : }
432 : else {
433 14021100 : blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
434 : src0_stride, src1, src1_stride,
435 : mask, mask_stride, w, h);
436 : }
437 14317200 : }
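
/*
 * How the indices above work (w and h are asserted to be powers of two):
 * (h | w) & 3 is non-zero exactly when w or h is 1 or 2, since a power of
 * two only has a bit in position 0 or 1 if it equals 1 or 2; those sizes
 * take the C fallback. For the remaining widths, (w >> 2) & 3 gives
 *   w == 4  -> 1   (the "w == 4" row)
 *   w == 8  -> 2   (the "w == 8" row)
 *   w >= 16 -> 0   (the "w % 16 == 0" row, since w >> 2 is a multiple of 4)
 */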
438 :
439 : //////////////////////////////////////////////////////////////////////////////
440 : // No sub-sampling
441 : //////////////////////////////////////////////////////////////////////////////
442 :
443 0 : static INLINE void blend_a64_mask_bn_w4_sse4_1(
444 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
445 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
446 : const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend)
447 : {
448 0 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
449 :
450 : do {
451 0 : const __m128i v_m0_b = xx_loadl_32(mask);
452 0 : const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
453 0 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
454 :
455 0 : const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
456 :
457 0 : xx_storel_64(dst, v_res_w);
458 :
459 0 : dst += dst_stride;
460 0 : src0 += src0_stride;
461 0 : src1 += src1_stride;
462 0 : mask += mask_stride;
463 0 : } while (--h);
464 0 : }
465 :
466 0 : static void blend_a64_mask_b10_w4_sse4_1(
467 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
468 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
469 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
470 : {
471 : (void)w;
472 0 : blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
473 : src1_stride, mask, mask_stride, h, blend_4_b10);
474 0 : }
475 :
476 0 : static void blend_a64_mask_b12_w4_sse4_1(
477 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
478 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
479 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
480 : {
481 : (void)w;
482 0 : blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
483 : src1_stride, mask, mask_stride, h, blend_4_b12);
484 0 : }
485 :
486 0 : static INLINE void blend_a64_mask_bn_w8n_sse4_1(
487 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
488 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
489 : const uint8_t *mask, uint32_t mask_stride, int w, int h,
490 : blend_unit_fn blend)
491 : {
492 0 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
493 :
494 : do {
495 : int c;
496 0 : for (c = 0; c < w; c += 8) {
497 0 : const __m128i v_m0_b = xx_loadl_64(mask + c);
498 0 : const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
499 0 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
500 :
501 0 : const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
502 :
503 0 : xx_storeu_128(dst + c, v_res_w);
504 : }
505 0 : dst += dst_stride;
506 0 : src0 += src0_stride;
507 0 : src1 += src1_stride;
508 0 : mask += mask_stride;
509 0 : } while (--h);
510 0 : }
511 :
512 0 : static void blend_a64_mask_b10_w8n_sse4_1(
513 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
514 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
515 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
516 : {
517 0 : blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
518 : src1_stride, mask, mask_stride, w, h,
519 : blend_8_b10);
520 0 : }
521 :
522 0 : static void blend_a64_mask_b12_w8n_sse4_1(
523 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
524 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
525 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
526 : {
527 0 : blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
528 : src1_stride, mask, mask_stride, w, h,
529 : blend_8_b12);
530 0 : }
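
/*
 * Why separate b10 and b12 kernels: with 10-bit samples the per-term product
 * m * v is at most 64 * 1023 = 65472, which still fits in a 16-bit lane, so
 * the b10 helpers can presumably multiply with _mm_mullo_epi16 directly; at
 * 12 bits, 64 * 4095 = 262080 overflows 16 bits, so the b12 helpers
 * (declared in EbBlend_sse4.h, not shown here) must widen to 32-bit
 * intermediates. bd == 8 simply reuses the 10-bit path, as the dispatch
 * tables below indicate.
 */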
531 :
532 : //////////////////////////////////////////////////////////////////////////////
533 : // Horizontal sub-sampling
534 : //////////////////////////////////////////////////////////////////////////////
535 :
536 0 : static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
537 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
538 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
539 : const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend)
540 : {
541 0 : const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
542 : 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
543 0 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
544 :
545 : do {
546 0 : const __m128i v_r_b = xx_loadl_64(mask);
547 0 : const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
548 :
549 0 : const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
550 0 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
551 :
552 0 : const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
553 :
554 0 : xx_storel_64(dst, v_res_w);
555 :
556 0 : dst += dst_stride;
557 0 : src0 += src0_stride;
558 0 : src1 += src1_stride;
559 0 : mask += mask_stride;
560 0 : } while (--h);
561 0 : }
562 :
563 0 : static void blend_a64_mask_b10_sx_w4_sse4_1(
564 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
565 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
566 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
567 : {
568 : (void)w;
569 0 : blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
570 : src1_stride, mask, mask_stride, h,
571 : blend_4_b10);
572 0 : }
573 :
574 0 : static void blend_a64_mask_b12_sx_w4_sse4_1(
575 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
576 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
577 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
578 : {
579 : (void)w;
580 0 : blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
581 : src1_stride, mask, mask_stride, h,
582 : blend_4_b12);
583 0 : }
584 :
585 0 : static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
586 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
587 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
588 : const uint8_t *mask, uint32_t mask_stride, int w, int h,
589 : blend_unit_fn blend)
590 : {
591 0 : const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
592 : 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
593 0 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
594 :
595 : do {
596 : int c;
597 0 : for (c = 0; c < w; c += 8) {
598 0 : const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
599 0 : const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
600 :
601 0 : const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
602 0 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
603 :
604 0 : const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
605 :
606 0 : xx_storeu_128(dst + c, v_res_w);
607 : }
608 0 : dst += dst_stride;
609 0 : src0 += src0_stride;
610 0 : src1 += src1_stride;
611 0 : mask += mask_stride;
612 0 : } while (--h);
613 0 : }
614 :
615 0 : static void blend_a64_mask_b10_sx_w8n_sse4_1(
616 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
617 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
618 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
619 : {
620 0 : blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
621 : src1_stride, mask, mask_stride, w, h,
622 : blend_8_b10);
623 0 : }
624 :
625 0 : static void blend_a64_mask_b12_sx_w8n_sse4_1(
626 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
627 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
628 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
629 : {
630 0 : blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
631 : src1_stride, mask, mask_stride, w, h,
632 : blend_8_b12);
633 0 : }
634 :
635 : //////////////////////////////////////////////////////////////////////////////
636 : // Vertical sub-sampling
637 : //////////////////////////////////////////////////////////////////////////////
638 :
639 0 : static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
640 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
641 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
642 : const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend)
643 : {
644 0 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
645 :
646 : do {
647 0 : const __m128i v_ra_b = xx_loadl_32(mask);
648 0 : const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
649 0 : const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
650 :
651 0 : const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
652 0 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
653 :
654 0 : const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
655 :
656 0 : xx_storel_64(dst, v_res_w);
657 :
658 0 : dst += dst_stride;
659 0 : src0 += src0_stride;
660 0 : src1 += src1_stride;
661 0 : mask += 2 * mask_stride;
662 0 : } while (--h);
663 0 : }
664 :
665 0 : static void blend_a64_mask_b10_sy_w4_sse4_1(
666 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
667 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
668 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
669 : {
670 : (void)w;
671 0 : blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
672 : src1_stride, mask, mask_stride, h,
673 : blend_4_b10);
674 0 : }
675 :
676 0 : static void blend_a64_mask_b12_sy_w4_sse4_1(
677 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
678 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
679 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
680 : {
681 : (void)w;
682 0 : blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
683 : src1_stride, mask, mask_stride, h,
684 : blend_4_b12);
685 0 : }
686 :
687 0 : static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
688 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
689 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
690 : const uint8_t *mask, uint32_t mask_stride, int w, int h,
691 : blend_unit_fn blend)
692 : {
693 0 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
694 :
695 : do {
696 : int c;
697 0 : for (c = 0; c < w; c += 8) {
698 0 : const __m128i v_ra_b = xx_loadl_64(mask + c);
699 0 : const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
700 0 : const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
701 :
702 0 : const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
703 0 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
704 :
705 0 : const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
706 :
707 0 : xx_storeu_128(dst + c, v_res_w);
708 : }
709 0 : dst += dst_stride;
710 0 : src0 += src0_stride;
711 0 : src1 += src1_stride;
712 0 : mask += 2 * mask_stride;
713 0 : } while (--h);
714 0 : }
715 :
716 0 : static void blend_a64_mask_b10_sy_w8n_sse4_1(
717 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
718 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
719 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
720 : {
721 0 : blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
722 : src1_stride, mask, mask_stride, w, h,
723 : blend_8_b10);
724 0 : }
725 :
726 0 : static void blend_a64_mask_b12_sy_w8n_sse4_1(
727 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
728 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
729 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
730 : {
731 0 : blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
732 : src1_stride, mask, mask_stride, w, h,
733 : blend_8_b12);
734 0 : }
735 :
736 : //////////////////////////////////////////////////////////////////////////////
737 : // Horizontal and Vertical sub-sampling
738 : //////////////////////////////////////////////////////////////////////////////
739 :
740 0 : static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
741 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
742 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
743 : const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend)
744 : {
745 0 : const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
746 : 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
747 0 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
748 :
749 : do {
750 0 : const __m128i v_ra_b = xx_loadl_64(mask);
751 0 : const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
752 0 : const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
753 0 : const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
754 : const __m128i v_rvsb_w =
755 0 : _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
756 0 : const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
757 :
758 0 : const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
759 0 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
760 :
761 0 : const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
762 :
763 0 : xx_storel_64(dst, v_res_w);
764 :
765 0 : dst += dst_stride;
766 0 : src0 += src0_stride;
767 0 : src1 += src1_stride;
768 0 : mask += 2 * mask_stride;
769 0 : } while (--h);
770 0 : }
771 :
772 0 : static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
773 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
774 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
775 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
776 : {
777 : (void)w;
778 0 : blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
779 : src1_stride, mask, mask_stride, h,
780 : blend_4_b10);
781 0 : }
782 :
783 0 : static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
784 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
785 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
786 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
787 : {
788 : (void)w;
789 0 : blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
790 : src1_stride, mask, mask_stride, h,
791 : blend_4_b12);
792 0 : }
793 :
794 0 : static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
795 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
796 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
797 : const uint8_t *mask, uint32_t mask_stride, int w, int h,
798 : blend_unit_fn blend)
799 : {
800 0 : const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
801 : 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
802 0 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
803 :
804 : do {
805 : int c;
806 0 : for (c = 0; c < w; c += 8) {
807 0 : const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
808 0 : const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
809 0 : const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
810 0 : const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
811 : const __m128i v_rvsb_w =
812 0 : _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
813 0 : const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
814 :
815 0 : const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
816 0 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
817 :
818 0 : const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
819 :
820 0 : xx_storeu_128(dst + c, v_res_w);
821 : }
822 0 : dst += dst_stride;
823 0 : src0 += src0_stride;
824 0 : src1 += src1_stride;
825 0 : mask += 2 * mask_stride;
826 0 : } while (--h);
827 0 : }
828 :
829 0 : static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
830 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
831 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
832 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
833 : {
834 0 : blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
835 : src1_stride, mask, mask_stride, w, h,
836 : blend_8_b10);
837 0 : }
838 :
839 0 : static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
840 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
841 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
842 : const uint8_t *mask, uint32_t mask_stride, int w, int h)
843 : {
844 0 : blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
845 : src1_stride, mask, mask_stride, w, h,
846 : blend_8_b12);
847 0 : }
848 :
849 : //////////////////////////////////////////////////////////////////////////////
850 : // Dispatch
851 : //////////////////////////////////////////////////////////////////////////////
852 :
853 0 : void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
854 : const uint8_t *src0_8,
855 : uint32_t src0_stride,
856 : const uint8_t *src1_8,
857 : uint32_t src1_stride, const uint8_t *mask,
858 : uint32_t mask_stride, int w, int h,
859 : int subx, int suby, int bd)
860 : {
861 : typedef void (*blend_fn)(
862 : uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
863 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
864 : const uint8_t *mask, uint32_t mask_stride, int w, int h);
865 :
866 : // Dimensions are: bd_index X width_index X subx X suby
867 : static const blend_fn blend[2][2][2][2] = {
868 : { // bd == 8 or 10
869 : { // w % 8 == 0
870 : { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
871 : { blend_a64_mask_b10_sx_w8n_sse4_1,
872 : blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
873 : { // w == 4
874 : { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
875 : { blend_a64_mask_b10_sx_w4_sse4_1,
876 : blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
877 : { // bd == 12
878 : { // w % 8 == 0
879 : { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
880 : { blend_a64_mask_b12_sx_w8n_sse4_1,
881 : blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
882 : { // w == 4
883 : { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
884 : { blend_a64_mask_b12_sx_w4_sse4_1,
885 : blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
886 : };
887 :
888 0 : assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
889 0 : assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
890 :
891 0 : assert(h >= 1);
892 0 : assert(w >= 1);
893 0 : assert(IS_POWER_OF_TWO(h));
894 0 : assert(IS_POWER_OF_TWO(w));
895 :
896 0 : assert(bd == 8 || bd == 10 || bd == 12);
897 0 : if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
898 0 : aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
899 : src1_stride, mask, mask_stride, w, h, subx,
900 : suby, bd);
901 : }
902 : else {
903 0 : uint16_t *const dst = (uint16_t *)dst_8;
904 0 : const uint16_t *const src0 = (const uint16_t *)src0_8;
905 0 : const uint16_t *const src1 = (const uint16_t *)src1_8;
906 :
907 0 : blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
908 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
909 : mask_stride, w, h);
910 : }
911 0 : }
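
/*
 * Index sketch for the table above: after the small-size fallback,
 * (w >> 2) & 1 is 1 only for w == 4 (4 >> 2 == 1); every remaining power of
 * two (8, 16, 32, ...) has an even w >> 2 and selects the "w % 8 == 0" row.
 * bd == 12 selects the second bank of kernels; bd == 8 and 10 share the
 * first.
 */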
912 :
913 : /* Vertical-mask blend functions: a single mask value applies to each row. */
914 73094 : static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
915 : const uint8_t *src0, uint32_t src0_stride,
916 : const uint8_t *src1, uint32_t src1_stride,
917 : const uint8_t *mask, int w, int h)
918 : {
919 73094 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
920 :
921 : (void)w;
922 :
923 : do {
924 540062 : const __m128i v_m0_w = _mm_set1_epi16(*mask);
925 540062 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
926 :
927 540062 : const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
928 :
929 540063 : const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
930 :
931 540063 : xx_storel_32(dst, v_res_b);
932 :
933 540063 : dst += dst_stride;
934 540063 : src0 += src0_stride;
935 540063 : src1 += src1_stride;
936 540063 : mask += 1;
937 540063 : } while (--h);
938 73095 : }
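
/*
 * With a vertical mask there is a single alpha value per row: note the
 * _mm_set1_epi16(*mask) broadcast and the mask += 1 step. A scalar sketch of
 * one row, under the same libaom constants assumed in the earlier sketch:
 */
static inline void blend_a64_vmask_row(uint8_t *dst, const uint8_t *src0,
                                       const uint8_t *src1, uint8_t m, int w) {
    int c;
    for (c = 0; c < w; ++c)
        dst[c] = (uint8_t)((m * src0[c] + (64 - m) * src1[c] + 32) >> 6);
}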
939 :
940 6134670 : static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
941 : const uint8_t *src0, uint32_t src0_stride,
942 : const uint8_t *src1, uint32_t src1_stride,
943 : const uint8_t *mask, int w, int h)
944 : {
945 6134670 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
946 :
947 : (void)w;
948 :
949 : do {
950 46285100 : const __m128i v_m0_w = _mm_set1_epi16(*mask);
951 46285100 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
952 :
953 46285100 : const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
954 :
955 46280700 : const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
956 :
957 46280700 : xx_storel_64(dst, v_res_b);
958 :
959 46278600 : dst += dst_stride;
960 46278600 : src0 += src0_stride;
961 46278600 : src1 += src1_stride;
962 46278600 : mask += 1;
963 46278600 : } while (--h);
964 6128140 : }
965 :
966 7123800 : static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
967 : const uint8_t *src0,
968 : uint32_t src0_stride,
969 : const uint8_t *src1,
970 : uint32_t src1_stride,
971 : const uint8_t *mask, int w, int h)
972 : {
973 7123800 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
974 :
975 : do {
976 : int c;
977 74972400 : const __m128i v_m0_w = _mm_set1_epi16(*mask);
978 74972400 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
979 198830000 : for (c = 0; c < w; c += 16) {
980 123929000 : const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
981 : const __m128i v_resh_w =
982 123946000 : blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
983 :
984 123867000 : const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
985 :
986 123867000 : xx_storeu_128(dst + c, v_res_b);
987 : }
988 74900300 : dst += dst_stride;
989 74900300 : src0 += src0_stride;
990 74900300 : src1 += src1_stride;
991 74900300 : mask += 1;
992 74900300 : } while (--h);
993 7051650 : }
994 :
995 : //////////////////////////////////////////////////////////////////////////////
996 : // Dispatch
997 : //////////////////////////////////////////////////////////////////////////////
998 :
999 13328400 : void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
1000 : const uint8_t *src0, uint32_t src0_stride,
1001 : const uint8_t *src1, uint32_t src1_stride,
1002 : const uint8_t *mask, int w, int h)
1003 : {
1004 :
1005 : typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
1006 : const uint8_t *src0, uint32_t src0_stride,
1007 : const uint8_t *src1, uint32_t src1_stride,
1008 : const uint8_t *mask, int w, int h);
1009 :
1010 : // Dimension: width_index
1011 : static const blend_fn blend[9] = {
1012 : blend_a64_vmask_w16n_sse4_1, // w % 16 == 0
1013 : aom_blend_a64_vmask_c, // w == 1
1014 : aom_blend_a64_vmask_c, // w == 2
1015 : NULL, // INVALID
1016 : blend_a64_vmask_w4_sse4_1, // w == 4
1017 : NULL, // INVALID
1018 : NULL, // INVALID
1019 : NULL, // INVALID
1020 : blend_a64_vmask_w8_sse4_1, // w == 8
1021 : };
1022 :
1023 13328400 : assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
1024 13328400 : assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
1025 :
1026 13328400 : assert(h >= 1);
1027 13328400 : assert(w >= 1);
1028 13328400 : assert(IS_POWER_OF_TWO(h));
1029 13328400 : assert(IS_POWER_OF_TWO(w));
1030 :
1031 13328400 : blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
1032 : h);
1033 13328500 : }
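
/*
 * Table indexing above: w & 0xf maps w == 16, 32, 64, ... to slot 0 (the
 * w16n kernel) and w == 1, 2, 4, 8 to slots 1, 2, 4 and 8. The NULL slots
 * are unreachable because w is asserted to be a power of two, so w & 0xf is
 * never 3, 5, 6 or 7.
 */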
1034 :
1035 : //////////////////////////////////////////////////////////////////////////////
1036 : // Implementation - No sub-sampling
1037 : //////////////////////////////////////////////////////////////////////////////
1038 :
1039 0 : static INLINE void blend_a64_vmask_bn_w4_sse4_1(
1040 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
1041 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
1042 : const uint8_t *mask, int h, blend_unit_fn blend)
1043 : {
1044 0 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
1045 :
1046 : do {
1047 0 : const __m128i v_m0_w = _mm_set1_epi16(*mask);
1048 0 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
1049 :
1050 0 : const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
1051 :
1052 0 : xx_storel_64(dst, v_res_w);
1053 :
1054 0 : dst += dst_stride;
1055 0 : src0 += src0_stride;
1056 0 : src1 += src1_stride;
1057 0 : mask += 1;
1058 0 : } while (--h);
1059 0 : }
1060 :
1061 0 : static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
1062 : const uint16_t *src0,
1063 : uint32_t src0_stride,
1064 : const uint16_t *src1,
1065 : uint32_t src1_stride,
1066 : const uint8_t *mask, int w, int h)
1067 : {
1068 : (void)w;
1069 0 : blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
1070 : src1_stride, mask, h, blend_4_b10);
1071 0 : }
1072 :
1073 0 : static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
1074 : const uint16_t *src0,
1075 : uint32_t src0_stride,
1076 : const uint16_t *src1,
1077 : uint32_t src1_stride,
1078 : const uint8_t *mask, int w, int h)
1079 : {
1080 : (void)w;
1081 0 : blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
1082 : src1_stride, mask, h, blend_4_b12);
1083 0 : }
1084 :
1085 0 : static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
1086 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
1087 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
1088 : const uint8_t *mask, int w, int h, blend_unit_fn blend)
1089 : {
1090 0 : const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
1091 :
1092 : do {
1093 : int c;
1094 0 : const __m128i v_m0_w = _mm_set1_epi16(*mask);
1095 0 : const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
1096 0 : for (c = 0; c < w; c += 8) {
1097 0 : const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
1098 :
1099 0 : xx_storeu_128(dst + c, v_res_w);
1100 : }
1101 0 : dst += dst_stride;
1102 0 : src0 += src0_stride;
1103 0 : src1 += src1_stride;
1104 0 : mask += 1;
1105 0 : } while (--h);
1106 0 : }
1107 :
1108 0 : static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
1109 : const uint16_t *src0,
1110 : uint32_t src0_stride,
1111 : const uint16_t *src1,
1112 : uint32_t src1_stride,
1113 : const uint8_t *mask, int w, int h)
1114 : {
1115 0 : blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
1116 : src1_stride, mask, w, h, blend_8_b10);
1117 0 : }
1118 :
1119 0 : static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
1120 : const uint16_t *src0,
1121 : uint32_t src0_stride,
1122 : const uint16_t *src1,
1123 : uint32_t src1_stride,
1124 : const uint8_t *mask, int w, int h)
1125 : {
1126 0 : blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
1127 : src1_stride, mask, w, h, blend_8_b12);
1128 0 : }
1129 :
1130 : //////////////////////////////////////////////////////////////////////////////
1131 : // Dispatch
1132 : //////////////////////////////////////////////////////////////////////////////
1133 :
1134 0 : void aom_highbd_blend_a64_vmask_sse4_1(
1135 : uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
1136 : uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
1137 : const uint8_t *mask, int w, int h, int bd)
1138 : {
1139 :
1140 : typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
1141 : const uint16_t *src0, uint32_t src0_stride,
1142 : const uint16_t *src1, uint32_t src1_stride,
1143 : const uint8_t *mask, int w, int h);
1144 :
1145 : // Dimensions are: bd_index X width_index
1146 : static const blend_fn blend[2][2] = {
1147 : {
1148 : // bd == 8 or 10
1149 : blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
1150 : blend_a64_vmask_b10_w4_sse4_1, // w == 4
1151 : },
1152 : {
1153 : // bd == 12
1154 : blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
1155 : blend_a64_vmask_b12_w4_sse4_1, // w == 4
1156 : }
1157 : };
1158 :
1159 0 : assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
1160 0 : assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
1161 :
1162 0 : assert(h >= 1);
1163 0 : assert(w >= 1);
1164 0 : assert(IS_POWER_OF_TWO(h));
1165 0 : assert(IS_POWER_OF_TWO(w));
1166 :
1167 0 : assert(bd == 8 || bd == 10 || bd == 12);
1168 :
1169 0 : if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
1170 0 : aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
1171 : src1_stride, mask, w, h, bd);
1172 : }
1173 : else {
1174 0 : uint16_t *const dst = (uint16_t *)dst_8; // CONVERT_TO_SHORTPTR(dst_8);
1175 0 : const uint16_t *const src0 = (const uint16_t *)src0_8; // CONVERT_TO_SHORTPTR(src0_8);
1176 0 : const uint16_t *const src1 = (const uint16_t *)src1_8; // CONVERT_TO_SHORTPTR(src1_8);
1177 :
1178 0 : blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
1179 : src1_stride, mask, w, h);
1180 : }
1181 0 : }
1182 :
1183 : /* Horizontal-mask blend functions */
1184 :
1185 : // To start out, just dispatch to the function using the 2D mask and
1186 : // pass mask stride as 0. This can be improved upon if necessary.
1187 :
1188 14316700 : void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
1189 : const uint8_t *src0, uint32_t src0_stride,
1190 : const uint8_t *src1, uint32_t src1_stride,
1191 : const uint8_t *mask, int w, int h)
1192 : {
1193 14316700 : aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
1194 : src1_stride, mask, 0, w, h, 0, 0);
1195 14317100 : }
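
/*
 * Passing mask_stride == 0 works because the 2D kernels advance the mask
 * pointer by mask_stride per output row, so a zero stride re-reads the same
 * horizontal mask line for every row, which is exactly the hmask semantics.
 * The improvement alluded to above would be a dedicated 1D kernel that
 * hoists the mask load out of the row loop.
 */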
1196 :
1197 0 : void aom_highbd_blend_a64_hmask_sse4_1(
1198 : uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
1199 : uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
1200 : const uint8_t *mask, int w, int h, int bd)
1201 : {
1202 0 : aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
1203 : src1_8, src1_stride, mask, 0, w, h, 0, 0,
1204 : bd);
1205 0 : }
1206 :
1207 0 : void eb_aom_highbd_blend_a64_mask_sse4_1(uint16_t *dst, uint32_t dst_stride,
1208 : const uint16_t *src0,
1209 : uint32_t src0_stride,
1210 : const uint16_t *src1,
1211 : uint32_t src1_stride, const uint8_t *mask,
1212 : uint32_t mask_stride, int w, int h,
1213 : int subw, int subh, int bd) {
1214 : typedef void (*blend_fn)(
1215 : uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
1216 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
1217 : const uint8_t *mask, uint32_t mask_stride, int w, int h);
1218 :
1219 : // Dimensions are: bd_index X width_index X subw X subh
1220 : static const blend_fn blend[2][2][2][2] = {
1221 : { // bd == 8 or 10
1222 : { // w % 8 == 0
1223 : { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
1224 : { blend_a64_mask_b10_sx_w8n_sse4_1,
1225 : blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
1226 : { // w == 4
1227 : { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
1228 : { blend_a64_mask_b10_sx_w4_sse4_1,
1229 : blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
1230 : { // bd == 12
1231 : { // w % 8 == 0
1232 : { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
1233 : { blend_a64_mask_b12_sx_w8n_sse4_1,
1234 : blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
1235 : { // w == 4
1236 : { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
1237 : { blend_a64_mask_b12_sx_w4_sse4_1,
1238 : blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
1239 : };
1240 :
1241 0 : assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
1242 0 : assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
1243 :
1244 0 : assert(h >= 1);
1245 0 : assert(w >= 1);
1246 0 : assert(IS_POWER_OF_TWO(h));
1247 0 : assert(IS_POWER_OF_TWO(w));
1248 :
1249 0 : assert(bd == 8 || bd == 10 || bd == 12);
1250 0 : if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
1251 0 : aom_highbd_blend_a64_mask_c((uint8_t *)dst, dst_stride, (const uint8_t *)src0, src0_stride, (const uint8_t *)src1,
1252 : src1_stride, mask, mask_stride, w, h, subw,
1253 : subh, bd);
1254 : } else {
1258 :
1259 0 : blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
1260 : dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
1261 : mask_stride, w, h);
1262 : }
1263 0 : }
1264 0 : void eb_aom_highbd_blend_a64_hmask_sse4_1(
1265 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
1266 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
1267 : const uint8_t *mask, int w, int h, int bd) {
1268 0 : eb_aom_highbd_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride,
1269 : src1, src1_stride, mask, 0, w, h, 0, 0,
1270 : bd);
1271 0 : }
1272 :
1273 0 : void eb_aom_highbd_blend_a64_vmask_sse4_1(
1274 : uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
1275 : uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
1276 : const uint8_t *mask, int w, int h, int bd) {
1277 : typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
1278 : const uint16_t *src0, uint32_t src0_stride,
1279 : const uint16_t *src1, uint32_t src1_stride,
1280 : const uint8_t *mask, int w, int h);
1281 :
1282 : // Dimensions are: bd_index X width_index
1283 : static const blend_fn blend[2][2] = {
1284 : {
1285 : // bd == 8 or 10
1286 : blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
1287 : blend_a64_vmask_b10_w4_sse4_1, // w == 4
1288 : },
1289 : {
1290 : // bd == 12
1291 : blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
1292 : blend_a64_vmask_b12_w4_sse4_1, // w == 4
1293 : }
1294 : };
1295 :
1296 0 : assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
1297 0 : assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
1298 :
1299 0 : assert(h >= 1);
1300 0 : assert(w >= 1);
1301 0 : assert(IS_POWER_OF_TWO(h));
1302 0 : assert(IS_POWER_OF_TWO(w));
1303 :
1304 0 : assert(bd == 8 || bd == 10 || bd == 12);
1305 :
1306 0 : if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
1307 0 : eb_aom_highbd_blend_a64_vmask_c(dst, dst_stride, src0, src0_stride, src1,
1308 : src1_stride, mask, w, h, bd);
1309 : } else {
1313 :
1314 0 : blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
1315 : src1_stride, mask, w, h);
1316 : }
1317 0 : }
|