Line data Source code
1 : /*
2 : * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h>
13 : #include "EbDefinitions.h"
14 : #include "aom_dsp_rtcd.h"
15 : #include "convolve.h"
16 : #include "convolve_avx2.h"
17 : #include "EbInterPrediction.h"
18 : #include "EbMemory_AVX2.h"
19 : #include "synonyms.h"
20 :
21 32156200 : static INLINE __m128i sr_x_round_sse2(const __m128i src) {
22 32156200 : const __m128i round = _mm_set1_epi16(34);
23 32156200 : const __m128i dst = _mm_add_epi16(src, round);
24 32156200 : return _mm_srai_epi16(dst, 6);
25 : }
26 :
27 543599000 : static INLINE __m256i sr_x_round_avx2(const __m256i src) {
28 543599000 : const __m256i round = _mm256_set1_epi16(34);
29 543599000 : const __m256i dst = _mm256_add_epi16(src, round);
30 543599000 : return _mm256_srai_epi16(dst, 6);
31 : }
32 :
33 33554200 : static INLINE __m128i sr_y_round_sse2(const __m128i src) {
34 33554200 : const __m128i round = _mm_set1_epi16(32);
35 33554200 : const __m128i dst = _mm_add_epi16(src, round);
36 33554200 : return _mm_srai_epi16(dst, FILTER_BITS - 1);
37 : }
38 :
39 73473400 : static INLINE void sr_x_round_store_8x2_avx2(const __m256i res,
40 : uint8_t *const dst,
41 : const int32_t dst_stride) {
42 73473400 : const __m256i r = sr_x_round_avx2(res);
43 73502700 : pack_store_8x2_avx2(r, dst, dst_stride);
44 73475800 : }
45 :
46 80388100 : static INLINE void sr_x_round_store_16x2_avx2(const __m256i res[2],
47 : uint8_t *const dst,
48 : const int32_t dst_stride) {
49 : __m256i r[2];
50 :
51 80388100 : r[0] = sr_x_round_avx2(res[0]);
52 80407500 : r[1] = sr_x_round_avx2(res[1]);
53 80409300 : pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
54 80384600 : }
55 :
56 156292000 : static INLINE void sr_x_round_store_32_avx2(const __m256i res[2],
57 : uint8_t *const dst) {
58 : __m256i r[2];
59 :
60 156292000 : r[0] = sr_x_round_avx2(res[0]);
61 156393000 : r[1] = sr_x_round_avx2(res[1]);
62 156410000 : convolve_store_32_avx2(r[0], r[1], dst);
63 156326000 : }
64 :
65 73636100 : static INLINE void sr_y_round_store_8x2_avx2(const __m256i res,
66 : uint8_t *const dst,
67 : const int32_t dst_stride) {
68 73636100 : const __m256i r = sr_y_round_avx2(res);
69 73657700 : pack_store_8x2_avx2(r, dst, dst_stride);
70 73637800 : }
71 :
72 82167700 : static INLINE void sr_y_round_store_16x2_avx2(const __m256i res[2],
73 : uint8_t *const dst,
74 : const int32_t dst_stride) {
75 : __m256i r[2];
76 :
77 82167700 : r[0] = sr_y_round_avx2(res[0]);
78 82198700 : r[1] = sr_y_round_avx2(res[1]);
79 82212000 : pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
80 82184000 : }
81 :
82 161284000 : static INLINE void sr_y_round_store_32_avx2(const __m256i res[2],
83 : uint8_t *const dst) {
84 : __m256i r[2];
85 :
86 161284000 : r[0] = sr_y_round_avx2(res[0]);
87 161406000 : r[1] = sr_y_round_avx2(res[1]);
88 161372000 : convolve_store_32_avx2(r[0], r[1], dst);
89 161312000 : }
90 :
91 72922600 : static INLINE void sr_y_round_store_32x2_avx2(const __m256i res[2],
92 : uint8_t *const dst,
93 : const int32_t dst_stride) {
94 72922600 : sr_y_round_store_32_avx2(res, dst);
95 72960100 : sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
96 72953500 : }
97 :
98 15723700 : static INLINE void sr_y_2tap_32_avx2(const uint8_t *const src,
99 : const __m256i coeffs[1], const __m256i s0,
100 : __m256i *const s1, uint8_t *const dst) {
101 : __m256i r[2];
102 15723700 : y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
103 15722900 : sr_y_round_store_32_avx2(r, dst);
104 15724100 : }
105 :
106 4154600 : static INLINE void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
107 : const __m256i s0, __m256i *const s1,
108 : uint8_t *const dst) {
109 4154600 : *s1 = _mm256_loadu_si256((__m256i *)src);
110 8309190 : const __m256i d = _mm256_avg_epu8(s0, *s1);
111 : _mm256_storeu_si256((__m256i *)dst, d);
112 4154600 : }
113 :
114 31790500 : void eb_av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
115 : uint8_t *dst, int32_t dst_stride, int32_t w,
116 : int32_t h, InterpFilterParams *filter_params_x,
117 : InterpFilterParams *filter_params_y,
118 : const int32_t subpel_x_q4,
119 : const int32_t subpel_y_q4,
120 : ConvolveParams *conv_params) {
121 : int32_t x, y;
122 : __m128i coeffs_128[4];
123 : __m256i coeffs_256[4];
124 :
125 : (void)filter_params_x;
126 : (void)subpel_x_q4;
127 : (void)conv_params;
128 :
129 31790500 : if (is_convolve_2tap(filter_params_y->filter_ptr)) {
130 : // vert_filt as 2 tap
131 3719800 : const uint8_t *src_ptr = src;
132 :
133 3719800 : y = h;
134 :
135 3719800 : if (subpel_y_q4 != 8) {
136 2985080 : if (w <= 8) {
137 1493620 : prepare_half_coeffs_2tap_ssse3(
138 : filter_params_y, subpel_y_q4, coeffs_128);
139 :
140 1493600 : if (w == 2) {
141 : __m128i s_16[2];
142 :
143 0 : s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
144 :
145 : do {
146 0 : const __m128i res = y_convolve_2tap_2x2_ssse3(
147 : src_ptr, src_stride, coeffs_128, s_16);
148 0 : const __m128i r = sr_y_round_sse2(res);
149 0 : pack_store_2x2_sse2(r, dst, dst_stride);
150 0 : src_ptr += 2 * src_stride;
151 0 : dst += 2 * dst_stride;
152 0 : y -= 2;
153 0 : } while (y);
154 : }
155 1493600 : else if (w == 4) {
156 : __m128i s_32[2];
157 :
158 585890 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
159 :
160 : do {
161 1559290 : const __m128i res = y_convolve_2tap_4x2_ssse3(
162 : src_ptr, src_stride, coeffs_128, s_32);
163 1559290 : const __m128i r = sr_y_round_sse2(res);
164 1559290 : pack_store_4x2_sse2(r, dst, dst_stride);
165 1559290 : src_ptr += 2 * src_stride;
166 1559290 : dst += 2 * dst_stride;
167 1559290 : y -= 2;
168 1559290 : } while (y);
169 : }
170 : else {
171 : __m128i s_64[2], s_128[2];
172 :
173 1200650 : assert(w == 8);
174 :
175 1200650 : s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
176 :
177 : do {
178 : // Note: Faster than binding to AVX2 registers.
179 7841140 : s_64[1] =
180 7841140 : _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
181 7841140 : s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
182 7841140 : s_64[0] = _mm_loadl_epi64(
183 7841140 : (__m128i *)(src_ptr + 2 * src_stride));
184 7841140 : s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
185 7841140 : const __m128i ss0 =
186 7841140 : _mm_unpacklo_epi8(s_128[0], s_128[1]);
187 7841140 : const __m128i ss1 =
188 7841140 : _mm_unpackhi_epi8(s_128[0], s_128[1]);
189 : const __m128i res0 =
190 7841140 : convolve_2tap_ssse3(&ss0, coeffs_128);
191 : const __m128i res1 =
192 7841370 : convolve_2tap_ssse3(&ss1, coeffs_128);
193 7841250 : const __m128i r0 = sr_y_round_sse2(res0);
194 7841200 : const __m128i r1 = sr_y_round_sse2(res1);
195 7841200 : const __m128i d = _mm_packus_epi16(r0, r1);
196 7841200 : _mm_storel_epi64((__m128i *)dst, d);
197 7841200 : _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
198 7841230 : src_ptr += 2 * src_stride;
199 7841230 : dst += 2 * dst_stride;
200 7841230 : y -= 2;
201 7841230 : } while (y);
202 : }
203 : }
204 : else {
205 1491470 : prepare_half_coeffs_2tap_avx2(
206 : filter_params_y, subpel_y_q4, coeffs_256);
207 :
208 1491760 : if (w == 16) {
209 : __m128i s_128[2];
210 :
211 973079 : s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
212 :
213 : do {
214 : __m256i r[2];
215 :
216 8033530 : y_convolve_2tap_16x2_avx2(
217 : src_ptr, src_stride, coeffs_256, s_128, r);
218 8033290 : sr_y_round_store_16x2_avx2(r, dst, dst_stride);
219 8033540 : src_ptr += 2 * src_stride;
220 8033540 : dst += 2 * dst_stride;
221 8033540 : y -= 2;
222 8033540 : } while (y);
223 : }
224 518685 : else if (w == 32) {
225 : __m256i s_256[2];
226 :
227 419410 : s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
228 :
229 : do {
230 4699100 : sr_y_2tap_32_avx2(src_ptr + src_stride,
231 : coeffs_256,
232 : s_256[0],
233 : &s_256[1],
234 : dst);
235 4699070 : sr_y_2tap_32_avx2(src_ptr + 2 * src_stride,
236 : coeffs_256,
237 : s_256[1],
238 : &s_256[0],
239 : dst + dst_stride);
240 4699100 : src_ptr += 2 * src_stride;
241 4699100 : dst += 2 * dst_stride;
242 4699100 : y -= 2;
243 4699100 : } while (y);
244 : }
245 99275 : else if (w == 64) {
246 : __m256i s_256[2][2];
247 :
248 99320 : s_256[0][0] =
249 99320 : _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
250 99320 : s_256[0][1] =
251 198640 : _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
252 :
253 : do {
254 1582420 : sr_y_2tap_32_avx2(src_ptr + src_stride,
255 : coeffs_256,
256 : s_256[0][0],
257 : &s_256[1][0],
258 : dst);
259 1582410 : sr_y_2tap_32_avx2(src_ptr + src_stride + 32,
260 : coeffs_256,
261 : s_256[0][1],
262 : &s_256[1][1],
263 : dst + 32);
264 1582420 : sr_y_2tap_32_avx2(src_ptr + 2 * src_stride,
265 : coeffs_256,
266 : s_256[1][0],
267 : &s_256[0][0],
268 : dst + dst_stride);
269 1582420 : sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
270 : coeffs_256,
271 : s_256[1][1],
272 : &s_256[0][1],
273 1582420 : dst + dst_stride + 32);
274 :
275 1582420 : src_ptr += 2 * src_stride;
276 1582420 : dst += 2 * dst_stride;
277 1582420 : y -= 2;
278 1582420 : } while (y);
279 : }
280 : else {
281 : __m256i s_256[2][4];
282 :
283 0 : assert(w == 128);
284 :
285 0 : s_256[0][0] =
286 0 : _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
287 0 : s_256[0][1] =
288 0 : _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
289 0 : s_256[0][2] =
290 0 : _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
291 0 : s_256[0][3] =
292 0 : _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
293 :
294 : do {
295 0 : sr_y_2tap_32_avx2(src_ptr + src_stride,
296 : coeffs_256,
297 : s_256[0][0],
298 : &s_256[1][0],
299 : dst);
300 0 : sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
301 : coeffs_256,
302 : s_256[0][1],
303 : &s_256[1][1],
304 : dst + 1 * 32);
305 0 : sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
306 : coeffs_256,
307 : s_256[0][2],
308 : &s_256[1][2],
309 : dst + 2 * 32);
310 0 : sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
311 : coeffs_256,
312 : s_256[0][3],
313 : &s_256[1][3],
314 : dst + 3 * 32);
315 :
316 0 : sr_y_2tap_32_avx2(src_ptr + 2 * src_stride,
317 : coeffs_256,
318 : s_256[1][0],
319 : &s_256[0][0],
320 : dst + dst_stride);
321 0 : sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32,
322 : coeffs_256,
323 : s_256[1][1],
324 : &s_256[0][1],
325 0 : dst + dst_stride + 1 * 32);
326 0 : sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32,
327 : coeffs_256,
328 : s_256[1][2],
329 : &s_256[0][2],
330 0 : dst + dst_stride + 2 * 32);
331 0 : sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32,
332 : coeffs_256,
333 : s_256[1][3],
334 : &s_256[0][3],
335 0 : dst + dst_stride + 3 * 32);
336 :
337 0 : src_ptr += 2 * src_stride;
338 0 : dst += 2 * dst_stride;
339 0 : y -= 2;
340 0 : } while (y);
341 : }
342 : }
343 : }
344 : else {
345 : // average to get half pel
346 734720 : if (w <= 8) {
347 371146 : if (w == 2) {
348 : __m128i s_16[2];
349 :
350 0 : s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
351 :
352 : do {
353 0 : s_16[1] = _mm_cvtsi32_si128(
354 0 : *(int16_t *)(src_ptr + src_stride));
355 0 : const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
356 0 : *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
357 0 : s_16[0] = _mm_cvtsi32_si128(
358 0 : *(int16_t *)(src_ptr + 2 * src_stride));
359 0 : const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
360 0 : *(int16_t *)(dst + dst_stride) =
361 0 : (int16_t)_mm_cvtsi128_si32(d1);
362 0 : src_ptr += 2 * src_stride;
363 0 : dst += 2 * dst_stride;
364 0 : y -= 2;
365 0 : } while (y);
366 : }
367 371146 : else if (w == 4) {
368 : __m128i s_32[2];
369 :
370 199702 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
371 :
372 : do {
373 941886 : s_32[1] = _mm_cvtsi32_si128(
374 470943 : *(int32_t *)(src_ptr + src_stride));
375 470943 : const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
376 470943 : xx_storel_32(dst, d0);
377 941888 : s_32[0] = _mm_cvtsi32_si128(
378 470944 : *(int32_t *)(src_ptr + 2 * src_stride));
379 470944 : const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
380 470944 : xx_storel_32(dst + dst_stride, d1);
381 470944 : src_ptr += 2 * src_stride;
382 470944 : dst += 2 * dst_stride;
383 470944 : y -= 2;
384 470944 : } while (y);
385 : }
386 : else {
387 : __m128i s_64[2];
388 :
389 271295 : assert(w == 8);
390 :
391 271295 : s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
392 :
393 : do {
394 : // Note: Faster than binding to AVX2 registers.
395 1706080 : s_64[1] =
396 1706080 : _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
397 3412150 : const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
398 1706080 : _mm_storel_epi64((__m128i *)dst, d0);
399 1706080 : s_64[0] = _mm_loadl_epi64(
400 1706080 : (__m128i *)(src_ptr + 2 * src_stride));
401 1706080 : const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
402 1706080 : _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
403 1706080 : src_ptr += 2 * src_stride;
404 1706080 : dst += 2 * dst_stride;
405 1706080 : y -= 2;
406 1706080 : } while (y);
407 : }
408 : }
409 363574 : else if (w == 16) {
410 : __m128i s_128[2];
411 :
412 233605 : s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
413 :
414 : do {
415 1965600 : s_128[1] =
416 1965600 : _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
417 3931200 : const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
418 : _mm_storeu_si128((__m128i *)dst, d0);
419 1965600 : s_128[0] =
420 1965600 : _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
421 1965600 : const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
422 1965600 : _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
423 1965600 : src_ptr += 2 * src_stride;
424 1965600 : dst += 2 * dst_stride;
425 1965600 : y -= 2;
426 1965600 : } while (y);
427 : }
428 129969 : else if (w == 32) {
429 : __m256i s_256[2];
430 :
431 102178 : s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
432 :
433 : do {
434 1180800 : sr_y_2tap_32_avg_avx2(
435 : src_ptr + src_stride, s_256[0], &s_256[1], dst);
436 1180800 : sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride,
437 : s_256[1],
438 : &s_256[0],
439 : dst + dst_stride);
440 1180800 : src_ptr += 2 * src_stride;
441 1180800 : dst += 2 * dst_stride;
442 1180800 : y -= 2;
443 1180800 : } while (y);
444 : }
445 27791 : else if (w == 64) {
446 : __m256i s_256[2][2];
447 :
448 27806 : s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
449 55612 : s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
450 :
451 : do {
452 448264 : sr_y_2tap_32_avg_avx2(
453 : src_ptr + src_stride, s_256[0][0], &s_256[1][0], dst);
454 448264 : sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32,
455 : s_256[0][1],
456 : &s_256[1][1],
457 : dst + 32);
458 :
459 448264 : sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride,
460 : s_256[1][0],
461 : &s_256[0][0],
462 : dst + dst_stride);
463 448264 : sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32,
464 : s_256[1][1],
465 : &s_256[0][1],
466 448264 : dst + dst_stride + 32);
467 :
468 448264 : src_ptr += 2 * src_stride;
469 448264 : dst += 2 * dst_stride;
470 448264 : y -= 2;
471 448264 : } while (y);
472 : }
473 : else {
474 : __m256i s_256[2][4];
475 :
476 0 : assert(w == 128);
477 :
478 0 : s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
479 0 : s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
480 0 : s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
481 0 : s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
482 :
483 : do {
484 0 : sr_y_2tap_32_avg_avx2(
485 : src_ptr + src_stride, s_256[0][0], &s_256[1][0], dst);
486 0 : sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32,
487 : s_256[0][1],
488 : &s_256[1][1],
489 : dst + 1 * 32);
490 0 : sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32,
491 : s_256[0][2],
492 : &s_256[1][2],
493 : dst + 2 * 32);
494 0 : sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32,
495 : s_256[0][3],
496 : &s_256[1][3],
497 : dst + 3 * 32);
498 :
499 0 : sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride,
500 : s_256[1][0],
501 : &s_256[0][0],
502 : dst + dst_stride);
503 0 : sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32,
504 : s_256[1][1],
505 : &s_256[0][1],
506 0 : dst + dst_stride + 1 * 32);
507 0 : sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32,
508 : s_256[1][2],
509 : &s_256[0][2],
510 0 : dst + dst_stride + 2 * 32);
511 0 : sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32,
512 : s_256[1][3],
513 : &s_256[0][3],
514 0 : dst + dst_stride + 3 * 32);
515 :
516 0 : src_ptr += 2 * src_stride;
517 0 : dst += 2 * dst_stride;
518 0 : y -= 2;
519 0 : } while (y);
520 : }
521 : }
522 : }
523 28068100 : else if (is_convolve_4tap(filter_params_y->filter_ptr)) {
524 : // vert_filt as 4 tap
525 2892740 : const uint8_t *src_ptr = src - src_stride;
526 :
527 2892740 : y = h;
528 :
529 2892740 : if (w <= 4) {
530 622441 : prepare_half_coeffs_4tap_ssse3(
531 : filter_params_y, subpel_y_q4, coeffs_128);
532 :
533 622444 : if (w == 2) {
534 : __m128i s_16[4], ss_128[2];
535 :
536 14306 : s_16[0] =
537 14306 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
538 14306 : s_16[1] =
539 14306 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
540 14306 : s_16[2] =
541 14306 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
542 :
543 14306 : const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
544 28612 : const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
545 :
546 14306 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
547 :
548 : do {
549 26776 : src_ptr += 2 * src_stride;
550 26776 : const __m128i res = y_convolve_4tap_2x2_ssse3(
551 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
552 26776 : const __m128i r = sr_y_round_sse2(res);
553 26776 : pack_store_2x2_sse2(r, dst, dst_stride);
554 :
555 26776 : ss_128[0] = ss_128[1];
556 26776 : dst += 2 * dst_stride;
557 26776 : y -= 2;
558 26776 : } while (y);
559 : }
560 : else {
561 : __m128i s_32[4], ss_128[2];
562 :
563 608138 : assert(w == 4);
564 :
565 608138 : s_32[0] =
566 608138 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
567 608138 : s_32[1] =
568 608138 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
569 608138 : s_32[2] =
570 608138 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
571 :
572 608138 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
573 1216280 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
574 :
575 608138 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
576 :
577 : do {
578 1203740 : src_ptr += 2 * src_stride;
579 1203740 : const __m128i res = y_convolve_4tap_4x2_ssse3(
580 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
581 1203730 : const __m128i r = sr_y_round_sse2(res);
582 1203730 : pack_store_4x2_sse2(r, dst, dst_stride);
583 :
584 1203720 : ss_128[0] = ss_128[1];
585 1203720 : dst += 2 * dst_stride;
586 1203720 : y -= 2;
587 1203720 : } while (y);
588 : }
589 : }
590 : else {
591 2270300 : prepare_half_coeffs_4tap_avx2(
592 : filter_params_y, subpel_y_q4, coeffs_256);
593 :
594 2270510 : if (w == 8) {
595 : __m128i s_64[4];
596 : __m256i ss_256[2];
597 :
598 925045 : s_64[0] =
599 925045 : _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
600 925045 : s_64[1] =
601 925045 : _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
602 925045 : s_64[2] =
603 925045 : _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
604 :
605 : // Load lines a and b. Line a to lower 128, line b to upper 128
606 925045 : const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
607 1850090 : const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
608 :
609 925045 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
610 :
611 : do {
612 1833150 : src_ptr += 2 * src_stride;
613 1833150 : const __m256i res = y_convolve_4tap_8x2_avx2(
614 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
615 1833130 : sr_y_round_store_8x2_avx2(res, dst, dst_stride);
616 :
617 1833120 : ss_256[0] = ss_256[1];
618 1833120 : dst += 2 * dst_stride;
619 1833120 : y -= 2;
620 1833120 : } while (y);
621 : }
622 1345460 : else if (w == 16) {
623 : __m128i s_128[4];
624 : __m256i ss_256[4], r[2];
625 :
626 1339940 : assert(w == 16);
627 :
628 1339940 : s_128[0] =
629 1339940 : _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
630 1339940 : s_128[1] =
631 1339940 : _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
632 1339940 : s_128[2] =
633 1339940 : _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
634 :
635 : // Load lines a and b. Line a to lower 128, line b to upper 128
636 1339940 : const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
637 2679870 : const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
638 :
639 1339940 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
640 1339940 : ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
641 :
642 : do {
643 2679750 : src_ptr += 2 * src_stride;
644 2679750 : y_convolve_4tap_16x2_avx2(
645 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
646 2679670 : sr_y_round_store_16x2_avx2(r, dst, dst_stride);
647 :
648 2679720 : ss_256[0] = ss_256[1];
649 2679720 : ss_256[2] = ss_256[3];
650 2679720 : dst += 2 * dst_stride;
651 2679720 : y -= 2;
652 2679720 : } while (y);
653 : }
654 : else {
655 : // AV1 standard won't have 32x4 case.
656 : // This only favors some optimization feature which
657 : // subsamples 32x8 to 32x4 and triggers 4-tap filter.
658 :
659 : __m256i s_256[4], ss_256[4], tt_256[4], r[4];
660 :
661 5528 : assert(w == 32);
662 :
663 5528 : s_256[0] =
664 5528 : _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
665 5528 : s_256[1] =
666 5528 : _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
667 5528 : s_256[2] =
668 5528 : _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
669 :
670 5528 : ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
671 5528 : ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
672 :
673 5528 : tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
674 11056 : tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
675 :
676 : do {
677 11071 : src_ptr += 2 * src_stride;
678 11071 : y_convolve_4tap_32x2_avx2(src_ptr,
679 : src_stride,
680 : coeffs_256,
681 : s_256,
682 : ss_256,
683 : tt_256,
684 : r);
685 11086 : sr_y_round_store_32x2_avx2(r, dst, dst_stride);
686 :
687 11086 : ss_256[0] = ss_256[1];
688 11086 : ss_256[2] = ss_256[3];
689 :
690 11086 : tt_256[0] = tt_256[1];
691 11086 : tt_256[2] = tt_256[3];
692 11086 : dst += 2 * dst_stride;
693 11086 : y -= 2;
694 11086 : } while (y);
695 : }
696 : }
697 : }
698 25189000 : else if (is_convolve_6tap(filter_params_y->filter_ptr)) {
699 : // vert_filt as 6 tap
700 23346000 : const uint8_t *src_ptr = src - 2 * src_stride;
701 :
702 23346000 : if (w <= 4) {
703 2372640 : prepare_half_coeffs_6tap_ssse3(
704 : filter_params_y, subpel_y_q4, coeffs_128);
705 :
706 2372800 : y = h;
707 :
708 2372800 : if (w == 2) {
709 : __m128i s_16[6], ss_128[3];
710 :
711 16428 : s_16[0] =
712 16428 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
713 16428 : s_16[1] =
714 16428 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
715 16428 : s_16[2] =
716 16428 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
717 16428 : s_16[3] =
718 16428 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
719 16428 : s_16[4] =
720 16428 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
721 :
722 16428 : const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
723 16428 : const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
724 16428 : const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
725 32856 : const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
726 :
727 16428 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
728 16428 : ss_128[1] = _mm_unpacklo_epi8(src23, src34);
729 :
730 : do {
731 65728 : src_ptr += 2 * src_stride;
732 65728 : const __m128i res = y_convolve_6tap_2x2_ssse3(
733 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
734 65728 : const __m128i r = sr_y_round_sse2(res);
735 65728 : pack_store_2x2_sse2(r, dst, dst_stride);
736 :
737 65728 : ss_128[0] = ss_128[1];
738 65728 : ss_128[1] = ss_128[2];
739 65728 : dst += 2 * dst_stride;
740 65728 : y -= 2;
741 65728 : } while (y);
742 : }
743 : else {
744 : __m128i s_32[6], ss_128[3];
745 :
746 2356370 : assert(w == 4);
747 :
748 2356370 : s_32[0] =
749 2356370 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
750 2356370 : s_32[1] =
751 2356370 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
752 2356370 : s_32[2] =
753 2356370 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
754 2356370 : s_32[3] =
755 2356370 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
756 2356370 : s_32[4] =
757 2356370 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
758 :
759 2356370 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
760 2356370 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
761 2356370 : const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
762 4712740 : const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
763 :
764 2356370 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
765 2356370 : ss_128[1] = _mm_unpacklo_epi8(src23, src34);
766 :
767 : do {
768 15005000 : src_ptr += 2 * src_stride;
769 15005000 : const __m128i res = y_convolve_6tap_4x2_ssse3(
770 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
771 15005600 : const __m128i r = sr_y_round_sse2(res);
772 15005100 : pack_store_4x2_sse2(r, dst, dst_stride);
773 :
774 15004900 : ss_128[0] = ss_128[1];
775 15004900 : ss_128[1] = ss_128[2];
776 15004900 : dst += 2 * dst_stride;
777 15004900 : y -= 2;
778 15004900 : } while (y);
779 : }
780 : }
781 : else {
782 20973300 : prepare_half_coeffs_6tap_avx2(
783 : filter_params_y, subpel_y_q4, coeffs_256);
784 :
785 20984300 : if (w == 8) {
786 : __m128i s_64[6];
787 : __m256i ss_256[3];
788 :
789 9396180 : s_64[0] =
790 9396180 : _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
791 9396180 : s_64[1] =
792 9396180 : _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
793 9396180 : s_64[2] =
794 9396180 : _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
795 9396180 : s_64[3] =
796 9396180 : _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
797 9396180 : s_64[4] =
798 9396180 : _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
799 :
800 : // Load lines a and b. Line a to lower 128, line b to upper 128
801 9396180 : const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
802 9396180 : const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
803 9396180 : const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
804 18792400 : const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
805 :
806 9396180 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
807 9396180 : ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
808 :
809 9396180 : y = h;
810 : do {
811 65996800 : src_ptr += 2 * src_stride;
812 65996800 : const __m256i res = y_convolve_6tap_8x2_avx2(
813 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
814 65993700 : sr_y_round_store_8x2_avx2(res, dst, dst_stride);
815 :
816 65995500 : ss_256[0] = ss_256[1];
817 65995500 : ss_256[1] = ss_256[2];
818 65995500 : dst += 2 * dst_stride;
819 65995500 : y -= 2;
820 65995500 : } while (y);
821 : }
822 11588100 : else if (w == 16) {
823 : __m128i s_128[6];
824 : __m256i ss_256[6], r[2];
825 :
826 7053300 : s_128[0] =
827 7053300 : _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
828 7053300 : s_128[1] =
829 7053300 : _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
830 7053300 : s_128[2] =
831 7053300 : _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
832 7053300 : s_128[3] =
833 7053300 : _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
834 7053300 : s_128[4] =
835 7053300 : _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
836 :
837 : // Load lines a and b. Line a to lower 128, line b to upper 128
838 7053300 : const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
839 7053300 : const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
840 7053300 : const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
841 14106600 : const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
842 :
843 7053300 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
844 7053300 : ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
845 :
846 7053300 : ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
847 7053300 : ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
848 :
849 7053300 : y = h;
850 : do {
851 65767500 : src_ptr += 2 * src_stride;
852 65767500 : y_convolve_6tap_16x2_avx2(
853 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
854 65760400 : sr_y_round_store_16x2_avx2(r, dst, dst_stride);
855 :
856 65766900 : ss_256[0] = ss_256[1];
857 65766900 : ss_256[1] = ss_256[2];
858 :
859 65766900 : ss_256[3] = ss_256[4];
860 65766900 : ss_256[4] = ss_256[5];
861 65766900 : dst += 2 * dst_stride;
862 65766900 : y -= 2;
863 65766900 : } while (y);
864 : }
865 : else {
866 : __m256i s_256[6], ss_256[6], tt_256[6], r[4];
867 :
868 4534810 : assert(!(w % 32));
869 :
870 4534810 : x = 0;
871 : do {
872 5456800 : const uint8_t *s = src_ptr + x;
873 5456800 : uint8_t *d = dst + x;
874 :
875 5456800 : s_256[0] =
876 5456800 : _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
877 5456800 : s_256[1] =
878 5456800 : _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
879 5456800 : s_256[2] =
880 5456800 : _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
881 5456800 : s_256[3] =
882 5456800 : _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
883 5456800 : s_256[4] =
884 5456800 : _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
885 :
886 5456800 : ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
887 10913600 : ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
888 5456800 : ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
889 5456800 : ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
890 :
891 5456800 : tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
892 5456800 : tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
893 5456800 : tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
894 5456800 : tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
895 :
896 5456800 : y = h;
897 : do {
898 67575000 : s += 2 * src_stride;
899 67575000 : y_convolve_6tap_32x2_avx2(s,
900 : src_stride,
901 : coeffs_256,
902 : s_256,
903 : ss_256,
904 : tt_256,
905 : r);
906 67553900 : sr_y_round_store_32x2_avx2(r, d, dst_stride);
907 :
908 67579600 : ss_256[0] = ss_256[1];
909 67579600 : ss_256[1] = ss_256[2];
910 67579600 : ss_256[3] = ss_256[4];
911 67579600 : ss_256[4] = ss_256[5];
912 :
913 67579600 : tt_256[0] = tt_256[1];
914 67579600 : tt_256[1] = tt_256[2];
915 67579600 : tt_256[3] = tt_256[4];
916 67579600 : tt_256[4] = tt_256[5];
917 67579600 : d += 2 * dst_stride;
918 67579600 : y -= 2;
919 67579600 : } while (y);
920 :
921 5461340 : x += 32;
922 5461340 : } while (x < w);
923 : }
924 : }
925 : }
926 : else {
927 : // vert_filt as 8 tap
928 1848400 : const uint8_t *src_ptr = src - 3 * src_stride;
929 :
930 1848400 : if (w <= 4) {
931 3958 : prepare_half_coeffs_8tap_ssse3(
932 : filter_params_y, subpel_y_q4, coeffs_128);
933 :
934 3958 : y = h;
935 :
936 3958 : if (w == 2) {
937 : __m128i s_16[8], ss_128[4];
938 :
939 0 : s_16[0] =
940 0 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
941 0 : s_16[1] =
942 0 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
943 0 : s_16[2] =
944 0 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
945 0 : s_16[3] =
946 0 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
947 0 : s_16[4] =
948 0 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
949 0 : s_16[5] =
950 0 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
951 0 : s_16[6] =
952 0 : _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));
953 :
954 0 : const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
955 0 : const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
956 0 : const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
957 0 : const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
958 0 : const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
959 0 : const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
960 :
961 0 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
962 0 : ss_128[1] = _mm_unpacklo_epi8(src23, src34);
963 0 : ss_128[2] = _mm_unpacklo_epi8(src45, src56);
964 :
965 : do {
966 0 : const __m128i res = y_convolve_8tap_2x2_ssse3(
967 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
968 0 : const __m128i r = sr_y_round_sse2(res);
969 0 : pack_store_2x2_sse2(r, dst, dst_stride);
970 0 : ss_128[0] = ss_128[1];
971 0 : ss_128[1] = ss_128[2];
972 0 : ss_128[2] = ss_128[3];
973 0 : src_ptr += 2 * src_stride;
974 0 : dst += 2 * dst_stride;
975 0 : y -= 2;
976 0 : } while (y);
977 : }
978 : else {
979 : __m128i s_32[8], ss_128[4];
980 :
981 3958 : assert(w == 4);
982 :
983 3958 : s_32[0] =
984 3958 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
985 3958 : s_32[1] =
986 3958 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
987 3958 : s_32[2] =
988 3958 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
989 3958 : s_32[3] =
990 3958 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
991 3958 : s_32[4] =
992 3958 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
993 3958 : s_32[5] =
994 3958 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
995 3958 : s_32[6] =
996 3958 : _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));
997 :
998 3958 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
999 3958 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1000 3958 : const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1001 3958 : const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1002 3958 : const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1003 7916 : const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
1004 :
1005 3958 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1006 7916 : ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1007 3958 : ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1008 :
1009 : do {
1010 23472 : const __m128i res = y_convolve_8tap_4x2_ssse3(
1011 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
1012 23472 : const __m128i r = sr_y_round_sse2(res);
1013 23472 : pack_store_4x2_sse2(r, dst, dst_stride);
1014 23472 : ss_128[0] = ss_128[1];
1015 23472 : ss_128[1] = ss_128[2];
1016 23472 : ss_128[2] = ss_128[3];
1017 23472 : src_ptr += 2 * src_stride;
1018 23472 : dst += 2 * dst_stride;
1019 23472 : y -= 2;
1020 23472 : } while (y);
1021 : }
1022 : }
1023 : else {
1024 1844440 : prepare_half_coeffs_8tap_avx2(
1025 : filter_params_y, subpel_y_q4, coeffs_256);
1026 :
1027 1851490 : if (w == 8) {
1028 : __m128i s_64[8];
1029 : __m256i ss_256[4];
1030 :
1031 863733 : s_64[0] =
1032 863733 : _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1033 863733 : s_64[1] =
1034 863733 : _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1035 863733 : s_64[2] =
1036 863733 : _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1037 863733 : s_64[3] =
1038 863733 : _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1039 863733 : s_64[4] =
1040 863733 : _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1041 863733 : s_64[5] =
1042 863733 : _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
1043 863733 : s_64[6] =
1044 863733 : _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
1045 :
1046 : // Load lines a and b. Line a to lower 128, line b to upper 128
1047 863733 : const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1048 863733 : const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1049 863733 : const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1050 863733 : const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1051 863733 : const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1052 1727470 : const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
1053 :
1054 863733 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1055 863733 : ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1056 863733 : ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1057 :
1058 863733 : y = h;
1059 : do {
1060 5833640 : const __m256i res = y_convolve_8tap_8x2_avx2(
1061 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
1062 5833630 : sr_y_round_store_8x2_avx2(res, dst, dst_stride);
1063 5833600 : ss_256[0] = ss_256[1];
1064 5833600 : ss_256[1] = ss_256[2];
1065 5833600 : ss_256[2] = ss_256[3];
1066 5833600 : src_ptr += 2 * src_stride;
1067 5833600 : dst += 2 * dst_stride;
1068 5833600 : y -= 2;
1069 5833600 : } while (y);
1070 : }
1071 987756 : else if (w == 16) {
1072 : __m128i s_128[8];
1073 : __m256i ss_256[8], r[2];
1074 :
1075 619540 : s_128[0] =
1076 619540 : _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1077 619540 : s_128[1] =
1078 619540 : _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1079 619540 : s_128[2] =
1080 619540 : _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1081 619540 : s_128[3] =
1082 619540 : _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1083 619540 : s_128[4] =
1084 619540 : _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1085 619540 : s_128[5] =
1086 619540 : _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
1087 619540 : s_128[6] =
1088 619540 : _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
1089 :
1090 : // Load lines a and b. Line a to lower 128, line b to upper 128
1091 619540 : const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1092 619540 : const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1093 619540 : const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1094 619540 : const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1095 619540 : const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1096 1239080 : const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
1097 :
1098 619540 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1099 619540 : ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1100 619540 : ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1101 :
1102 619540 : ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
1103 619540 : ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
1104 619540 : ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
1105 :
1106 619540 : y = h;
1107 : do {
1108 5751690 : y_convolve_8tap_16x2_avx2(
1109 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1110 5751890 : sr_y_round_store_16x2_avx2(r, dst, dst_stride);
1111 :
1112 5751670 : ss_256[0] = ss_256[1];
1113 5751670 : ss_256[1] = ss_256[2];
1114 5751670 : ss_256[2] = ss_256[3];
1115 :
1116 5751670 : ss_256[4] = ss_256[5];
1117 5751670 : ss_256[5] = ss_256[6];
1118 5751670 : ss_256[6] = ss_256[7];
1119 5751670 : src_ptr += 2 * src_stride;
1120 5751670 : dst += 2 * dst_stride;
1121 5751670 : y -= 2;
1122 5751670 : } while (y);
1123 : }
1124 : else {
1125 : __m256i s_256[8], ss_256[8], tt_256[8], r[4];
1126 :
1127 368216 : assert(!(w % 32));
1128 :
1129 368216 : x = 0;
1130 : do {
1131 443224 : const uint8_t *s = src_ptr + x;
1132 443224 : uint8_t *d = dst + x;
1133 :
1134 443224 : s_256[0] =
1135 443224 : _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1136 443224 : s_256[1] =
1137 443224 : _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1138 443224 : s_256[2] =
1139 443224 : _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1140 443224 : s_256[3] =
1141 443224 : _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1142 443224 : s_256[4] =
1143 443224 : _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1144 443224 : s_256[5] =
1145 443224 : _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
1146 443224 : s_256[6] =
1147 443224 : _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
1148 :
1149 443224 : ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1150 443224 : ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1151 443224 : ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1152 443224 : ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1153 443224 : ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1154 443224 : ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1155 :
1156 443224 : tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
1157 443224 : tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
1158 443224 : tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
1159 443224 : tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
1160 443224 : tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
1161 443224 : tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
1162 :
1163 443224 : y = h;
1164 : do {
1165 5378280 : y_convolve_8tap_32x2_avx2(s,
1166 : src_stride,
1167 : coeffs_256,
1168 : s_256,
1169 : ss_256,
1170 : tt_256,
1171 : r);
1172 5378190 : sr_y_round_store_32x2_avx2(r, d, dst_stride);
1173 :
1174 5378350 : ss_256[0] = ss_256[1];
1175 5378350 : ss_256[1] = ss_256[2];
1176 5378350 : ss_256[2] = ss_256[3];
1177 5378350 : ss_256[4] = ss_256[5];
1178 5378350 : ss_256[5] = ss_256[6];
1179 5378350 : ss_256[6] = ss_256[7];
1180 :
1181 5378350 : tt_256[0] = tt_256[1];
1182 5378350 : tt_256[1] = tt_256[2];
1183 5378350 : tt_256[2] = tt_256[3];
1184 5378350 : tt_256[4] = tt_256[5];
1185 5378350 : tt_256[5] = tt_256[6];
1186 5378350 : tt_256[6] = tt_256[7];
1187 5378350 : s += 2 * src_stride;
1188 5378350 : d += 2 * dst_stride;
1189 5378350 : y -= 2;
1190 5378350 : } while (y);
1191 :
1192 443301 : x += 32;
1193 443301 : } while (x < w);
1194 : }
1195 : }
1196 : }
1197 31828400 : }
1198 :
1199 15049100 : static INLINE void sr_x_2tap_32_avx2(const uint8_t *const src,
1200 : const __m256i coeffs[1],
1201 : uint8_t *const dst) {
1202 : __m256i r[2];
1203 :
1204 15049100 : x_convolve_2tap_32_avx2(src, coeffs, r);
1205 15048100 : sr_x_round_store_32_avx2(r, dst);
1206 15049400 : }
1207 :
1208 3521320 : static INLINE void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
1209 : uint8_t *const dst) {
1210 3521320 : const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1211 7042630 : const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1212 3521320 : const __m256i d = _mm256_avg_epu8(s0, s1);
1213 : _mm256_storeu_si256((__m256i *)dst, d);
1214 3521320 : }
1215 :
1216 131651000 : static INLINE void sr_x_6tap_32_avx2(const uint8_t *const src,
1217 : const __m256i coeffs[3],
1218 : const __m256i *const filt,
1219 : uint8_t *const dst) {
1220 : __m256i r[2];
1221 :
1222 131651000 : x_convolve_6tap_16x2_avx2(src, 16, coeffs, filt, r);
1223 131646000 : sr_x_round_store_32_avx2(r, dst);
1224 131676000 : }
1225 :
1226 : SIMD_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
1227 : const __m256i coeffs[4],
1228 : const __m256i *const filt,
1229 : uint8_t *const dst) {
1230 : __m256i r[2];
1231 :
1232 : x_convolve_8tap_16x2_avx2(src, 16, coeffs, filt, r);
1233 9742950 : sr_x_round_store_32_avx2(r, dst);
1234 9742950 : }
1235 :
// Horizontal-only ("x", single-round) sub-pel convolution producing 8-bit
// output. Only the horizontal filter is applied; filter_params_y and
// subpel_y_q4 are unused and kept so the signature matches the other
// convolve variants. Dispatches on the x filter's tap count (2/4/6/8) and
// on the block width w, using SSE paths for narrow blocks (w <= 8) and
// AVX2 paths for wider ones. Narrow paths process two rows per loop
// iteration, so h is expected to be even there; wide paths process one
// row per iteration.
void eb_av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
                               uint8_t *dst, int32_t dst_stride, int32_t w,
                               int32_t h, InterpFilterParams *filter_params_x,
                               InterpFilterParams *filter_params_y,
                               const int32_t subpel_x_q4,
                               const int32_t subpel_y_q4,
                               ConvolveParams *conv_params) {
    int32_t y = h;  // remaining row count
    __m128i coeffs_128[4];
    __m256i coeffs_256[4];

    // Silence unused-parameter warnings in release builds; conv_params is
    // still read by the asserts below (which compile out with NDEBUG).
    (void)filter_params_y;
    (void)subpel_y_q4;
    (void)conv_params;

    assert(conv_params->round_0 == 3);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    if (is_convolve_2tap(filter_params_x->filter_ptr)) {
        // horz_filt as 2 tap
        const uint8_t *src_ptr = src;

        // subpel_x_q4 == 8 is the exact half-pel position: both taps are
        // equal, so filtering degenerates to pixel averaging (handled in
        // the else branch below).
        if (subpel_x_q4 != 8) {
            if (w <= 8) {
                prepare_half_coeffs_2tap_ssse3(
                    filter_params_x, subpel_x_q4, coeffs_128);

                if (w == 2) {
                    do {
                        const __m128i res = x_convolve_2tap_2x2_sse4_1(
                            src_ptr, src_stride, coeffs_128);
                        const __m128i r = sr_x_round_sse2(res);
                        pack_store_2x2_sse2(r, dst, dst_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        y -= 2;
                    } while (y);
                }
                else if (w == 4) {
                    do {
                        const __m128i res = x_convolve_2tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128);
                        const __m128i r = sr_x_round_sse2(res);
                        pack_store_4x2_sse2(r, dst, dst_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        y -= 2;
                    } while (y);
                }
                else {
                    assert(w == 8);

                    do {
                        __m128i res[2];

                        // Two 8-pixel rows per iteration; pack both rows
                        // into one register and split the store.
                        x_convolve_2tap_8x2_ssse3(
                            src_ptr, src_stride, coeffs_128, res);
                        res[0] = sr_x_round_sse2(res[0]);
                        res[1] = sr_x_round_sse2(res[1]);
                        const __m128i d = _mm_packus_epi16(res[0], res[1]);
                        _mm_storel_epi64((__m128i *)dst, d);
                        _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);

                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        y -= 2;
                    } while (y);
                }
            }
            else {
                prepare_half_coeffs_2tap_avx2(
                    filter_params_x, subpel_x_q4, coeffs_256);

                if (w == 16) {
                    do {
                        __m256i r[2];

                        x_convolve_2tap_16x2_avx2(
                            src_ptr, src_stride, coeffs_256, r);
                        sr_x_round_store_16x2_avx2(r, dst, dst_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        y -= 2;
                    } while (y);
                }
                else if (w == 32) {
                    do {
                        sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
                        src_ptr += src_stride;
                        dst += dst_stride;
                    } while (--y);
                }
                else if (w == 64) {
                    do {
                        sr_x_2tap_32_avx2(
                            src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
                        sr_x_2tap_32_avx2(
                            src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
                        src_ptr += src_stride;
                        dst += dst_stride;
                    } while (--y);
                }
                else {
                    assert(w == 128);

                    do {
                        sr_x_2tap_32_avx2(
                            src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
                        sr_x_2tap_32_avx2(
                            src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
                        sr_x_2tap_32_avx2(
                            src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
                        sr_x_2tap_32_avx2(
                            src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
                        src_ptr += src_stride;
                        dst += dst_stride;
                    } while (--y);
                }
            }
        }
        else {
            // average to get half pel
            if (w == 2) {
                do {
                    __m128i s_128;

                    s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
                    // Average each byte with its right neighbor.
                    const __m128i s1 = _mm_srli_si128(s_128, 1);
                    const __m128i d = _mm_avg_epu8(s_128, s1);
                    *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
                    *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);

                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
            else if (w == 4) {
                do {
                    __m128i s_128;

                    s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
                    const __m128i s1 = _mm_srli_si128(s_128, 1);
                    const __m128i d = _mm_avg_epu8(s_128, s1);
                    xx_storel_32(dst, d);
                    *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);

                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
            else if (w == 8) {
                do {
                    // Each 16-byte load covers the 8 output pixels plus the
                    // extra right neighbor needed for the average.
                    const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
                    const __m128i s10 =
                        _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
                    const __m128i s01 = _mm_srli_si128(s00, 1);
                    const __m128i s11 = _mm_srli_si128(s10, 1);
                    const __m128i d0 = _mm_avg_epu8(s00, s01);
                    const __m128i d1 = _mm_avg_epu8(s10, s11);
                    _mm_storel_epi64((__m128i *)dst, d0);
                    _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);

                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
            else if (w == 16) {
                do {
                    const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
                    const __m128i s01 =
                        _mm_loadu_si128((__m128i *)(src_ptr + 1));
                    const __m128i s10 =
                        _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
                    const __m128i s11 =
                        _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
                    const __m128i d0 = _mm_avg_epu8(s00, s01);
                    const __m128i d1 = _mm_avg_epu8(s10, s11);
                    _mm_storeu_si128((__m128i *)dst, d0);
                    _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);

                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
            else if (w == 32) {
                do {
                    sr_x_2tap_32_avg_avx2(src_ptr, dst);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
            else if (w == 64) {
                do {
                    sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
                    sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
            else {
                assert(w == 128);

                do {
                    sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
                    sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
                    sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
                    sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
        }
    }
    else if (is_convolve_4tap(filter_params_x->filter_ptr)) {
        // horz_filt as 4 tap
        // Back the source pointer up by one so the 4-tap window is centered.
        const uint8_t *src_ptr = src - 1;

        prepare_half_coeffs_4tap_ssse3(
            filter_params_x, subpel_x_q4, coeffs_128);

        if (w == 2) {
            do {
                const __m128i res =
                    x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
                const __m128i r = sr_x_round_sse2(res);
                pack_store_2x2_sse2(r, dst, dst_stride);
                src_ptr += 2 * src_stride;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        }
        else {
            // The 4-tap filter is only selected for w <= 4.
            assert(w == 4);

            do {
                const __m128i res =
                    x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
                const __m128i r = sr_x_round_sse2(res);
                pack_store_4x2_sse2(r, dst, dst_stride);
                src_ptr += 2 * src_stride;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        }
    }
    else {
        // 6-tap and 8-tap paths share the first three shuffle masks.
        __m256i filt_256[4];

        filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
        filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
        filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);

        if (is_convolve_6tap(filter_params_x->filter_ptr)) {
            // horz_filt as 6 tap
            const uint8_t *src_ptr = src - 2;

            prepare_half_coeffs_6tap_avx2(
                filter_params_x, subpel_x_q4, coeffs_256);

            if (w == 8) {
                do {
                    const __m256i res = x_convolve_6tap_8x2_avx2(
                        src_ptr, src_stride, coeffs_256, filt_256);
                    sr_x_round_store_8x2_avx2(res, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
            else if (w == 16) {
                do {
                    __m256i r[2];

                    x_convolve_6tap_16x2_avx2(
                        src_ptr, src_stride, coeffs_256, filt_256, r);
                    sr_x_round_store_16x2_avx2(r, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
            else if (w == 32) {
                do {
                    sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
            else if (w == 64) {
                do {
                    sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
                    sr_x_6tap_32_avx2(
                        src_ptr + 32, coeffs_256, filt_256, dst + 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
            else {
                assert(w == 128);

                do {
                    sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
                    sr_x_6tap_32_avx2(
                        src_ptr + 1 * 32, coeffs_256, filt_256, dst + 1 * 32);
                    sr_x_6tap_32_avx2(
                        src_ptr + 2 * 32, coeffs_256, filt_256, dst + 2 * 32);
                    sr_x_6tap_32_avx2(
                        src_ptr + 3 * 32, coeffs_256, filt_256, dst + 3 * 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
        }
        else {
            // horz_filt as 8 tap
            const uint8_t *src_ptr = src - 3;

            // The 8-tap path additionally needs the fourth shuffle mask.
            filt_256[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);

            prepare_half_coeffs_8tap_avx2(
                filter_params_x, subpel_x_q4, coeffs_256);

            if (w == 8) {
                do {
                    const __m256i res = x_convolve_8tap_8x2_avx2(
                        src_ptr, src_stride, coeffs_256, filt_256);
                    sr_x_round_store_8x2_avx2(res, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
            else if (w == 16) {
                do {
                    __m256i r[2];

                    x_convolve_8tap_16x2_avx2(
                        src_ptr, src_stride, coeffs_256, filt_256, r);
                    sr_x_round_store_16x2_avx2(r, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
            else if (w == 32) {
                do {
                    sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
            else if (w == 64) {
                do {
                    sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
                    sr_x_8tap_32_avx2(
                        src_ptr + 32, coeffs_256, filt_256, dst + 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
            else {
                assert(w == 128);

                do {
                    sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
                    sr_x_8tap_32_avx2(
                        src_ptr + 1 * 32, coeffs_256, filt_256, dst + 1 * 32);
                    sr_x_8tap_32_avx2(
                        src_ptr + 2 * 32, coeffs_256, filt_256, dst + 2 * 32);
                    sr_x_8tap_32_avx2(
                        src_ptr + 3 * 32, coeffs_256, filt_256, dst + 3 * 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
        }
    }
}
1619 :
1620 : // Loads and stores to do away with the tedium of casting the address
1621 : // to the right type.
1622 0 : static INLINE __m128i xx_loadl_32(const void *a) {
1623 : int val;
1624 0 : memcpy(&val, a, sizeof(val));
1625 0 : return _mm_cvtsi32_si128(val);
1626 : }
1627 1190760000 : static INLINE __m128i xx_load_128(const void *a) {
1628 1190760000 : return _mm_load_si128((const __m128i *)a);
1629 : }
1630 :
1631 1032330000 : static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
1632 : const __m256i s1) {
1633 2064650000 : const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));
1634 3096980000 : return _mm256_abs_epi16(
1635 : _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4)));
1636 : // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54)
1637 : }
// Build an 8-bit DIFFWTD compound blend mask from two 8-bit predictors:
// mask[i] = |base + (|src0[i] - src1[i]| >> 4)|, where base is 38, or
// 38 - AOM_BLEND_A64_MAX_ALPHA for the inverted type (DIFFWTD_38_INV).
// Processes 4 rows per iteration for w == 4 and w == 8, 2 rows for
// w == 16, and one row (in 32-pixel chunks) otherwise; h must therefore be
// a multiple of the per-iteration row count, and w a multiple of 32 in the
// widest path. The output mask is written contiguously (stride == w).
void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
                                          DIFFWTD_MASK_TYPE mask_type,
                                          const uint8_t *src0, int src0_stride,
                                          const uint8_t *src1, int src1_stride,
                                          int h, int w) {
  const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
  const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
  int i = 0;
  if (4 == w) {
    do {
      // Gather four 4-byte rows of each source into one 128-bit register,
      // widen to 16 bits, and compute 16 mask values in one shot.
      const __m128i s0A = xx_loadl_32(src0);
      const __m128i s0B = xx_loadl_32(src0 + src0_stride);
      const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2);
      const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3);
      const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
      const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
      const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
      const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);

      const __m128i s1A = xx_loadl_32(src1);
      const __m128i s1B = xx_loadl_32(src1 + src1_stride);
      const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2);
      const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3);
      const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
      const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
      const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
      const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
      const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
      // packus interleaves lanes; the permute restores A,B,C,D row order.
      const __m128i x_m8 =
          _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
      xx_storeu_128(mask, x_m8);
      src0 += (src0_stride << 2);
      src1 += (src1_stride << 2);
      mask += 16;
      i += 4;
    } while (i < h);
  }
  else if (8 == w) {
    do {
      // Rows are paired as (A,C) and (B,D) so that the final packus emits
      // them back in A,B,C,D order. NOTE(review): the names s1AB_w/s1CD_w
      // are misleading — they actually hold src1 rows (A,C) and (B,D),
      // mirroring s0AC_w/s0BD_w.
      const __m128i s0A = xx_loadl_64(src0);
      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
      const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
      const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
      const __m128i s1A = xx_loadl_64(src1);
      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
      const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
      const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
      const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w);
      const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w);
      const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
      yy_storeu_256(mask, m8);
      src0 += src0_stride << 2;
      src1 += src1_stride << 2;
      mask += 32;
      i += 4;
    } while (i < h);
  }
  else if (16 == w) {
    do {
      // NOTE(review): xx_load_128 is an *aligned* load — callers appear to
      // guarantee 16-byte-aligned predictor rows here; verify at call sites.
      const __m128i s0A = xx_load_128(src0);
      const __m128i s0B = xx_load_128(src0 + src0_stride);
      const __m128i s1A = xx_load_128(src1);
      const __m128i s1B = xx_load_128(src1 + src1_stride);
      const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
      const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
      const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
      const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);

      const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
      const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);

      // packus interleaves the two rows per 128-bit lane; the permute puts
      // row A's 16 bytes before row B's.
      const __m256i m8 =
          _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
      yy_storeu_256(mask, m8);
      src0 += src0_stride << 1;
      src1 += src1_stride << 1;
      mask += 32;
      i += 2;
    } while (i < h);
  }
  else {
    // General path: one row per iteration, 32 pixels per inner step.
    do {
      int j = 0;
      do {
        const __m256i s0 = yy_loadu_256(src0 + j);
        const __m256i s1 = yy_loadu_256(src1 + j);
        const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
        const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
        const __m256i s0H =
            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
        const __m256i s1H =
            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
        const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
        const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
        const __m256i m8 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
        yy_storeu_256(mask + j, m8);
        j += 32;
      } while (j < w);
      src0 += src0_stride;
      src1 += src1_stride;
      mask += w;
      i += 1;
    } while (i < h);
  }
}
1749 : ////////
1750 :
1751 372286000 : static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
1752 : const __m256i *data_src1,
1753 : const __m256i *round_const,
1754 : const __m256i *mask_base_16,
1755 : const __m256i *clip_diff, int round) {
1756 372286000 : const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
1757 744573000 : const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
1758 372286000 : const __m256i diff = _mm256_max_epu16(diffa, diffb);
1759 : const __m256i diff_round =
1760 1116860000 : _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
1761 372286000 : const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
1762 372286000 : const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
1763 372286000 : const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
1764 372286000 : return diff_clamp;
1765 : }
1766 :
1767 325266000 : static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0,
1768 : const __m256i *data_src1,
1769 : const __m256i *round_const,
1770 : const __m256i *mask_base_16,
1771 : const __m256i *clip_diff,
1772 : int round) {
1773 325266000 : const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
1774 650533000 : const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
1775 325266000 : const __m256i diff = _mm256_max_epu16(diffa, diffb);
1776 : const __m256i diff_round =
1777 975799000 : _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
1778 325266000 : const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
1779 325266000 : const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
1780 325266000 : const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
1781 325266000 : const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp);
1782 325266000 : return diff_const_16;
1783 : }
1784 :
// Build a DIFFWTD_38 compound blend mask (one weight per pixel, clamped to
// AOM_BLEND_A64_MAX_ALPHA) directly from two 16-bit convolve buffers.
// Per pixel: mask = min(38 + round(|src0 - src1|, shift) >> DIFF_FACTOR_LOG2,
// AOM_BLEND_A64_MAX_ALPHA).
// Each width class gathers enough rows/columns per iteration to fill whole
// 256-bit vectors. _mm256_packus_epi16 packs within 128-bit lanes, so every
// pack is followed by _mm256_permute4x64_epi64(..., 0xd8) to restore linear
// byte order before the store.
static INLINE void build_compound_diffwtd_mask_d16_avx2(
    uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
    const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
  const int mask_base = 38;
  // Rounding offset: half of (1 << shift), added before shifting right.
  const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
  const __m256i y38 = _mm256_set1_epi16(mask_base);
  const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  int i = 0;
  if (w == 4) {
    // 4 columns: gather 4 rows (16 pixels) into one 256-bit vector.
    do {
      const __m128i s0A = xx_loadl_64(src0);
      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
      const __m128i s1A = xx_loadl_64(src1);
      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
      const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
                                      _mm_unpacklo_epi64(s0A, s0B));
      const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
                                      _mm_unpacklo_epi64(s1A, s1B));
      const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
      // Only the low 16 mask bytes are meaningful; pack against zero.
      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
      xx_storeu_128(mask,
                    _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
      src0 += src0_stride << 2;
      src1 += src1_stride << 2;
      mask += 16;
      i += 4;
    } while (i < h);
  }
  else if (w == 8) {
    // 8 columns: 4 rows (32 pixels) per iteration, two rows per vector.
    do {
      const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
      const __m256i s0CD =
          yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
      const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
      const __m256i s1CD =
          yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
      const __m256i m16AB =
          calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
      const __m256i m16CD =
          calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
      const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
      src0 += src0_stride << 2;
      src1 += src1_stride << 2;
      mask += 32;
      i += 4;
    } while (i < h);
  }
  else if (w == 16) {
    // 16 columns: 2 rows (32 pixels) per iteration, one row per vector.
    do {
      const __m256i s0A = yy_loadu_256(src0);
      const __m256i s0B = yy_loadu_256(src0 + src0_stride);
      const __m256i s1A = yy_loadu_256(src1);
      const __m256i s1B = yy_loadu_256(src1 + src1_stride);
      const __m256i m16A =
          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
      const __m256i m16B =
          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
      src0 += src0_stride << 1;
      src1 += src1_stride << 1;
      mask += 32;
      i += 2;
    } while (i < h);
  }
  else if (w == 32) {
    // 32 columns: one full row (two vectors) per iteration.
    do {
      const __m256i s0A = yy_loadu_256(src0);
      const __m256i s0B = yy_loadu_256(src0 + 16);
      const __m256i s1A = yy_loadu_256(src1);
      const __m256i s1B = yy_loadu_256(src1 + 16);
      const __m256i m16A =
          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
      const __m256i m16B =
          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
      src0 += src0_stride;
      src1 += src1_stride;
      mask += 32;
      i += 1;
    } while (i < h);
  }
  else if (w == 64) {
    // 64 columns: one row, four source vectors, two mask stores.
    do {
      const __m256i s0A = yy_loadu_256(src0);
      const __m256i s0B = yy_loadu_256(src0 + 16);
      const __m256i s0C = yy_loadu_256(src0 + 32);
      const __m256i s0D = yy_loadu_256(src0 + 48);
      const __m256i s1A = yy_loadu_256(src1);
      const __m256i s1B = yy_loadu_256(src1 + 16);
      const __m256i s1C = yy_loadu_256(src1 + 32);
      const __m256i s1D = yy_loadu_256(src1 + 48);
      const __m256i m16A =
          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
      const __m256i m16B =
          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
      const __m256i m16C =
          calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
      const __m256i m16D =
          calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
      src0 += src0_stride;
      src1 += src1_stride;
      mask += 64;
      i += 1;
    } while (i < h);
  }
  else {
    // Remaining case (w == 128): one row, eight source vectors per side.
    do {
      const __m256i s0A = yy_loadu_256(src0);
      const __m256i s0B = yy_loadu_256(src0 + 16);
      const __m256i s0C = yy_loadu_256(src0 + 32);
      const __m256i s0D = yy_loadu_256(src0 + 48);
      const __m256i s0E = yy_loadu_256(src0 + 64);
      const __m256i s0F = yy_loadu_256(src0 + 80);
      const __m256i s0G = yy_loadu_256(src0 + 96);
      const __m256i s0H = yy_loadu_256(src0 + 112);
      const __m256i s1A = yy_loadu_256(src1);
      const __m256i s1B = yy_loadu_256(src1 + 16);
      const __m256i s1C = yy_loadu_256(src1 + 32);
      const __m256i s1D = yy_loadu_256(src1 + 48);
      const __m256i s1E = yy_loadu_256(src1 + 64);
      const __m256i s1F = yy_loadu_256(src1 + 80);
      const __m256i s1G = yy_loadu_256(src1 + 96);
      const __m256i s1H = yy_loadu_256(src1 + 112);
      const __m256i m16A =
          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
      const __m256i m16B =
          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
      const __m256i m16C =
          calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
      const __m256i m16D =
          calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
      const __m256i m16E =
          calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
      const __m256i m16F =
          calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
      const __m256i m16G =
          calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
      const __m256i m16H =
          calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
      const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
      const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
      yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
      yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
      src0 += src0_stride;
      src1 += src1_stride;
      mask += 128;
      i += 1;
    } while (i < h);
  }
}
// Inverted counterpart of build_compound_diffwtd_mask_d16_avx2: identical
// load/pack structure, but calc_mask_d16_inv_avx2 stores the complementary
// weight (AOM_BLEND_A64_MAX_ALPHA - mask) for each pixel.
// See the forward builder for the per-width layout and the reason behind the
// packus + permute4x64(0xd8) repacking.
static INLINE void build_compound_diffwtd_mask_d16_inv_avx2(
    uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
    const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
  const int mask_base = 38;
  // Rounding offset: half of (1 << shift), added before shifting right.
  const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
  const __m256i y38 = _mm256_set1_epi16(mask_base);
  const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  int i = 0;
  if (w == 4) {
    // 4 columns: gather 4 rows (16 pixels) into one 256-bit vector.
    do {
      const __m128i s0A = xx_loadl_64(src0);
      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
      const __m128i s1A = xx_loadl_64(src1);
      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
      const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
                                      _mm_unpacklo_epi64(s0A, s0B));
      const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
                                      _mm_unpacklo_epi64(s1A, s1B));
      const __m256i m16 =
          calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
      // Only the low 16 mask bytes are meaningful; pack against zero.
      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
      xx_storeu_128(mask,
                    _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
      src0 += src0_stride << 2;
      src1 += src1_stride << 2;
      mask += 16;
      i += 4;
    } while (i < h);
  }
  else if (w == 8) {
    // 8 columns: 4 rows (32 pixels) per iteration, two rows per vector.
    do {
      const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
      const __m256i s0CD =
          yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
      const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
      const __m256i s1CD =
          yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
      const __m256i m16AB =
          calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
      const __m256i m16CD =
          calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
      const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
      src0 += src0_stride << 2;
      src1 += src1_stride << 2;
      mask += 32;
      i += 4;
    } while (i < h);
  }
  else if (w == 16) {
    // 16 columns: 2 rows (32 pixels) per iteration, one row per vector.
    do {
      const __m256i s0A = yy_loadu_256(src0);
      const __m256i s0B = yy_loadu_256(src0 + src0_stride);
      const __m256i s1A = yy_loadu_256(src1);
      const __m256i s1B = yy_loadu_256(src1 + src1_stride);
      const __m256i m16A =
          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
      const __m256i m16B =
          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
      src0 += src0_stride << 1;
      src1 += src1_stride << 1;
      mask += 32;
      i += 2;
    } while (i < h);
  }
  else if (w == 32) {
    // 32 columns: one full row (two vectors) per iteration.
    do {
      const __m256i s0A = yy_loadu_256(src0);
      const __m256i s0B = yy_loadu_256(src0 + 16);
      const __m256i s1A = yy_loadu_256(src1);
      const __m256i s1B = yy_loadu_256(src1 + 16);
      const __m256i m16A =
          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
      const __m256i m16B =
          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
      src0 += src0_stride;
      src1 += src1_stride;
      mask += 32;
      i += 1;
    } while (i < h);
  }
  else if (w == 64) {
    // 64 columns: one row, four source vectors, two mask stores.
    do {
      const __m256i s0A = yy_loadu_256(src0);
      const __m256i s0B = yy_loadu_256(src0 + 16);
      const __m256i s0C = yy_loadu_256(src0 + 32);
      const __m256i s0D = yy_loadu_256(src0 + 48);
      const __m256i s1A = yy_loadu_256(src1);
      const __m256i s1B = yy_loadu_256(src1 + 16);
      const __m256i s1C = yy_loadu_256(src1 + 32);
      const __m256i s1D = yy_loadu_256(src1 + 48);
      const __m256i m16A =
          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
      const __m256i m16B =
          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
      const __m256i m16C =
          calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
      const __m256i m16D =
          calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
      src0 += src0_stride;
      src1 += src1_stride;
      mask += 64;
      i += 1;
    } while (i < h);
  }
  else {
    // Remaining case (w == 128): one row, eight source vectors per side.
    do {
      const __m256i s0A = yy_loadu_256(src0);
      const __m256i s0B = yy_loadu_256(src0 + 16);
      const __m256i s0C = yy_loadu_256(src0 + 32);
      const __m256i s0D = yy_loadu_256(src0 + 48);
      const __m256i s0E = yy_loadu_256(src0 + 64);
      const __m256i s0F = yy_loadu_256(src0 + 80);
      const __m256i s0G = yy_loadu_256(src0 + 96);
      const __m256i s0H = yy_loadu_256(src0 + 112);
      const __m256i s1A = yy_loadu_256(src1);
      const __m256i s1B = yy_loadu_256(src1 + 16);
      const __m256i s1C = yy_loadu_256(src1 + 32);
      const __m256i s1D = yy_loadu_256(src1 + 48);
      const __m256i s1E = yy_loadu_256(src1 + 64);
      const __m256i s1F = yy_loadu_256(src1 + 80);
      const __m256i s1G = yy_loadu_256(src1 + 96);
      const __m256i s1H = yy_loadu_256(src1 + 112);
      const __m256i m16A =
          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
      const __m256i m16B =
          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
      const __m256i m16C =
          calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
      const __m256i m16D =
          calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
      const __m256i m16E =
          calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
      const __m256i m16F =
          calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
      const __m256i m16G =
          calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
      const __m256i m16H =
          calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
      const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
      const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
      yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
      yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
      src0 += src0_stride;
      src1 += src1_stride;
      mask += 128;
      i += 1;
    } while (i < h);
  }
}
2116 26352300 : void av1_build_compound_diffwtd_mask_d16_avx2(
2117 : uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
2118 : int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
2119 : ConvolveParams *conv_params, int bd) {
2120 26352300 : const int shift =
2121 26352300 : 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
2122 : // When rounding constant is added, there is a possibility of overflow.
2123 : // However that much precision is not required. Code should very well work for
2124 : // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But
2125 : // there is a possibility of corner case bugs.
2126 : assert(DIFF_FACTOR_LOG2 == 4);
2127 : assert(AOM_BLEND_A64_MAX_ALPHA == 64);
2128 :
2129 26352300 : if (mask_type == DIFFWTD_38) {
2130 14126700 : build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
2131 : src1_stride, h, w, shift);
2132 : }
2133 : else {
2134 12225600 : build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
2135 : src1_stride, h, w, shift);
2136 : }
2137 26366700 : }
2138 :
2139 :
2140 : #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
2141 :
/**
 * See av1_wedge_sse_from_residuals_c
 *
 * For each pixel the loop forms t = MAX_MASK_VALUE * r1[i] + m[i] * d[i]
 * (madd over interleaved (d, r1) and (m, MAX_MASK_VALUE) pairs), saturates
 * t to int16 via packs_epi32 — presumably mirroring the clamp in the C
 * reference; verify against av1_wedge_sse_from_residuals_c — then
 * accumulates t * t in 64-bit lanes. The total is rounded down by
 * 2 * WEDGE_WEIGHT_BITS before returning.
 */
uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
                                           const uint8_t *m, int N) {
  int n = -N;  // negative index walking up to zero

  uint64_t csse;

  const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
  // 0x00000000ffffffff per 64-bit lane: masks out odd 32-bit halves so a
  // pair of 32-bit sums can be zero-extended into 64-bit lanes.
  const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff);

  __m256i v_acc0_q = _mm256_setzero_si256();

  assert(N % 64 == 0);

  // Walk the buffers from the end using the negative index n.
  r1 += N;
  d += N;
  m += N;

  do {
    const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n));
    const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n));
    const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n));

    // Interleave (d, r1) so that madd against (m, MAX_MASK_VALUE) yields
    // d*m + r1*MAX_MASK_VALUE per 32-bit result.
    const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w);
    const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w);
    const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b);

    const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w);
    const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w);

    const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w);
    const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w);

    // Saturating pack to int16 before squaring.
    const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d);

    const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w);

    // Widen each pair of 32-bit squares into 64-bit lanes (low half masked,
    // high half shifted down) so the accumulator cannot overflow 32 bits.
    const __m256i v_sum0_q = _mm256_add_epi64(
        _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32));

    v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q);

    n += 16;
  } while (n);

  // Horizontal reduction of the four 64-bit accumulators to one scalar.
  v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8));
  __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q);
  __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1);
  v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
#if ARCH_X86_64
  csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
#else
  // 32-bit targets lack _mm_extract_epi64; spill through memory.
  xx_storel_64(&csse, v_acc_q_0);
#endif

  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}
2201 :
2202 :
2203 458849000 : static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
2204 : const uint8_t *pred_ptr) {
2205 458886000 : __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
2206 458884000 : __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
2207 458884000 : __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
2208 917768000 : __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
2209 458884000 : __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
2210 917768000 : __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
2211 458884000 : const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
2212 458884000 : const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
2213 : _mm256_storeu_si256((__m256i *)(diff_ptr), d_0);
2214 458884000 : _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
2215 458884000 : }
2216 :
2217 :
2218 29982100 : static INLINE void aom_subtract_block_16xn_avx2(
2219 : int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
2220 : ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
2221 546887000 : for (int32_t j = 0; j < rows; ++j) {
2222 516815000 : __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
2223 516905000 : __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
2224 516905000 : __m256i s_0 = _mm256_cvtepu8_epi16(s);
2225 516905000 : __m256i p_0 = _mm256_cvtepu8_epi16(p);
2226 516905000 : const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
2227 : _mm256_storeu_si256((__m256i *)(diff_ptr), d_0);
2228 516905000 : src_ptr += src_stride;
2229 516905000 : pred_ptr += pred_stride;
2230 516905000 : diff_ptr += diff_stride;
2231 : }
2232 30160900 : }
2233 :
2234 12979000 : static INLINE void aom_subtract_block_32xn_avx2(
2235 : int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
2236 : ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
2237 311854000 : for (int32_t j = 0; j < rows; ++j) {
2238 298992000 : subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
2239 298875000 : src_ptr += src_stride;
2240 298875000 : pred_ptr += pred_stride;
2241 298875000 : diff_ptr += diff_stride;
2242 : }
2243 12862300 : }
2244 2476930 : static INLINE void aom_subtract_block_64xn_avx2(
2245 : int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
2246 : ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
2247 83529900 : for (int32_t j = 0; j < rows; ++j) {
2248 81052800 : subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
2249 81052700 : subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
2250 81052900 : src_ptr += src_stride;
2251 81052900 : pred_ptr += pred_stride;
2252 81052900 : diff_ptr += diff_stride;
2253 : }
2254 2477080 : }
2255 0 : static INLINE void aom_subtract_block_128xn_avx2(
2256 : int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
2257 : ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
2258 0 : for (int32_t j = 0; j < rows; ++j) {
2259 0 : subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
2260 0 : subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
2261 0 : subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
2262 0 : subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
2263 0 : src_ptr += src_stride;
2264 0 : pred_ptr += pred_stride;
2265 0 : diff_ptr += diff_stride;
2266 : }
2267 0 : }
// Width-dispatched block subtraction: diff = src - pred, widened to int16_t.
// Widths without a dedicated AVX2 kernel fall back to the SSE2 version.
void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
                             ptrdiff_t pred_stride) {
  if (cols == 16) {
    aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
                                 src_stride, pred_ptr, pred_stride);
  } else if (cols == 32) {
    aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
                                 src_stride, pred_ptr, pred_stride);
  } else if (cols == 64) {
    aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
                                 src_stride, pred_ptr, pred_stride);
  } else if (cols == 128) {
    aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
                                  src_stride, pred_ptr, pred_stride);
  } else {
    eb_aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
                               src_stride, pred_ptr, pred_stride);
  }
}
2295 :
2296 :
2297 :
2298 0 : static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
2299 : const uint8_t *b, int b_stride, __m256i *sum) {
2300 0 : const __m128i v_a0 = xx_loadl_32(a);
2301 0 : const __m128i v_a1 = xx_loadl_32(a + a_stride);
2302 0 : const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
2303 0 : const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
2304 0 : const __m128i v_b0 = xx_loadl_32(b);
2305 0 : const __m128i v_b1 = xx_loadl_32(b + b_stride);
2306 0 : const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
2307 0 : const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
2308 0 : const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
2309 : _mm_unpacklo_epi32(v_a2, v_a3));
2310 0 : const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
2311 : _mm_unpacklo_epi32(v_b2, v_b3));
2312 0 : const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
2313 0 : const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
2314 0 : const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
2315 0 : *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
2316 0 : }
2317 :
2318 :
2319 50679100 : static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
2320 : const uint8_t *b, int b_stride, __m256i *sum) {
2321 50679100 : const __m128i v_a0 = xx_loadl_64(a);
2322 50676400 : const __m128i v_a1 = xx_loadl_64(a + a_stride);
2323 50671200 : const __m128i v_b0 = xx_loadl_64(b);
2324 50668700 : const __m128i v_b1 = xx_loadl_64(b + b_stride);
2325 101339000 : const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
2326 101339000 : const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
2327 50669500 : const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
2328 50669500 : *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
2329 50669500 : }
2330 :
2331 52427600 : static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
2332 : const uint8_t *b) {
2333 52427600 : const __m256i v_a0 = yy_loadu_256(a);
2334 52422500 : const __m256i v_b0 = yy_loadu_256(b);
2335 52416100 : const __m256i zero = _mm256_setzero_si256();
2336 52416100 : const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
2337 52416100 : const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
2338 52416100 : const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
2339 52416100 : const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
2340 52416100 : const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
2341 52416100 : const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
2342 104832000 : *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
2343 52416100 : *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
2344 52416100 : }
2345 :
2346 17811100 : static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
2347 : int64_t sum;
2348 17811100 : __m256i zero = _mm256_setzero_si256();
2349 17811100 : const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
2350 35622100 : const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
2351 17811100 : const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
2352 17811100 : const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
2353 17811100 : _mm256_extracti128_si256(sum_4x64, 1));
2354 17811100 : const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
2355 17811100 : xx_storel_64(&sum, sum_1x64);
2356 17811000 : return sum;
2357 : }
// Sum of squared differences between two 8-bit pixel buffers.
// Dispatches on width; every path widens pixels to 16 bits, accumulates
// squared differences into eight 32-bit lanes via madd, and finishes with a
// 64-bit-safe horizontal reduction in summary_all_avx2.
int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int width, int height) {
  int32_t y = 0;
  int64_t sse = 0;
  __m256i sum = _mm256_setzero_si256();
  __m256i zero = _mm256_setzero_si256();
  switch (width) {
    case 4:
      // 4 pixels wide: four rows per iteration.
      do {
        sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
        a += a_stride << 2;
        b += b_stride << 2;
        y += 4;
      } while (y < height);
      sse = summary_all_avx2(&sum);
      break;
    case 8:
      // 8 pixels wide: two rows per iteration.
      do {
        sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
        a += a_stride << 1;
        b += b_stride << 1;
        y += 2;
      } while (y < height);
      sse = summary_all_avx2(&sum);
      break;
    case 16:
      // 16 pixels wide: load two rows, combine into one 256-bit vector,
      // widen by interleaving with zero, then square-and-accumulate.
      do {
        const __m128i v_a0 = xx_loadu_128(a);
        const __m128i v_a1 = xx_loadu_128(a + a_stride);
        const __m128i v_b0 = xx_loadu_128(b);
        const __m128i v_b1 = xx_loadu_128(b + b_stride);
        const __m256i v_a =
            _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
        const __m256i v_b =
            _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
        const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
        const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
        const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
        const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
        const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
        const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
        const __m256i temp =
            _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
                             _mm256_madd_epi16(v_bsub, v_bsub));
        sum = _mm256_add_epi32(sum, temp);
        a += a_stride << 1;
        b += b_stride << 1;
        y += 2;
      } while (y < height);
      sse = summary_all_avx2(&sum);
      break;
    case 32:
      // 32 pixels wide: one row per iteration.
      do {
        sse_w32_avx2(&sum, a, b);
        a += a_stride;
        b += b_stride;
        y += 1;
      } while (y < height);
      sse = summary_all_avx2(&sum);
      break;
    case 64:
      // 64 pixels wide: two 32-pixel chunks per row.
      do {
        sse_w32_avx2(&sum, a, b);
        sse_w32_avx2(&sum, a + 32, b + 32);
        a += a_stride;
        b += b_stride;
        y += 1;
      } while (y < height);
      sse = summary_all_avx2(&sum);
      break;
    case 128:
      // 128 pixels wide: four 32-pixel chunks per row.
      do {
        sse_w32_avx2(&sum, a, b);
        sse_w32_avx2(&sum, a + 32, b + 32);
        sse_w32_avx2(&sum, a + 64, b + 64);
        sse_w32_avx2(&sum, a + 96, b + 96);
        a += a_stride;
        b += b_stride;
        y += 1;
      } while (y < height);
      sse = summary_all_avx2(&sum);
      break;
    default:
      if ((width & 0x07) == 0) {
        // Other multiples of 8: sweep each pair of rows in 8-pixel steps.
        do {
          int i = 0;
          do {
            sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
            i += 8;
          } while (i < width);
          a += a_stride << 1;
          b += b_stride << 1;
          y += 2;
        } while (y < height);
      }
      else {
        // Width = 8k + 4: process four rows at a time — 8-pixel steps for
        // rows 0/1 and 2/3, then a 4x4 kernel for the trailing 4 columns.
        do {
          int i = 0;
          do {
            sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
            const uint8_t *a2 = a + i + (a_stride << 1);
            const uint8_t *b2 = b + i + (b_stride << 1);
            sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
            i += 8;
          } while (i + 4 < width);
          sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
          a += a_stride << 2;
          b += b_stride << 2;
          y += 4;
        } while (y < height);
      }
      sse = summary_all_avx2(&sum);
      break;
  }

  return sse;
}
2475 :
2476 :
2477 23060400 : static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
2478 : #if ARCH_X86_64
2479 : return (uint64_t)_mm_cvtsi128_si64(a);
2480 : #else
2481 : {
2482 : uint64_t tmp;
2483 23060400 : _mm_storel_epi64((__m128i *)&tmp, a);
2484 23060400 : return tmp;
2485 : }
2486 : #endif
2487 :
2488 : }
2489 23051000 : static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
2490 23051000 : const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
2491 23049100 : __m128i v_acc0_q = _mm_setzero_si128();
2492 23049100 : __m128i v_acc1_q = _mm_setzero_si128();
2493 :
2494 23049100 : const int16_t *const end = src + n;
2495 :
2496 23049100 : assert(n % 64 == 0);
2497 :
2498 107794000 : while (src < end) {
2499 84908900 : const __m128i v_val_0_w = xx_load_128(src);
2500 84893000 : const __m128i v_val_1_w = xx_load_128(src + 8);
2501 84857600 : const __m128i v_val_2_w = xx_load_128(src + 16);
2502 84817500 : const __m128i v_val_3_w = xx_load_128(src + 24);
2503 84783600 : const __m128i v_val_4_w = xx_load_128(src + 32);
2504 84763000 : const __m128i v_val_5_w = xx_load_128(src + 40);
2505 84753100 : const __m128i v_val_6_w = xx_load_128(src + 48);
2506 84744900 : const __m128i v_val_7_w = xx_load_128(src + 56);
2507 :
2508 84744500 : const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
2509 84744500 : const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
2510 84744500 : const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
2511 84744500 : const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
2512 84744500 : const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
2513 84744500 : const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
2514 84744500 : const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
2515 84744500 : const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
2516 :
2517 84744500 : const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
2518 84744500 : const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
2519 84744500 : const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
2520 84744500 : const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
2521 :
2522 84744500 : const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
2523 84744500 : const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
2524 :
2525 84744500 : const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
2526 :
2527 169489000 : v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
2528 84744500 : v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
2529 :
2530 84744500 : src += 64;
2531 : }
2532 :
2533 22884700 : v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
2534 22884700 : v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
2535 22884700 : return xx_cvtsi128_si64(v_acc0_q);
2536 : }
2537 :
// Sum of squares of n int16_t values.  Dispatches to the vectorized
// 64-multiple kernel where possible and falls back to the C reference
// for any remainder or for short inputs.
uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
    // Fast path: the whole input is vectorizable.
    if (n % 64 == 0)
        return aom_sum_squares_i16_64n_sse2(src, n);

    // Mixed path: vectorize the largest 64-multiple prefix, scalar tail.
    if (n > 64) {
        const uint32_t k = n & ~63u;
        return aom_sum_squares_i16_64n_sse2(src, k) +
               aom_sum_squares_i16_c(src + k, n - k);
    }

    // Short input: scalar only.
    return aom_sum_squares_i16_c(src, n);
}
2551 :
2552 :
2553 :
2554 :
/**
 * See av1_wedge_sign_from_residuals_c
 *
 * Computes acc = sum(m[i] * ds[i]) over N samples and returns 1 when the
 * sum exceeds 'limit', 0 otherwise.  ds holds int16 residual differences;
 * m holds mask weights (expected in [0, 64], per the overflow note below).
 */
int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
    int N, int64_t limit) {
    int64_t acc;
    __m256i v_acc0_d = _mm256_setzero_si256();

    // Input size limited to 8192 by the use of 32 bit accumulators and m
    // being between [0, 64]. Overflow might happen at larger sizes,
    // though it is practically impossible on real video input.
    assert(N < 8192);
    assert(N % 64 == 0);

    // Main loop: 64 samples per iteration.  Load 64 mask bytes and 64
    // residual words, zero-extend the mask bytes to 16 bits, and use
    // pmaddwd to accumulate products into eight 32-bit lanes.
    do {
        const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m));
        const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32));

        const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds));
        const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16));
        const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32));
        const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48));

        // Zero-extend mask bytes to words, 16 at a time (low then high
        // 128-bit half of each 256-bit load).
        const __m256i v_m0_w =
            _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b));
        const __m256i v_m1_w =
            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1));
        const __m256i v_m2_w =
            _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b));
        const __m256i v_m3_w =
            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1));

        // d * m with pairwise 32-bit accumulation.
        const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w);
        const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w);
        const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w);
        const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w);

        const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d);
        const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d);

        const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d);

        v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d);

        ds += 64;
        m += 64;

        N -= 64;
    } while (N);

    // Sign-extend the eight 32-bit partial sums to 64 bits: interleave each
    // lane with its arithmetic-shift sign word, then add the resulting
    // 64-bit halves pairwise.
    __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31);
    v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d),
        _mm256_unpackhi_epi32(v_acc0_d, v_sign_d));

    // Horizontal reduction of the four 64-bit sums to a single scalar.
    __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8));

    __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q);
    __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1);
    v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);

#if ARCH_X86_64
    acc = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
#else
    xx_storel_64(&acc, v_acc_q_0);
#endif

    return acc > limit;
}
2623 :
2624 :
2625 : /**
2626 : * av1_wedge_compute_delta_squares_c
2627 : */
2628 11531200 : void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a,
2629 : const int16_t *b, int N) {
2630 11531200 : const __m256i v_neg_w = _mm256_set1_epi32(0xffff0001);
2631 :
2632 11531200 : assert(N % 64 == 0);
2633 :
2634 : do {
2635 42469800 : const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a));
2636 42470000 : const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b));
2637 42470000 : const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16));
2638 42468100 : const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16));
2639 42466000 : const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32));
2640 42465400 : const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32));
2641 42466500 : const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48));
2642 84934500 : const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48));
2643 :
2644 42465700 : const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w);
2645 42465700 : const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w);
2646 42465700 : const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w);
2647 42465700 : const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w);
2648 42465700 : const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w);
2649 42465700 : const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w);
2650 42465700 : const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w);
2651 42465700 : const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w);
2652 :
2653 : // Negate top word of pairs
2654 42465700 : const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w);
2655 42465700 : const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w);
2656 42465700 : const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w);
2657 42465700 : const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w);
2658 42465700 : const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w);
2659 42465700 : const __m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w);
2660 42465700 : const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w);
2661 42465700 : const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w);
2662 :
2663 42465700 : const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w);
2664 42465700 : const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w);
2665 42465700 : const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w);
2666 42465700 : const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w);
2667 42465700 : const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w);
2668 42465700 : const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w);
2669 42465700 : const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w);
2670 42465700 : const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w);
2671 :
2672 42465700 : const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w);
2673 42465700 : const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w);
2674 42465700 : const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w);
2675 42465700 : const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w);
2676 :
2677 : _mm256_storeu_si256((__m256i *)(d), v_r0_w);
2678 42465700 : _mm256_storeu_si256((__m256i *)(d + 16), v_r1_w);
2679 42465700 : _mm256_storeu_si256((__m256i *)(d + 32), v_r2_w);
2680 42465700 : _mm256_storeu_si256((__m256i *)(d + 48), v_r3_w);
2681 :
2682 42465700 : a += 64;
2683 42465700 : b += 64;
2684 42465700 : d += 64;
2685 42465700 : N -= 64;
2686 42465700 : } while (N);
2687 11531200 : }
|