Line data Source code
1 : /*
2 : * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h>
13 : #include "EbDefinitions.h"
14 : #include "EbMemory_SSE4_1.h"
15 : #include "aom_dsp_rtcd.h"
16 : #include "convolve.h"
17 : #include "convolve_avx2.h"
18 : #include "synonyms.h"
19 :
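       : /*
       :  * 2D single-reference ("sr") convolution, implemented as two separable
       :  * passes: a horizontal pass filters the 8-bit source into a 16-bit
       :  * intermediate buffer (im_block, rounded by round_0), then a vertical
       :  * pass filters im_block down to the 8-bit destination (rounded by
       :  * round_1). Roughly, each output pixel follows this scalar model
       :  * (an illustrative sketch, not part of the build):
       :  *
       :  *   im[r][c]  = round(sum_k(fx[k] * src[r][c + k - off_x]), round_0);
       :  *   dst[r][c] = clamp8(round(sum_k(fy[k] * im[r + k - off_y][c]),
       :  *                            round_1));
       :  *
       :  * The helpers below are specialized by filter tap count (2/4/6/8) and
       :  * by block width.
       :  */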
20 : #if !OBMC_CONVOLVE
21 : static INLINE void xy_y_round_store_8x2_avx2(const __m256i res[2],
22 : uint8_t *const dst,
23 : const int32_t stride) {
24 : const __m256i r = xy_y_round_16_avx2(res);
25 : pack_store_8x2_avx2(r, dst, stride);
26 : }
27 :
28 : static INLINE void xy_y_round_store_16x2_avx2(const __m256i res[4],
29 : uint8_t *const dst,
30 : const int32_t stride) {
31 : const __m256i r0 = xy_y_round_16_avx2(res + 0);
32 : const __m256i r1 = xy_y_round_16_avx2(res + 2);
33 : xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
34 : }
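       : /* Horizontal 2-tap pass: widths up to 8 use 128-bit kernels and
       :  * process two rows per iteration; w >= 16 uses 256-bit kernels,
       :  * dropping to one row of 32-pixel chunks per iteration for w >= 32.
       :  * Every variant stores round_0-rounded 16-bit sums to im_block. */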
35 : static void convolve_2d_sr_hor_2tap_avx2(
36 : const uint8_t *const src, const int32_t src_stride, const int32_t w,
37 : const int32_t h, const InterpFilterParams *const filter_params_x,
38 : const int32_t subpel_x_q4, int16_t *const im_block) {
39 : const uint8_t *src_ptr = src;
40 : int32_t y = h;
41 : int16_t *im = im_block;
42 : __m128i coeffs_128[4];
43 : __m256i coeffs_256[4];
44 :
45 : if (w <= 8) {
46 : prepare_half_coeffs_2tap_ssse3(
47 : filter_params_x, subpel_x_q4, coeffs_128);
48 :
49 : if (w == 2) {
50 : do {
51 : const __m128i r =
52 : x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
53 : xy_x_round_store_2x2_sse2(r, im);
54 : src_ptr += 2 * src_stride;
55 : im += 2 * 2;
56 : y -= 2;
57 : } while (y);
58 : }
59 : else if (w == 4) {
60 : do {
61 : const __m128i r =
62 : x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
63 : xy_x_round_store_4x2_sse2(r, im);
64 : src_ptr += 2 * src_stride;
65 : im += 2 * 4;
66 : y -= 2;
67 : } while (y);
68 : }
69 : else {
70 : assert(w == 8);
71 :
72 : do {
73 : __m128i r[2];
74 :
75 : x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, r);
76 : xy_x_round_store_8x2_sse2(r, im);
77 : src_ptr += 2 * src_stride;
78 : im += 2 * 8;
79 : y -= 2;
80 : } while (y);
81 : }
82 : }
83 : else {
84 : prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
85 :
86 : if (w == 16) {
87 : do {
88 : __m256i r[2];
89 :
90 : x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
91 : xy_x_round_store_32_avx2(r, im);
92 : src_ptr += 2 * src_stride;
93 : im += 2 * 16;
94 : y -= 2;
95 : } while (y);
96 : }
97 : else if (w == 32) {
98 : do {
99 : xy_x_2tap_32_avx2(src_ptr, coeffs_256, im);
100 : src_ptr += src_stride;
101 : im += 32;
102 : } while (--y);
103 : }
104 : else if (w == 64) {
105 : do {
106 : xy_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, im + 0 * 32);
107 : xy_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, im + 1 * 32);
108 : src_ptr += src_stride;
109 : im += 64;
110 : } while (--y);
111 : }
112 : else {
113 : assert(w == 128);
114 :
115 : do {
116 : xy_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, im + 0 * 32);
117 : xy_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, im + 1 * 32);
118 : xy_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, im + 2 * 32);
119 : xy_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, im + 3 * 32);
120 : src_ptr += src_stride;
121 : im += 128;
122 : } while (--y);
123 : }
124 : }
125 : }
126 :
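       : /* The 4-tap horizontal path handles only w == 2 and w == 4, the small
       :  * block sizes for which AV1 selects a 4-tap filter; src is rebased by
       :  * -1 so the kernel is centered on the output pixel. */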
127 : static void convolve_2d_sr_hor_4tap_avx2(
128 : const uint8_t *const src, const int32_t src_stride, const int32_t w,
129 : const int32_t h, const InterpFilterParams *const filter_params_x,
130 : const int32_t subpel_x_q4, int16_t *const im_block) {
131 : const uint8_t *src_ptr = src - 1;
132 : int32_t y = h;
133 : int16_t *im = im_block;
134 : __m128i coeffs_128[4];
135 :
136 : prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
137 :
138 : if (w == 2) {
139 : do {
140 : const __m128i r =
141 : x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
142 : xy_x_round_store_2x2_sse2(r, im);
143 : src_ptr += 2 * src_stride;
144 : im += 2 * 2;
145 : y -= 2;
146 : } while (y);
147 : }
148 : else {
149 : assert(w == 4);
150 :
151 : do {
152 : const __m128i r =
153 : x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
154 : xy_x_round_store_4x2_sse2(r, im);
155 : src_ptr += 2 * src_stride;
156 : im += 2 * 4;
157 : y -= 2;
158 : } while (y);
159 : }
160 : }
161 :
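       : /* filt1..filt3_global_avx2 are byte-shuffle (pshufb) patterns that
       :  * gather the input pairs each maddubs step needs, so a single 32-byte
       :  * load can feed all three tap pairs of the 6-tap kernel; src is
       :  * rebased by -2 to center the kernel. */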
162 : static void convolve_2d_sr_hor_6tap_avx2(
163 : const uint8_t *const src, const int32_t src_stride, const int32_t w,
164 : const int32_t h, const InterpFilterParams *const filter_params_x,
165 : const int32_t subpel_x_q4, int16_t *const im_block) {
166 : const uint8_t *src_ptr = src - 2;
167 : int32_t y = h;
168 : int16_t *im = im_block;
169 : __m256i coeffs_256[4], filt_256[4];
170 :
171 : filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
172 : filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
173 : filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
174 :
175 : prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
176 :
177 : if (w == 8) {
178 : do {
179 : const __m256i res = x_convolve_6tap_8x2_avx2(
180 : src_ptr, src_stride, coeffs_256, filt_256);
181 : xy_x_round_store_8x2_avx2(res, im);
182 : src_ptr += 2 * src_stride;
183 : im += 2 * 8;
184 : y -= 2;
185 : } while (y);
186 : }
187 : else if (w == 16) {
188 : do {
189 : __m256i r[2];
190 :
191 : x_convolve_6tap_16x2_avx2(
192 : src_ptr, src_stride, coeffs_256, filt_256, r);
193 : xy_x_round_store_32_avx2(r, im);
194 : src_ptr += 2 * src_stride;
195 : im += 2 * 16;
196 : y -= 2;
197 : } while (y);
198 : }
199 : else if (w == 32) {
200 : do {
201 : xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
202 : src_ptr += src_stride;
203 : im += 32;
204 : } while (--y);
205 : }
206 : else if (w == 64) {
207 : do {
208 : xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
209 : xy_x_6tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
210 : src_ptr += src_stride;
211 : im += 64;
212 : } while (--y);
213 : }
214 : else {
215 : assert(w == 128);
216 :
217 : do {
218 : xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
219 : xy_x_6tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
220 : xy_x_6tap_32_avx2(src_ptr + 64, 16, coeffs_256, filt_256, im + 64);
221 : xy_x_6tap_32_avx2(src_ptr + 96, 16, coeffs_256, filt_256, im + 96);
222 : src_ptr += src_stride;
223 : im += 128;
224 : } while (--y);
225 : }
226 : }
227 :
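       : /* The 8-tap path mirrors the 6-tap one, adding a fourth shuffle
       :  * pattern and rebasing src by -3 to cover all eight taps. */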
228 : static void convolve_2d_sr_hor_8tap_avx2(
229 : const uint8_t *const src, const int32_t src_stride, const int32_t w,
230 : const int32_t h, const InterpFilterParams *const filter_params_x,
231 : const int32_t subpel_x_q4, int16_t *const im_block) {
232 : const uint8_t *src_ptr = src - 3;
233 : int32_t y = h;
234 : int16_t *im = im_block;
235 : __m256i coeffs_256[4], filt_256[4];
236 :
237 : filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
238 : filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
239 : filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
240 : filt_256[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
241 :
242 : prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
243 :
244 : if (w == 8) {
245 : do {
246 : const __m256i res = x_convolve_8tap_8x2_avx2(
247 : src_ptr, src_stride, coeffs_256, filt_256);
248 : xy_x_round_store_8x2_avx2(res, im);
249 : src_ptr += 2 * src_stride;
250 : im += 2 * 8;
251 : y -= 2;
252 : } while (y);
253 : }
254 : else if (w == 16) {
255 : do {
256 : __m256i r[2];
257 :
258 : x_convolve_8tap_16x2_avx2(
259 : src_ptr, src_stride, coeffs_256, filt_256, r);
260 : xy_x_round_store_32_avx2(r, im);
261 : src_ptr += 2 * src_stride;
262 : im += 2 * 16;
263 : y -= 2;
264 : } while (y);
265 : }
266 : else if (w == 32) {
267 : do {
268 : xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
269 : src_ptr += src_stride;
270 : im += 32;
271 : } while (--y);
272 : }
273 : else if (w == 64) {
274 : do {
275 : xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
276 : xy_x_8tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
277 : src_ptr += src_stride;
278 : im += 64;
279 : } while (--y);
280 : }
281 : else {
282 : assert(w == 128);
283 :
284 : do {
285 : xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
286 : xy_x_8tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
287 : xy_x_8tap_32_avx2(src_ptr + 64, 16, coeffs_256, filt_256, im + 64);
288 : xy_x_8tap_32_avx2(src_ptr + 96, 16, coeffs_256, filt_256, im + 96);
289 : src_ptr += src_stride;
290 : im += 128;
291 : } while (--y);
292 : }
293 : }
294 :
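       : /* Vertical passes: read 16-bit rows from im_block, filter across
       :  * rows, round (xy_y_round...), then pack and store 8-bit pixels. The
       :  * s_ and ss_ arrays hold a sliding window of previous rows so each
       :  * iteration only loads the new rows it needs. */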
295 : static void convolve_2d_sr_ver_2tap_avx2(uint8_t *dst, int32_t dst_stride,
296 : int32_t w, int32_t h,
297 : InterpFilterParams *filter_params_y,
298 : const int32_t subpel_y_q4,
299 : int16_t *im_block) {
300 : int32_t y = h;
301 : int16_t *im = im_block;
302 : __m128i coeffs_128[4];
303 : __m256i coeffs_256[4];
304 :
305 : if (w <= 4) {
306 : prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
307 :
308 : if (w == 2) {
309 : __m128i s_32[2];
310 :
311 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
312 :
313 : do {
314 : const __m128i res =
315 : xy_y_convolve_2tap_2x2_sse2(im, s_32, coeffs_128);
316 : xy_y_round_store_2x2_sse2(res, dst, dst_stride);
317 : im += 2 * 2;
318 : dst += 2 * dst_stride;
319 : y -= 2;
320 : } while (y);
321 : }
322 : else {
323 : __m128i s_64[2], r[2];
324 :
325 : assert(w == 4);
326 :
327 : s_64[0] = _mm_loadl_epi64((__m128i *)im);
328 :
329 : do {
330 : xy_y_convolve_2tap_4x2_sse2(im, s_64, coeffs_128, r);
331 : r[0] = xy_y_round_sse2(r[0]);
332 : r[1] = xy_y_round_sse2(r[1]);
333 : const __m128i rr = _mm_packs_epi32(r[0], r[1]);
334 : pack_store_4x2_sse2(rr, dst, dst_stride);
335 : im += 2 * 4;
336 : dst += 2 * dst_stride;
337 : y -= 2;
338 : } while (y);
339 : }
340 : }
341 : else {
342 : prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
343 :
344 : if (w == 8) {
345 : __m128i s_128[2];
346 : __m256i r[2];
347 :
348 : s_128[0] = _mm_load_si128((__m128i *)im);
349 :
350 : do {
351 : xy_y_convolve_2tap_8x2_avx2(im, s_128, coeffs_256, r);
352 : xy_y_round_store_8x2_avx2(r, dst, dst_stride);
353 : im += 2 * 8;
354 : dst += 2 * dst_stride;
355 : y -= 2;
356 : } while (y);
357 : }
358 : else if (w == 16) {
359 : __m256i s_256[2], r[4];
360 :
361 : s_256[0] = _mm256_load_si256((__m256i *)im);
362 :
363 : do {
364 : xy_y_convolve_2tap_16x2_avx2(im, s_256, coeffs_256, r);
365 : xy_y_round_store_16x2_avx2(r, dst, dst_stride);
366 : im += 2 * 16;
367 : dst += 2 * dst_stride;
368 : y -= 2;
369 : } while (y);
370 : }
371 : else if (w == 32) {
372 : __m256i s_256[2][2];
373 :
374 : s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
375 : s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
376 :
377 : do {
378 : xy_y_convolve_2tap_32_all_avx2(
379 : im + 32, s_256[0], s_256[1], coeffs_256, dst);
380 : xy_y_convolve_2tap_32_all_avx2(im + 2 * 32,
381 : s_256[1],
382 : s_256[0],
383 : coeffs_256,
384 : dst + dst_stride);
385 : im += 2 * 32;
386 : dst += 2 * dst_stride;
387 : y -= 2;
388 : } while (y);
389 : }
390 : else if (w == 64) {
391 : __m256i s_256[2][4];
392 :
393 : s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
394 : s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
395 : s_256[0][2] = _mm256_load_si256((__m256i *)(im + 2 * 16));
396 : s_256[0][3] = _mm256_load_si256((__m256i *)(im + 3 * 16));
397 :
398 : do {
399 : xy_y_convolve_2tap_32_all_avx2(
400 : im + 64, s_256[0] + 0, s_256[1] + 0, coeffs_256, dst);
401 : xy_y_convolve_2tap_32_all_avx2(
402 : im + 96, s_256[0] + 2, s_256[1] + 2, coeffs_256, dst + 32);
403 : im += 2 * 64;
404 : xy_y_convolve_2tap_32_all_avx2(im,
405 : s_256[1] + 0,
406 : s_256[0] + 0,
407 : coeffs_256,
408 : dst + dst_stride);
409 : xy_y_convolve_2tap_32_all_avx2(im + 32,
410 : s_256[1] + 2,
411 : s_256[0] + 2,
412 : coeffs_256,
413 : dst + dst_stride + 32);
414 : dst += 2 * dst_stride;
415 : y -= 2;
416 : } while (y);
417 : }
418 : else {
419 : __m256i s_256[2][8];
420 :
421 : assert(w == 128);
422 :
423 : load_16bit_8rows_avx2(im, 16, s_256[0]);
424 :
425 : do {
426 : xy_y_convolve_2tap_32_all_avx2(
427 : im + 128, s_256[0] + 0, s_256[1] + 0, coeffs_256, dst);
428 : xy_y_convolve_2tap_32_all_avx2(im + 160,
429 : s_256[0] + 2,
430 : s_256[1] + 2,
431 : coeffs_256,
432 : dst + 1 * 32);
433 : xy_y_convolve_2tap_32_all_avx2(im + 192,
434 : s_256[0] + 4,
435 : s_256[1] + 4,
436 : coeffs_256,
437 : dst + 2 * 32);
438 : xy_y_convolve_2tap_32_all_avx2(im + 224,
439 : s_256[0] + 6,
440 : s_256[1] + 6,
441 : coeffs_256,
442 : dst + 3 * 32);
443 : im += 2 * 128;
444 : xy_y_convolve_2tap_32_all_avx2(im,
445 : s_256[1] + 0,
446 : s_256[0] + 0,
447 : coeffs_256,
448 : dst + dst_stride);
449 : xy_y_convolve_2tap_32_all_avx2(im + 32,
450 : s_256[1] + 2,
451 : s_256[0] + 2,
452 : coeffs_256,
453 : dst + dst_stride + 1 * 32);
454 : xy_y_convolve_2tap_32_all_avx2(im + 64,
455 : s_256[1] + 4,
456 : s_256[0] + 4,
457 : coeffs_256,
458 : dst + dst_stride + 2 * 32);
459 : xy_y_convolve_2tap_32_all_avx2(im + 96,
460 : s_256[1] + 6,
461 : s_256[0] + 6,
462 : coeffs_256,
463 : dst + dst_stride + 3 * 32);
464 : dst += 2 * dst_stride;
465 : y -= 2;
466 : } while (y);
467 : }
468 : }
469 : }
470 :
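       : /* Half-pel special case: a 2-tap filter with two equal taps is just
       :  * an average of adjacent rows, so the multiplies are skipped and the
       :  * cheaper xy_y_round_half_pel shift is used; filter_params_y and
       :  * subpel_y_q4 are therefore unused. */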
471 : static void convolve_2d_sr_ver_2tap_half_avx2(
472 : uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h,
473 : InterpFilterParams *filter_params_y, const int32_t subpel_y_q4,
474 : int16_t *im_block) {
475 : int32_t y = h;
476 : int16_t *im = im_block;
477 :
478 : (void)filter_params_y;
479 : (void)subpel_y_q4;
480 :
481 : if (w == 2) {
482 : __m128i s_32[2];
483 :
484 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
485 :
486 : do {
487 : const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
488 : const __m128i r = xy_y_round_half_pel_sse2(res);
489 : pack_store_2x2_sse2(r, dst, dst_stride);
490 : im += 2 * 2;
491 : dst += 2 * dst_stride;
492 : y -= 2;
493 : } while (y);
494 : }
495 : else if (w == 4) {
496 : __m128i s_64[2];
497 :
498 : s_64[0] = _mm_loadl_epi64((__m128i *)im);
499 :
500 : do {
501 : const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
502 : const __m128i r = xy_y_round_half_pel_sse2(res);
503 : pack_store_4x2_sse2(r, dst, dst_stride);
504 : im += 2 * 4;
505 : dst += 2 * dst_stride;
506 : y -= 2;
507 : } while (y);
508 : }
509 : else if (w == 8) {
510 : __m128i s_128[2];
511 :
512 : s_128[0] = _mm_load_si128((__m128i *)im);
513 :
514 : do {
515 : const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
516 : const __m256i r = xy_y_round_half_pel_avx2(res);
517 : pack_store_8x2_avx2(r, dst, dst_stride);
518 : im += 2 * 8;
519 : dst += 2 * dst_stride;
520 : y -= 2;
521 : } while (y);
522 : }
523 : else if (w == 16) {
524 : __m256i s_256[2], r[2];
525 :
526 : s_256[0] = _mm256_load_si256((__m256i *)im);
527 :
528 : do {
529 : xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
530 : const __m256i r0 = xy_y_round_half_pel_avx2(r[0]);
531 : const __m256i r1 = xy_y_round_half_pel_avx2(r[1]);
532 : xy_y_pack_store_16x2_avx2(r0, r1, dst, dst_stride);
533 : im += 2 * 16;
534 : dst += 2 * dst_stride;
535 : y -= 2;
536 : } while (y);
537 : }
538 : else if (w == 32) {
539 : __m256i s_256[2][2];
540 :
541 : s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
542 : s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
543 :
544 : do {
545 : xy_y_convolve_2tap_half_pel_32_all_avx2(
546 : im + 32, s_256[0], s_256[1], dst);
547 : xy_y_convolve_2tap_half_pel_32_all_avx2(
548 : im + 2 * 32, s_256[1], s_256[0], dst + dst_stride);
549 : im += 2 * 32;
550 : dst += 2 * dst_stride;
551 : y -= 2;
552 : } while (y);
553 : }
554 : else if (w == 64) {
555 : __m256i s_256[2][4];
556 :
557 : s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
558 : s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
559 : s_256[0][2] = _mm256_load_si256((__m256i *)(im + 2 * 16));
560 : s_256[0][3] = _mm256_load_si256((__m256i *)(im + 3 * 16));
561 :
562 : do {
563 : xy_y_convolve_2tap_half_pel_32_all_avx2(
564 : im + 64, s_256[0] + 0, s_256[1] + 0, dst);
565 : xy_y_convolve_2tap_half_pel_32_all_avx2(
566 : im + 96, s_256[0] + 2, s_256[1] + 2, dst + 32);
567 : im += 2 * 64;
568 : xy_y_convolve_2tap_half_pel_32_all_avx2(
569 : im, s_256[1] + 0, s_256[0] + 0, dst + dst_stride);
570 : xy_y_convolve_2tap_half_pel_32_all_avx2(
571 : im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 32);
572 : dst += 2 * dst_stride;
573 : y -= 2;
574 : } while (y);
575 : }
576 : else {
577 : __m256i s_256[2][8];
578 :
579 : assert(w == 128);
580 :
581 : load_16bit_8rows_avx2(im, 16, s_256[0]);
582 :
583 : do {
584 : xy_y_convolve_2tap_half_pel_32_all_avx2(
585 : im + 128, s_256[0] + 0, s_256[1] + 0, dst);
586 : xy_y_convolve_2tap_half_pel_32_all_avx2(
587 : im + 160, s_256[0] + 2, s_256[1] + 2, dst + 1 * 32);
588 : xy_y_convolve_2tap_half_pel_32_all_avx2(
589 : im + 192, s_256[0] + 4, s_256[1] + 4, dst + 2 * 32);
590 : xy_y_convolve_2tap_half_pel_32_all_avx2(
591 : im + 224, s_256[0] + 6, s_256[1] + 6, dst + 3 * 32);
592 : im += 2 * 128;
593 : xy_y_convolve_2tap_half_pel_32_all_avx2(
594 : im, s_256[1] + 0, s_256[0] + 0, dst + dst_stride);
595 : xy_y_convolve_2tap_half_pel_32_all_avx2(
596 : im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 1 * 32);
597 : xy_y_convolve_2tap_half_pel_32_all_avx2(
598 : im + 64, s_256[1] + 4, s_256[0] + 4, dst + dst_stride + 2 * 32);
599 : xy_y_convolve_2tap_half_pel_32_all_avx2(
600 : im + 96, s_256[1] + 6, s_256[0] + 6, dst + dst_stride + 3 * 32);
601 : dst += 2 * dst_stride;
602 : y -= 2;
603 : } while (y);
604 : }
605 : }
606 :
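       : /* The longer vertical kernels interleave adjacent row pairs with
       :  * unpacklo/unpackhi so one madd per tap pair yields 32-bit sums of
       :  * f[k]*row[k] + f[k+1]*row[k+1]. Roughly (illustrative sketch):
       :  *
       :  *   ss[i] = interleave(row[2*i], row[2*i + 1]);  // 16-bit pairs
       :  *   acc   = sum_i(madd(ss[i], coeffs[i]));       // 32-bit sums
       :  *   out   = pack(round(acc, round_1));
       :  */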
607 : static void convolve_2d_sr_ver_4tap_avx2(uint8_t *dst, int32_t dst_stride,
608 : int32_t w, int32_t h,
609 : InterpFilterParams *filter_params_y,
610 : const int32_t subpel_y_q4,
611 : int16_t *im_block) {
612 : int32_t y = h;
613 : int16_t *im = im_block;
614 : __m128i coeffs_128[4];
615 : __m256i coeffs_256[4];
616 :
617 : if (w == 2) {
618 : __m128i s_32[4], ss_128[2];
619 :
620 : prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
621 :
622 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
623 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
624 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
625 :
626 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
627 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
628 :
629 : ss_128[0] = _mm_unpacklo_epi16(src01, src12);
630 :
631 : do {
632 : const __m128i res =
633 : xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
634 : xy_y_round_store_2x2_sse2(res, dst, dst_stride);
635 : im += 2 * 2;
636 : dst += 2 * dst_stride;
637 : y -= 2;
638 : } while (y);
639 : }
640 : else {
641 : prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
642 :
643 : if (w == 4) {
644 : __m128i s_64[4];
645 : __m256i s_256[2], ss_256[2];
646 :
647 : s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
648 : s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
649 : s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
650 :
651 : // Load lines a and b. Line a to lower 128, line b to upper 128
652 : s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
653 : s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
654 :
655 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
656 :
657 : do {
658 : const __m256i res =
659 : xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
660 : xy_y_round_store_4x2_avx2(res, dst, dst_stride);
661 : im += 2 * 4;
662 : dst += 2 * dst_stride;
663 : y -= 2;
664 : } while (y);
665 : }
666 : else if (w == 8) {
667 : __m256i s_256[4], r[2];
668 :
669 : s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
670 : s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
671 :
672 : if (subpel_y_q4 != 8) {
673 : __m256i ss_256[4];
674 :
675 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
676 : ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
677 :
678 : do {
679 : xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
680 : xy_y_round_store_8x2_avx2(r, dst, dst_stride);
681 : im += 2 * 8;
682 : dst += 2 * dst_stride;
683 : y -= 2;
684 : } while (y);
685 : }
686 : else {
687 : do {
688 : xy_y_convolve_4tap_8x2_half_pel_avx2(
689 : im, coeffs_256, s_256, r);
690 : xy_y_round_store_8x2_avx2(r, dst, dst_stride);
691 : im += 2 * 8;
692 : dst += 2 * dst_stride;
693 : y -= 2;
694 : } while (y);
695 : }
696 : }
697 : else if (w == 16) {
698 : __m256i s_256[5];
699 :
700 : s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
701 : s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
702 : s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
703 :
704 : if (subpel_y_q4 != 8) {
705 : __m256i ss_256[4], tt_256[4], r[4];
706 :
707 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
708 : ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
709 :
710 : tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
711 : tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
712 :
713 : do {
714 : xy_y_convolve_4tap_16x2_avx2(
715 : im, s_256, ss_256, tt_256, coeffs_256, r);
716 : xy_y_round_store_16x2_avx2(r, dst, dst_stride);
717 : im += 2 * 16;
718 : dst += 2 * dst_stride;
719 : y -= 2;
720 : } while (y);
721 : }
722 : else {
723 : __m256i r[4];
724 :
725 : do {
726 : xy_y_convolve_4tap_16x2_half_pelavx2(
727 : im, s_256, coeffs_256, r);
728 : xy_y_round_store_16x2_avx2(r, dst, dst_stride);
729 : im += 2 * 16;
730 : dst += 2 * dst_stride;
731 : y -= 2;
732 : } while (y);
733 : }
734 : }
735 : else {
736 : /* This is a special case for OBMC. According to the AV1 spec, the
737 : 4-tap filter is not used for width (w) > 16, but OBMC halves the
738 : block height when predicting from the above block; e.g., a 32x8
739 : block becomes 32x4, which is why wide blocks reach this path. */
740 : int32_t x = 0;
741 :
742 : assert(!(w % 32));
743 :
744 : __m256i s_256[2][4], ss_256[2][4], tt_256[2][4], r0[4], r1[4];
745 : do {
746 : const int16_t *s = im + x;
747 : uint8_t *d = dst + x;
748 :
749 : loadu_unpack_16bit_3rows_avx2(
750 : s, w, s_256[0], ss_256[0], tt_256[0]);
751 : loadu_unpack_16bit_3rows_avx2(
752 : s + 16, w, s_256[1], ss_256[1], tt_256[1]);
753 :
754 : y = h;
755 : do {
756 : xy_y_convolve_4tap_32x2_avx2(
757 : s, w, s_256[0], ss_256[0], tt_256[0], coeffs_256, r0);
758 : xy_y_convolve_4tap_32x2_avx2(s + 16, w, s_256[1],
759 : ss_256[1], tt_256[1], coeffs_256, r1);
760 :
761 : xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
762 : xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
763 :
764 : s += 2 * w;
765 : d += 2 * dst_stride;
766 : y -= 2;
767 : } while (y);
768 :
769 : x += 32;
770 : } while (x < w);
771 : }
772 : }
773 : }
774 :
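       : /* 6-tap: five rows are preloaded; ss_ holds the interleavings for
       :  * the even output row and tt_ those for the odd row, so each
       :  * iteration produces two output rows from two newly loaded rows. */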
775 : static void convolve_2d_sr_ver_6tap_avx2(uint8_t *dst, int32_t dst_stride,
776 : int32_t w, int32_t h,
777 : InterpFilterParams *filter_params_y,
778 : const int32_t subpel_y_q4,
779 : int16_t *im_block) {
780 : int32_t y;
781 : int16_t *im = im_block;
782 : __m128i coeffs_128[4];
783 : __m256i coeffs_256[4];
784 :
785 : if (w == 2) {
786 : __m128i s_32[6], ss_128[3];
787 :
788 : prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
789 :
790 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
791 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
792 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
793 : s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
794 : s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
795 :
796 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
797 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
798 : const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
799 : const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
800 :
801 : ss_128[0] = _mm_unpacklo_epi16(src01, src12);
802 : ss_128[1] = _mm_unpacklo_epi16(src23, src34);
803 :
804 : y = h;
805 : do {
806 : const __m128i res =
807 : xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
808 : xy_y_round_store_2x2_sse2(res, dst, dst_stride);
809 : im += 2 * 2;
810 : dst += 2 * dst_stride;
811 : y -= 2;
812 : } while (y);
813 : }
814 : else {
815 : prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
816 :
817 : if (w == 4) {
818 : __m128i s_64[6];
819 : __m256i s_256[6], ss_256[3];
820 :
821 : s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
822 : s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
823 : s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
824 : s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
825 : s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
826 :
827 : // Load lines a and b. Line a to lower 128, line b to upper 128
828 : s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
829 : s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
830 : s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
831 : s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
832 :
833 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
834 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
835 :
836 : y = h;
837 : do {
838 : const __m256i res =
839 : xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
840 : xy_y_round_store_4x2_avx2(res, dst, dst_stride);
841 : im += 2 * 4;
842 : dst += 2 * dst_stride;
843 : y -= 2;
844 : } while (y);
845 : }
846 : else if (w == 8) {
847 : __m256i s_256[6], r[2];
848 :
849 : s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
850 : s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
851 : s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
852 : s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
853 : y = h;
854 :
855 : if (subpel_y_q4 != 8) {
856 : __m256i ss_256[6];
857 :
858 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
859 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
860 :
861 : ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
862 : ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
863 :
864 : do {
865 : xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
866 : xy_y_round_store_8x2_avx2(r, dst, dst_stride);
867 : im += 2 * 8;
868 : dst += 2 * dst_stride;
869 : y -= 2;
870 : } while (y);
871 : }
872 : else {
873 : do {
874 : xy_y_convolve_6tap_8x2_half_pel_avx2(
875 : im, coeffs_256, s_256, r);
876 : xy_y_round_store_8x2_avx2(r, dst, dst_stride);
877 : im += 2 * 8;
878 : dst += 2 * dst_stride;
879 : y -= 2;
880 : } while (y);
881 : }
882 : }
883 : else if (w == 16) {
884 : __m256i s_256[6];
885 :
886 : s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
887 : s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
888 : s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
889 : s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
890 : s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
891 : y = h;
892 :
893 : if (subpel_y_q4 != 8) {
894 : __m256i ss_256[6], tt_256[6], r[4];
895 :
896 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
897 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
898 : ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
899 : ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
900 :
901 : tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
902 : tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
903 : tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
904 : tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
905 :
906 : do {
907 : xy_y_convolve_6tap_16x2_avx2(
908 : im, 16, s_256, ss_256, tt_256, coeffs_256, r);
909 : xy_y_round_store_16x2_avx2(r, dst, dst_stride);
910 : im += 2 * 16;
911 : dst += 2 * dst_stride;
912 : y -= 2;
913 : } while (y);
914 : }
915 : else {
916 : __m256i ss_256[4], r[4];
917 :
918 : do {
919 : xy_y_convolve_6tap_16x2_half_pel_avx2(
920 : im, 16, s_256, ss_256, coeffs_256, r);
921 : xy_y_round_store_16x2_avx2(r, dst, dst_stride);
922 :
923 : im += 2 * 16;
924 : dst += 2 * dst_stride;
925 : y -= 2;
926 : } while (y);
927 : }
928 : }
929 : else {
930 : int32_t x = 0;
931 :
932 : assert(!(w % 32));
933 :
934 : __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];
935 :
936 : do {
937 : const int16_t *s = im + x;
938 : uint8_t *d = dst + x;
939 :
940 : loadu_unpack_16bit_5rows_avx2(
941 : s, w, s_256[0], ss_256[0], tt_256[0]);
942 : loadu_unpack_16bit_5rows_avx2(
943 : s + 16, w, s_256[1], ss_256[1], tt_256[1]);
944 :
945 : y = h;
946 : do {
947 : xy_y_convolve_6tap_16x2_avx2(
948 : s, w, s_256[0], ss_256[0], tt_256[0], coeffs_256, r0);
949 : xy_y_convolve_6tap_16x2_avx2(s + 16,
950 : w,
951 : s_256[1],
952 : ss_256[1],
953 : tt_256[1],
954 : coeffs_256,
955 : r1);
956 :
957 : xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
958 : xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
959 :
960 : s += 2 * w;
961 : d += 2 * dst_stride;
962 : y -= 2;
963 : } while (y);
964 :
965 : x += 32;
966 : } while (x < w);
967 : }
968 : }
969 : }
970 :
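       : /* 8-tap: seven rows are preloaded and convolve_8tap_unapck_avx2
       :  * builds the interleaved pairs; as in the 6-tap case, ss_ and tt_
       :  * serve the even and odd output rows. */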
971 : static void convolve_2d_sr_ver_8tap_avx2(uint8_t *dst, int32_t dst_stride,
972 : int32_t w, int32_t h,
973 : InterpFilterParams *filter_params_y,
974 : const int32_t subpel_y_q4,
975 : int16_t *im_block) {
976 : int32_t y;
977 : int16_t *im = im_block;
978 : __m128i coeffs_128[4];
979 : __m256i coeffs_256[4];
980 :
981 : if (w == 2) {
982 : __m128i s_32[8], ss_128[4];
983 :
984 : prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
985 :
986 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
987 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
988 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
989 : s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
990 : s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
991 : s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
992 : s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));
993 :
994 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
995 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
996 : const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
997 : const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
998 : const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
999 : const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
1000 :
1001 : ss_128[0] = _mm_unpacklo_epi16(src01, src12);
1002 : ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1003 : ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1004 :
1005 : y = h;
1006 : do {
1007 : const __m128i res =
1008 : xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
1009 : xy_y_round_store_2x2_sse2(res, dst, dst_stride);
1010 : im += 2 * 2;
1011 : dst += 2 * dst_stride;
1012 : y -= 2;
1013 : } while (y);
1014 : }
1015 : else {
1016 : prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1017 :
1018 : if (w == 4) {
1019 : __m128i s_64[8];
1020 : __m256i s_256[8], ss_256[4];
1021 :
1022 : s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
1023 : s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
1024 : s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
1025 : s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
1026 : s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
1027 : s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
1028 : s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));
1029 :
1030 : // Load lines a and b. Line a to lower 128, line b to upper 128
1031 : s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1032 : s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
1033 : s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
1034 : s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
1035 : s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
1036 : s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);
1037 :
1038 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1039 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1040 : ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1041 :
1042 : y = h;
1043 : do {
1044 : const __m256i res =
1045 : xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
1046 : xy_y_round_store_4x2_avx2(res, dst, dst_stride);
1047 : im += 2 * 4;
1048 : dst += 2 * dst_stride;
1049 : y -= 2;
1050 : } while (y);
1051 : }
1052 : else if (w == 8) {
1053 : __m256i s_256[8], r[2];
1054 :
1055 : s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
1056 : s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
1057 : s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
1058 : s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
1059 : s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
1060 : s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
1061 : y = h;
1062 :
1063 : if (subpel_y_q4 != 8) {
1064 : __m256i ss_256[8];
1065 :
1066 : convolve_8tap_unapck_avx2(s_256, ss_256);
1067 :
1068 : do {
1069 : xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
1070 : xy_y_round_store_8x2_avx2(r, dst, dst_stride);
1071 : im += 2 * 8;
1072 : dst += 2 * dst_stride;
1073 : y -= 2;
1074 : } while (y);
1075 : }
1076 : else {
1077 : do {
1078 : xy_y_convolve_8tap_8x2_half_pel_avx2(
1079 : im, coeffs_256, s_256, r);
1080 : xy_y_round_store_8x2_avx2(r, dst, dst_stride);
1081 : im += 2 * 8;
1082 : dst += 2 * dst_stride;
1083 : y -= 2;
1084 : } while (y);
1085 : }
1086 : }
1087 : else if (w == 16) {
1088 : __m256i s_256[8], r[4];
1089 :
1090 : load_16bit_7rows_avx2(im, 16, s_256);
1091 : y = h;
1092 :
1093 : if (subpel_y_q4 != 8) {
1094 : __m256i ss_256[8], tt_256[8];
1095 :
1096 : convolve_8tap_unapck_avx2(s_256, ss_256);
1097 : convolve_8tap_unapck_avx2(s_256 + 1, tt_256);
1098 :
1099 : do {
1100 : xy_y_convolve_8tap_16x2_avx2(
1101 : im, 16, coeffs_256, s_256, ss_256, tt_256, r);
1102 : xy_y_round_store_16x2_avx2(r, dst, dst_stride);
1103 :
1104 : im += 2 * 16;
1105 : dst += 2 * dst_stride;
1106 : y -= 2;
1107 : } while (y);
1108 : }
1109 : else {
1110 : do {
1111 : xy_y_convolve_8tap_16x2_half_pel_avx2(
1112 : im, 16, coeffs_256, s_256, r);
1113 : xy_y_round_store_16x2_avx2(r, dst, dst_stride);
1114 :
1115 : im += 2 * 16;
1116 : dst += 2 * dst_stride;
1117 : y -= 2;
1118 : } while (y);
1119 : }
1120 : }
1121 : else {
1122 : int32_t x = 0;
1123 : __m256i s_256[2][8], r0[4], r1[4];
1124 :
1125 : assert(!(w % 32));
1126 :
1127 : __m256i ss_256[2][8], tt_256[2][8];
1128 :
1129 : do {
1130 : const int16_t *s = im + x;
1131 : uint8_t *d = dst + x;
1132 :
1133 : load_16bit_7rows_avx2(s, w, s_256[0]);
1134 : convolve_8tap_unapck_avx2(s_256[0], ss_256[0]);
1135 : convolve_8tap_unapck_avx2(s_256[0] + 1, tt_256[0]);
1136 :
1137 : load_16bit_7rows_avx2(s + 16, w, s_256[1]);
1138 : convolve_8tap_unapck_avx2(s_256[1], ss_256[1]);
1139 : convolve_8tap_unapck_avx2(s_256[1] + 1, tt_256[1]);
1140 :
1141 : y = h;
1142 : do {
1143 : xy_y_convolve_8tap_16x2_avx2(
1144 : s, w, coeffs_256, s_256[0], ss_256[0], tt_256[0], r0);
1145 : xy_y_convolve_8tap_16x2_avx2(s + 16,
1146 : w,
1147 : coeffs_256,
1148 : s_256[1],
1149 : ss_256[1],
1150 : tt_256[1],
1151 : r1);
1152 : xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
1153 : xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
1154 :
1155 : s += 2 * w;
1156 : d += 2 * dst_stride;
1157 : y -= 2;
1158 : } while (y);
1159 :
1160 : x += 32;
1161 : } while (x < w);
1162 : }
1163 : }
1164 : }
1165 :
1166 : typedef void (*convolve_2d_sr_hor_tap_func)(
1167 : const uint8_t *const src, const int32_t src_stride, const int32_t w,
1168 : const int32_t h, const InterpFilterParams *const filter_params_x,
1169 : const int32_t subpel_x_q4, int16_t *const im_block);
1170 :
1171 : typedef void (*convolve_2d_sr_ver_tap_func)(uint8_t *dst, int32_t dst_stride,
1172 : int32_t w, int32_t h,
1173 : InterpFilterParams *filter_params_y,
1174 : const int32_t subpel_y_q4,
1175 : int16_t *im_block);
1176 : #endif
1177 : #if OBMC_CONVOLVE
1178 :
1179 155088000 : void eb_av1_convolve_2d_sr_avx2(const uint8_t *src, int32_t src_stride,
1180 : uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h,
1181 : InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y,
1182 : const int32_t subpel_x_qn, const int32_t subpel_y_qn,
1183 : ConvolveParams *conv_params) {
1184 155088000 : const int32_t bd = 8;
1185 155088000 : const int32_t h_tap = get_convolve_tap(filter_params_x->filter_ptr);
1186 154954000 : const int32_t v_tap = get_convolve_tap(filter_params_y->filter_ptr);
1187 155330000 : int32_t im_stride = 8;
1188 : int32_t i;
1189 : DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
1190 155330000 : const int32_t bits =
1191 155330000 : FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
1192 155330000 : const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
1193 :
1194 155330000 : assert(conv_params->round_0 > 0);
1195 :
1196 310660000 : const __m256i round_const_h = _mm256_set1_epi16(
1197 155330000 : ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
1198 155330000 : const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
1199 :
1200 310660000 : const __m256i sum_round_v = _mm256_set1_epi32(
1201 155330000 : (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
1202 155330000 : const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
1203 :
1204 155330000 : const __m256i round_const_v = _mm256_set1_epi32(
1205 155330000 : ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
1206 155330000 : ((1 << (offset_bits - conv_params->round_1)) >> 1));
1207 155330000 : const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
1208 :
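       : /* Worked example: with bd = 8, FILTER_BITS = 7, round_0 = 3 and
       :  * round_1 = 11 (the values the #else variant below asserts),
       :  * offset_bits = 19 and bits = 0. The horizontal offset keeps the
       :  * 16-bit intermediates non-negative, and round_const_v subtracts
       :  * that offset back out before the final shift. */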
1209 : __m256i filt[4], coeffs_h[4], coeffs_v[4];
1210 :
1211 155330000 : filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
1212 155330000 : filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
1213 :
1214 155330000 : prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_qn, coeffs_h);
1215 155783000 : prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_qn, coeffs_v);
1216 :
1217 155622000 : if (h_tap == 2) {
1218 20136600 : int32_t im_h = h + filter_params_y->taps - 1;
1219 20136600 : const int32_t fo_vert = filter_params_y->taps / 2 - 1;
1220 20136600 : const int32_t fo_horiz = 0;
1221 20136600 : const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
1222 :
1223 20136600 : prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_qn, coeffs_h);
1224 :
1225 20147300 : if (v_tap == 2) {
1226 20147300 : const int16_t *const t_block = im_block + 3 * im_stride;
1227 20147300 : prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_qn, coeffs_v);
1228 59977100 : for (int32_t j = 0; j < w; j += 8) {
1229 2634610000 : CONVOLVE_SR_HORIZONTAL_FILTER_2TAP;
1230 5943740000 : CONVOLVE_SR_VERTICAL_FILTER_2TAP;
1231 : }
1232 : }
1233 0 : else if (v_tap == 4) {
1234 0 : const int16_t *const t_block = im_block + 2 * im_stride;
1235 0 : for (int32_t j = 0; j < w; j += 8) {
1236 0 : CONVOLVE_SR_HORIZONTAL_FILTER_2TAP;
1237 0 : CONVOLVE_SR_VERTICAL_FILTER_4TAP;
1238 : }
1239 : }
1240 : else {
1241 0 : const int16_t *const t_block = im_block;
1242 0 : for (int32_t j = 0; j < w; j += 8) {
1243 0 : CONVOLVE_SR_HORIZONTAL_FILTER_2TAP;
1244 0 : CONVOLVE_SR_VERTICAL_FILTER_8TAP;
1245 : }
1246 : }
1247 : }
1248 135486000 : else if (v_tap == 2) {
1249 0 : int32_t im_h = h + 3;
1250 0 : const int32_t fo_vert = 0;
1251 0 : const int16_t *const t_block = im_block;
1252 :
1253 0 : prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_qn, coeffs_v);
1254 0 : filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
1255 0 : filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
1256 :
1257 0 : if (h_tap == 4) {
1258 0 : const int32_t fo_horiz = 1;
1259 0 : const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
1260 0 : for (int32_t j = 0; j < w; j += 8) {
1261 0 : CONVOLVE_SR_HORIZONTAL_FILTER_4TAP;
1262 0 : CONVOLVE_SR_VERTICAL_FILTER_2TAP;
1263 : }
1264 : }
1265 : else {
1266 0 : const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
1267 0 : const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
1268 0 : for (int32_t j = 0; j < w; j += 8) {
1269 0 : CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
1270 0 : CONVOLVE_SR_VERTICAL_FILTER_2TAP;
1271 : }
1272 : }
1273 : }
1274 135486000 : else if (h_tap == 4) {
1275 13616900 : int32_t im_h = h + filter_params_y->taps - 1;
1276 13616900 : const int32_t fo_vert = filter_params_y->taps / 2 - 1;
1277 13616900 : const int32_t fo_horiz = 1;
1278 13616900 : const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
1279 13616900 : const int16_t *const t_block = im_block;
1280 :
1281 27211000 : for (int32_t j = 0; j < w; j += 8) {
1282 646396000 : CONVOLVE_SR_HORIZONTAL_FILTER_4TAP;
1283 1291200000 : CONVOLVE_SR_VERTICAL_FILTER_8TAP;
1284 : }
1285 : }
1286 121869000 : else if (v_tap == 4) {
1287 10999300 : int32_t im_h = h + 3;
1288 10999300 : const int32_t fo_vert = 1;
1289 10999300 : const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
1290 10999300 : const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
1291 10999300 : const int16_t *const t_block = im_block;
1292 :
1293 10999300 : filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
1294 10999300 : filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
1295 :
1296 28684500 : for (int32_t j = 0; j < w; j += 8) {
1297 352315000 : CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
1298 703226000 : CONVOLVE_SR_VERTICAL_FILTER_4TAP;
1299 : }
1300 : }
1301 : else {
1302 : int32_t j;
1303 110869000 : int32_t im_h = h + filter_params_y->taps - 1;
1304 110869000 : const int32_t fo_vert = filter_params_y->taps / 2 - 1;
1305 110869000 : const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
1306 110869000 : const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
1307 110869000 : const int16_t *const t_block = im_block;
1308 :
1309 110869000 : filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
1310 110869000 : filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
1311 :
1312 377468000 : for (j = 0; j < w; j += 8) {
1313 15005000000 : CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
1314 37303400000 : CONVOLVE_SR_VERTICAL_FILTER_8TAP;
1315 : }
1316 : }
1317 188210000 : }
1318 : #else
1319 : void eb_av1_convolve_2d_sr_avx2(const uint8_t *src, int32_t src_stride,
1320 : uint8_t *dst, int32_t dst_stride, int32_t w,
1321 : int32_t h, InterpFilterParams *filter_params_x,
1322 : InterpFilterParams *filter_params_y,
1323 : const int32_t subpel_x_q4,
1324 : const int32_t subpel_y_q4,
1325 : ConvolveParams *conv_params) {
1326 : static const convolve_2d_sr_hor_tap_func
1327 : convolve_2d_sr_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
1328 : NULL,
1329 : NULL,
1330 : convolve_2d_sr_hor_2tap_avx2,
1331 : NULL,
1332 : convolve_2d_sr_hor_4tap_avx2,
1333 : NULL,
1334 : convolve_2d_sr_hor_6tap_avx2,
1335 : NULL,
1336 : convolve_2d_sr_hor_8tap_avx2 };
1337 : static const convolve_2d_sr_ver_tap_func
1338 : convolve_2d_sr_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
1339 : NULL,
1340 : convolve_2d_sr_ver_2tap_half_avx2,
1341 : convolve_2d_sr_ver_2tap_avx2,
1342 : convolve_2d_sr_ver_4tap_avx2,
1343 : convolve_2d_sr_ver_4tap_avx2,
1344 : convolve_2d_sr_ver_6tap_avx2,
1345 : convolve_2d_sr_ver_6tap_avx2,
1346 : convolve_2d_sr_ver_8tap_avx2,
1347 : convolve_2d_sr_ver_8tap_avx2 };
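       : /* The vertical table is indexed with tap_y - (subpel_y_q4 == 8):
       :  * index 1 picks the cheap 2-tap half-pel averaging kernel, while the
       :  * 4/6/8-tap entries are duplicated at the odd indices because those
       :  * kernels handle the half-pel case internally. */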
1348 : const int32_t tap_x = get_convolve_tap(filter_params_x->filter_ptr);
1349 : const int32_t tap_y = get_convolve_tap(filter_params_y->filter_ptr);
1350 : const uint8_t *src_ptr =
1351 : src + ((MAX_FILTER_TAP - tap_y) / 2 - 3) * src_stride;
1352 : // Note: im_block is 8-pixel interleaved for width 32 and up, to avoid
1353 : // data permutation.
1354 : DECLARE_ALIGNED(
1355 : 32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
1356 :
1357 : (void)conv_params;
1358 :
1359 : assert(conv_params->round_0 == 3);
1360 : assert(conv_params->round_1 == 11);
1361 :
1362 : // horizontal filter
1363 :
1364 : // Small widths need one extra intermediate row, since their loops
1365 : // compute two output rows per iteration.
1366 : const int32_t hh = h + tap_y - (w >= 32);
1367 :
1368 : convolve_2d_sr_hor_tap_func_table[tap_x](
1369 : src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);
1370 :
1371 : // vertical filter
1372 : convolve_2d_sr_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
1373 : dst, dst_stride, w, h, filter_params_y, subpel_y_q4, im_block);
1374 : }
1375 : #endif
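       : /* Copy path: used when the convolution degenerates to a pure copy
       :  * (no subpel filtering); each width-specialized branch copies two
       :  * rows per iteration. */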
1376 0 : static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
1377 : __m256i s[4];
1378 0 : s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
1379 0 : s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
1380 0 : s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
1381 0 : s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
1382 0 : _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
1383 0 : _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
1384 0 : _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
1385 0 : _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
1386 0 : }
1387 :
1388 8688620 : void eb_av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int32_t src_stride,
1389 : uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h,
1390 : InterpFilterParams *filter_params_x,
1391 : InterpFilterParams *filter_params_y,
1392 : const int32_t subpel_x_q4, const int32_t subpel_y_q4,
1393 : ConvolveParams *conv_params) {
1394 : (void)filter_params_x;
1395 : (void)filter_params_y;
1396 : (void)subpel_x_q4;
1397 : (void)subpel_y_q4;
1398 : (void)conv_params;
1399 :
1400 8688620 : if (w == 2) {
1401 : do {
1402 9771 : memcpy(dst, src, 2 * sizeof(*src));
1403 9771 : src += src_stride;
1404 9771 : dst += dst_stride;
1405 9771 : memcpy(dst, src, 2 * sizeof(*src));
1406 9771 : src += src_stride;
1407 9771 : dst += dst_stride;
1408 9771 : h -= 2;
1409 9771 : } while (h);
1410 : }
1411 8685260 : else if (w == 4) {
1412 : do {
1413 3385780 : memcpy(dst, src, 4 * sizeof(*src));
1414 3385780 : src += src_stride;
1415 3385780 : dst += dst_stride;
1416 3385780 : memcpy(dst, src, 4 * sizeof(*src));
1417 3385780 : src += src_stride;
1418 3385780 : dst += dst_stride;
1419 3385780 : h -= 2;
1420 3385780 : } while (h);
1421 : }
1422 7812980 : else if (w == 8) {
1423 : do {
1424 : __m128i s[2];
1425 21817300 : s[0] = _mm_loadl_epi64((__m128i *)src);
1426 21817300 : src += src_stride;
1427 21817300 : s[1] = _mm_loadl_epi64((__m128i *)src);
1428 21817300 : src += src_stride;
1429 21817300 : _mm_storel_epi64((__m128i *)dst, s[0]);
1430 21817300 : dst += dst_stride;
1431 21817300 : _mm_storel_epi64((__m128i *)dst, s[1]);
1432 21817300 : dst += dst_stride;
1433 21817300 : h -= 2;
1434 21817300 : } while (h);
1435 : }
1436 4262830 : else if (w == 16) {
1437 : do {
1438 : __m128i s[2];
1439 23695300 : s[0] = _mm_loadu_si128((__m128i *)src);
1440 23695300 : src += src_stride;
1441 23695300 : s[1] = _mm_loadu_si128((__m128i *)src);
1442 23695300 : src += src_stride;
1443 23695300 : _mm_storeu_si128((__m128i *)dst, s[0]);
1444 23695300 : dst += dst_stride;
1445 23695300 : _mm_storeu_si128((__m128i *)dst, s[1]);
1446 23695300 : dst += dst_stride;
1447 23695300 : h -= 2;
1448 23695300 : } while (h);
1449 : }
1450 1626020 : else if (w == 32) {
1451 : do {
1452 : __m256i s[2];
1453 14931300 : s[0] = _mm256_loadu_si256((__m256i *)src);
1454 14931300 : src += src_stride;
1455 14931300 : s[1] = _mm256_loadu_si256((__m256i *)src);
1456 14931300 : src += src_stride;
1457 14931300 : _mm256_storeu_si256((__m256i *)dst, s[0]);
1458 14931300 : dst += dst_stride;
1459 14931300 : _mm256_storeu_si256((__m256i *)dst, s[1]);
1460 14931300 : dst += dst_stride;
1461 14931300 : h -= 2;
1462 14931300 : } while (h);
1463 : }
1464 365102 : else if (w == 64) {
1465 : do {
1466 : __m256i s[4];
1467 6325020 : s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
1468 6325020 : s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
1469 6325020 : src += src_stride;
1470 6325020 : s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
1471 6325020 : s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
1472 6325020 : src += src_stride;
1473 6325020 : _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
1474 6325020 : _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
1475 6325020 : dst += dst_stride;
1476 6325020 : _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
1477 6325020 : _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
1478 6325020 : dst += dst_stride;
1479 6325020 : h -= 2;
1480 6325020 : } while (h);
1481 : }
1482 : else {
1483 : do {
1484 0 : copy_128(src, dst);
1485 0 : src += src_stride;
1486 0 : dst += dst_stride;
1487 0 : copy_128(src, dst);
1488 0 : src += src_stride;
1489 0 : dst += dst_stride;
1490 0 : h -= 2;
1491 0 : } while (h);
1492 : }
1493 8688620 : }
|