Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h>
13 : #include "aom_dsp_rtcd.h"
14 : #include "convolve.h"
15 : #include "convolve_avx2.h"
16 : #include "EbDefinitions.h"
17 : #include "EbMemory_SSE4_1.h"
18 :
/*
 * One row step of the 2-tap vertical path for the "comp avg" store mode.
 *
 * Calls y_convolve_2tap_32_avx2() with the carried row s0 and the output
 * slot s1, then passes the two result vectors r[0..1] together with factor
 * and offset to jnt_comp_avg_round_store_32_avx2(), which receives both
 * dst and dst8.
 *
 * Callers in this listing invoke it once per 32-column segment (they step
 * src/dst/dst8 by 32 between calls) and feed the vector written through s1
 * back in as s0 on the following row.
 * NOTE(review): presumably s1 receives the row loaded from src - confirm
 * against y_convolve_2tap_32_avx2(), which is defined elsewhere.
 */
19 : SIMD_INLINE void jnt_y_comp_avg_2tap_32_avx2(
20 : const uint8_t *const src, const __m256i *const coeffs, const __m256i factor,
21 : const __m256i offset, const __m256i s0, __m256i *const s1,
22 : ConvBufType *const dst, uint8_t *const dst8) {
23 : __m256i r[2];
24 :
25 22440900 : y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
26 : jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
27 22440700 : }
28 :
/*
 * One row step of the 2-tap vertical path for the plain "avg" store mode.
 *
 * Same shape as the comp-avg variant but without a weighting factor: the
 * two vectors produced by y_convolve_2tap_32_avx2() go to
 * jnt_avg_round_store_32_avx2() with offset, dst and dst8. dst is
 * const-qualified here, so this function does not write through it
 * directly; dst8 is passed as a writable pointer.
 */
29 22427700 : static INLINE void jnt_y_avg_2tap_32_avx2(const uint8_t *const src,
30 : const __m256i *const coeffs,
31 : const __m256i offset,
32 : const __m256i s0, __m256i *const s1,
33 : const ConvBufType *const dst,
34 : uint8_t *const dst8) {
35 : __m256i r[2];
36 :
37 22427700 : y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
38 22427800 : jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
39 22426700 : }
40 :
/*
 * One row step of the 2-tap vertical path for the "no avg" store mode.
 *
 * Runs y_convolve_2tap_32_avx2() and forwards the two result vectors to
 * jnt_no_avg_round_store_32_avx2() with offset and dst. There is no dst8
 * parameter in this mode; only the ConvBufType buffer is passed on.
 */
41 102196000 : static INLINE void jnt_y_no_avg_2tap_32_avx2(
42 : const uint8_t *const src, const __m256i *const coeffs, const __m256i offset,
43 : const __m256i s0, __m256i *const s1, ConvBufType *const dst) {
44 : __m256i r[2];
45 :
46 102196000 : y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
47 102192000 : jnt_no_avg_round_store_32_avx2(r, offset, dst);
48 102184000 : }
49 :
/*
 * Vertical 2-tap compound convolution.
 *
 * Dispatches first on block width w:
 *   w == 2, w == 4         : 128-bit helpers (coeffs_128)
 *   w == 8, 16, 32, 64     : 256-bit helpers (coeffs_256)
 *   anything else          : asserted to be 128, four 32-column segments
 * and, inside each width, on the store mode:
 *   do_average && use_jnt_comp_avg : jnt_comp_avg_* helpers (factor, dst, dst8)
 *   do_average only                : jnt_avg_* helpers (dst, dst8)
 *   otherwise                      : jnt_no_avg_* helpers (dst only)
 *
 * The intermediate buffer and its stride come from conv_params->dst and
 * conv_params->dst_stride; dst8 / dst8_stride are only touched in the two
 * averaging modes.
 *
 * Every loop processes two rows per iteration and stops when y reaches
 * exactly 0, so h has to be a positive even number.
 */
50 20259700 : static void jnt_convolve_y_2tap_avx2(
51 : const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
52 : const int32_t dst8_stride, const int32_t w, const int32_t h,
53 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
54 : const ConvolveParams *const conv_params) {
55 20259700 : const uint8_t *src_ptr = src;
56 20259700 : const int32_t dst_stride = conv_params->dst_stride;
/* round_0 and bd are fixed at 3 and 8 in this function; the offsets below
 * are all derived from them, FILTER_BITS and COMPOUND_ROUND1_BITS. */
57 20259700 : const int32_t round_0 = 3;
58 20259700 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
59 20259700 : const int32_t bits = FILTER_BITS - round_0;
60 20259700 : const int32_t bd = 8;
61 20259700 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
62 20259700 : const int32_t round_offset =
63 20259700 : (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
64 20259700 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
65 20259700 : const int32_t offset_comp_avg =
66 20259700 : round_offset * conv_params->bck_offset +
67 20259700 : (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
68 20259700 : (round_offset << DIST_PRECISION_BITS);
69 20259700 : const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
70 20259700 : (1 << (round_1 - bits - 2)) -
71 20259700 : (round_offset << (round_1 - bits - 1));
72 20259700 : const int16_t offset_no_avg =
73 20259700 : (round_offset << (round_1 - bits - 1)) + (1 << (round_1 - bits - 2));
/* fwd_offset goes in the low 16 bits and bck_offset in the high 16 bits of
 * every 32-bit lane once broadcast below. */
74 20259700 : const int32_t factor =
75 20259700 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
/* Broadcast each constant once, in both 128-bit and 256-bit form, so the
 * width branches can pick whichever register size their helpers take. */
76 20259700 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
77 20259700 : const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
78 40519400 : const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
79 20259700 : const __m128i factor_128 = _mm_set1_epi32(factor);
80 20259700 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
81 20259700 : const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
82 40519400 : const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
83 20259700 : const __m256i factor_256 = _mm256_set1_epi32(factor);
84 20259700 : ConvBufType *dst = conv_params->dst;
85 20259700 : int32_t y = h;
86 : __m128i coeffs_128[4];
87 : __m256i coeffs_256[4];
88 :
/* Narrow blocks (w <= 4) use the 128-bit coefficient set and helpers. */
89 20259700 : if (w <= 4) {
90 0 : prepare_half_coeffs_2tap_ssse3(
91 : filter_params_y, subpel_y_q4, coeffs_128);
92 :
93 0 : if (w == 2) {
94 : __m128i s_16[2];
95 :
/* NOTE(review): two source bytes are read through an int16_t pointer cast
 * from const uint8_t *. This drops const and is a type-punned access whose
 * alignment is not checked here - consider a memcpy-based load. */
96 0 : s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
97 :
98 0 : if (conv_params->do_average) {
99 0 : if (conv_params->use_jnt_comp_avg) {
100 : do {
101 0 : const __m128i res = y_convolve_2tap_2x2_ssse3(
102 : src_ptr, src_stride, coeffs_128, s_16);
103 0 : jnt_comp_avg_round_store_2x2_sse2(res,
104 : factor_128,
105 : offset_comp_avg_128,
106 : dst,
107 : dst_stride,
108 : dst8,
109 : dst8_stride);
110 0 : src_ptr += 2 * src_stride;
111 0 : dst += 2 * dst_stride;
112 0 : dst8 += 2 * dst8_stride;
113 0 : y -= 2;
114 0 : } while (y);
115 : }
116 : else {
117 : do {
118 0 : const __m128i res = y_convolve_2tap_2x2_ssse3(
119 : src_ptr, src_stride, coeffs_128, s_16);
120 0 : jnt_avg_round_store_2x2_sse2(res,
121 : offset_avg_128,
122 : dst,
123 : dst_stride,
124 : dst8,
125 : dst8_stride);
126 0 : src_ptr += 2 * src_stride;
127 0 : dst += 2 * dst_stride;
128 0 : dst8 += 2 * dst8_stride;
129 0 : y -= 2;
130 0 : } while (y);
131 : }
132 : }
133 : else {
134 : do {
135 0 : const __m128i res = y_convolve_2tap_2x2_ssse3(
136 : src_ptr, src_stride, coeffs_128, s_16);
137 0 : jnt_no_avg_round_store_2x2_sse2(
138 : res, offset_no_avg_128, dst, dst_stride);
139 0 : src_ptr += 2 * src_stride;
140 0 : dst += 2 * dst_stride;
141 0 : y -= 2;
142 0 : } while (y);
143 : }
144 : }
145 : else {
146 : __m128i s_32[2];
147 :
148 0 : assert(w == 4);
149 :
/* NOTE(review): same type-punned load pattern as the w == 2 case, here
 * reading four bytes through an int32_t pointer. */
150 0 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
151 :
152 0 : if (conv_params->do_average) {
153 0 : if (conv_params->use_jnt_comp_avg) {
154 : do {
155 0 : const __m128i res = y_convolve_2tap_4x2_ssse3(
156 : src_ptr, src_stride, coeffs_128, s_32);
157 0 : jnt_comp_avg_round_store_4x2_sse2(res,
158 : factor_128,
159 : offset_comp_avg_128,
160 : dst,
161 : dst_stride,
162 : dst8,
163 : dst8_stride);
164 0 : src_ptr += 2 * src_stride;
165 0 : dst += 2 * dst_stride;
166 0 : dst8 += 2 * dst8_stride;
167 0 : y -= 2;
168 0 : } while (y);
169 : }
170 : else {
171 : do {
172 0 : const __m128i res = y_convolve_2tap_4x2_ssse3(
173 : src_ptr, src_stride, coeffs_128, s_32);
174 0 : jnt_avg_round_store_4x2_sse2(res,
175 : offset_avg_128,
176 : dst,
177 : dst_stride,
178 : dst8,
179 : dst8_stride);
180 0 : src_ptr += 2 * src_stride;
181 0 : dst += 2 * dst_stride;
182 0 : dst8 += 2 * dst8_stride;
183 0 : y -= 2;
184 0 : } while (y);
185 : }
186 : }
187 : else {
188 : do {
189 0 : const __m128i res = y_convolve_2tap_4x2_ssse3(
190 : src_ptr, src_stride, coeffs_128, s_32);
191 0 : jnt_no_avg_round_store_4x2_sse2(
192 : res, offset_no_avg_128, dst, dst_stride);
193 0 : src_ptr += 2 * src_stride;
194 0 : dst += 2 * dst_stride;
195 0 : y -= 2;
196 0 : } while (y);
197 : }
198 : }
199 : }
200 : else {
/* Wider blocks use the 256-bit coefficient set and helpers. */
201 20259700 : prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
202 :
203 20269000 : if (w == 8) {
204 : __m128i s_64[2];
205 :
206 8669440 : s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
207 :
208 8669440 : if (conv_params->do_average) {
209 2348170 : if (conv_params->use_jnt_comp_avg) {
210 : do {
211 8617400 : const __m256i res = y_convolve_2tap_8x2_avx2(
212 : src_ptr, src_stride, coeffs_256, s_64);
213 8617040 : jnt_comp_avg_round_store_8x2_avx2(res,
214 : factor_256,
215 : offset_comp_avg_256,
216 : dst,
217 : dst_stride,
218 : dst8,
219 : dst8_stride);
220 8617030 : src_ptr += 2 * src_stride;
221 8617030 : dst += 2 * dst_stride;
222 8617030 : dst8 += 2 * dst8_stride;
223 8617030 : y -= 2;
224 8617030 : } while (y);
225 : }
226 : else {
227 : do {
228 8617090 : const __m256i res = y_convolve_2tap_8x2_avx2(
229 : src_ptr, src_stride, coeffs_256, s_64);
/* NOTE(review): this helper carries an _sse2 suffix but is handed 256-bit
 * arguments (res, offset_avg_256), unlike its _avx2-named siblings. */
230 8616940 : jnt_avg_round_store_8x2_sse2(res,
231 : offset_avg_256,
232 : dst,
233 : dst_stride,
234 : dst8,
235 : dst8_stride);
236 8616700 : src_ptr += 2 * src_stride;
237 8616700 : dst += 2 * dst_stride;
238 8616700 : dst8 += 2 * dst8_stride;
239 8616700 : y -= 2;
240 8616700 : } while (y);
241 : }
242 : }
243 : else {
244 : do {
245 43828800 : const __m256i res = y_convolve_2tap_8x2_avx2(
246 : src_ptr, src_stride, coeffs_256, s_64);
247 43832200 : jnt_no_avg_round_store_8x2_avx2(
248 : res, offset_no_avg_256, dst, dst_stride);
249 43829000 : src_ptr += 2 * src_stride;
250 43829000 : dst += 2 * dst_stride;
251 43829000 : y -= 2;
252 43829000 : } while (y);
253 : }
254 : }
255 11599600 : else if (w == 16) {
256 : __m128i s_128[2];
257 : __m256i r[2];
258 :
259 7005530 : s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
260 :
261 7005530 : if (conv_params->do_average) {
262 1860260 : if (conv_params->use_jnt_comp_avg) {
263 : do {
264 9114440 : y_convolve_2tap_16x2_avx2(
265 : src_ptr, src_stride, coeffs_256, s_128, r);
266 : jnt_comp_avg_round_store_16x2_avx2(r,
267 : factor_256,
268 : offset_comp_avg_256,
269 : dst,
270 : dst_stride,
271 : dst8,
272 : dst8_stride);
273 9114160 : src_ptr += 2 * src_stride;
274 9114160 : dst += 2 * dst_stride;
275 9114160 : dst8 += 2 * dst8_stride;
276 9114160 : y -= 2;
277 9114160 : } while (y);
278 : }
279 : else {
280 : do {
281 9114470 : y_convolve_2tap_16x2_avx2(
282 : src_ptr, src_stride, coeffs_256, s_128, r);
283 9113920 : jnt_avg_round_store_16x2_avx2(r,
284 : offset_avg_256,
285 : dst,
286 : dst_stride,
287 : dst8,
288 : dst8_stride);
289 9114220 : src_ptr += 2 * src_stride;
290 9114220 : dst += 2 * dst_stride;
291 9114220 : dst8 += 2 * dst8_stride;
292 9114220 : y -= 2;
293 9114220 : } while (y);
294 : }
295 : }
296 : else {
297 : do {
298 47448400 : y_convolve_2tap_16x2_avx2(
299 : src_ptr, src_stride, coeffs_256, s_128, r);
300 47458100 : jnt_no_avg_round_store_16x2_avx2(
301 : r, offset_no_avg_256, dst, dst_stride);
302 47448800 : src_ptr += 2 * src_stride;
303 47448800 : dst += 2 * dst_stride;
304 47448800 : y -= 2;
305 47448800 : } while (y);
306 : }
307 : }
308 4594040 : else if (w == 32) {
309 : __m256i s_256[2];
310 :
311 3606640 : s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
312 :
/* From here on each row is handled by the per-32-column helpers. The two
 * slots of s_256 swap roles between the first and second call of every
 * iteration: the slot written through s1 by one call is the s0 input of
 * the next. */
313 3606640 : if (conv_params->do_average) {
314 1052260 : if (conv_params->use_jnt_comp_avg) {
315 : do {
316 5832260 : jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
317 : coeffs_256,
318 : factor_256,
319 : offset_comp_avg_256,
320 : s_256[0],
321 : &s_256[1],
322 : dst,
323 : dst8);
324 5832040 : jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
325 : coeffs_256,
326 : factor_256,
327 : offset_comp_avg_256,
328 : s_256[1],
329 : &s_256[0],
330 5832040 : dst + dst_stride,
331 : dst8 + dst8_stride);
332 5832100 : src_ptr += 2 * src_stride;
333 5832100 : dst += 2 * dst_stride;
334 5832100 : dst8 += 2 * dst8_stride;
335 5832100 : y -= 2;
336 5832100 : } while (y);
337 : }
338 : else {
339 : do {
340 5832200 : jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
341 : coeffs_256,
342 : offset_avg_256,
343 : s_256[0],
344 : &s_256[1],
345 : dst,
346 : dst8);
347 5831990 : jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
348 : coeffs_256,
349 : offset_avg_256,
350 : s_256[1],
351 : &s_256[0],
352 5831990 : dst + dst_stride,
353 : dst8 + dst8_stride);
354 5832050 : src_ptr += 2 * src_stride;
355 5832050 : dst += 2 * dst_stride;
356 5832050 : dst8 += 2 * dst8_stride;
357 5832050 : y -= 2;
358 5832050 : } while (y);
359 : }
360 : }
361 : else {
362 : do {
363 28920400 : jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
364 : coeffs_256,
365 : offset_no_avg_256,
366 : s_256[0],
367 : &s_256[1],
368 : dst);
369 28920700 : jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
370 : coeffs_256,
371 : offset_no_avg_256,
372 : s_256[1],
373 : &s_256[0],
374 28920700 : dst + dst_stride);
375 28920800 : src_ptr += 2 * src_stride;
376 28920800 : dst += 2 * dst_stride;
377 28920800 : y -= 2;
378 28920800 : } while (y);
379 : }
380 : }
381 987400 : else if (w == 64) {
/* Two 32-column segments per row: s_256[row slot][segment]. */
382 : __m256i s_256[2][2];
383 :
384 995046 : s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
385 995046 : s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
386 :
387 995046 : if (conv_params->do_average) {
388 325072 : if (conv_params->use_jnt_comp_avg) {
389 : do {
390 2694190 : jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
391 : coeffs_256,
392 : factor_256,
393 : offset_comp_avg_256,
394 : s_256[0][0],
395 : &s_256[1][0],
396 : dst,
397 : dst8);
398 2694120 : jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride + 32,
399 : coeffs_256,
400 : factor_256,
401 : offset_comp_avg_256,
402 : s_256[0][1],
403 : &s_256[1][1],
404 : dst + 32,
405 : dst8 + 32);
406 2694170 : jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
407 : coeffs_256,
408 : factor_256,
409 : offset_comp_avg_256,
410 : s_256[1][0],
411 : &s_256[0][0],
412 2694170 : dst + dst_stride,
413 : dst8 + dst8_stride);
414 2694140 : jnt_y_comp_avg_2tap_32_avx2(
415 2694140 : src_ptr + 2 * src_stride + 32,
416 : coeffs_256,
417 : factor_256,
418 : offset_comp_avg_256,
419 : s_256[1][1],
420 : &s_256[0][1],
421 2694140 : dst + dst_stride + 32,
422 2694140 : dst8 + dst8_stride + 32);
423 :
424 2694160 : src_ptr += 2 * src_stride;
425 2694160 : dst += 2 * dst_stride;
426 2694160 : dst8 += 2 * dst8_stride;
427 2694160 : y -= 2;
428 2694160 : } while (y);
429 : }
430 : else {
431 : do {
432 2694040 : jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
433 : coeffs_256,
434 : offset_avg_256,
435 : s_256[0][0],
436 : &s_256[1][0],
437 : dst,
438 : dst8);
439 2694050 : jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 32,
440 : coeffs_256,
441 : offset_avg_256,
442 : s_256[0][1],
443 : &s_256[1][1],
444 2694050 : dst + 32,
445 : dst8 + 32);
446 2694060 : jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
447 : coeffs_256,
448 : offset_avg_256,
449 : s_256[1][0],
450 : &s_256[0][0],
451 2694060 : dst + dst_stride,
452 : dst8 + dst8_stride);
453 2694050 : jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
454 : coeffs_256,
455 : offset_avg_256,
456 : s_256[1][1],
457 : &s_256[0][1],
458 2694050 : dst + dst_stride + 32,
459 2694050 : dst8 + dst8_stride + 32);
460 :
461 2694040 : src_ptr += 2 * src_stride;
462 2694040 : dst += 2 * dst_stride;
463 2694040 : dst8 += 2 * dst8_stride;
464 2694040 : y -= 2;
465 2694040 : } while (y);
466 : }
467 : }
468 : else {
469 : do {
470 11137000 : jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
471 : coeffs_256,
472 : offset_no_avg_256,
473 : s_256[0][0],
474 : &s_256[1][0],
475 : dst);
476 11137100 : jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 32,
477 : coeffs_256,
478 : offset_no_avg_256,
479 : s_256[0][1],
480 : &s_256[1][1],
481 : dst + 32);
482 11137100 : jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
483 : coeffs_256,
484 : offset_no_avg_256,
485 : s_256[1][0],
486 : &s_256[0][0],
487 11137100 : dst + dst_stride);
488 11137000 : jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
489 : coeffs_256,
490 : offset_no_avg_256,
491 : s_256[1][1],
492 : &s_256[0][1],
493 11137000 : dst + dst_stride + 32);
494 :
495 11137100 : src_ptr += 2 * src_stride;
496 11137100 : dst += 2 * dst_stride;
497 11137100 : y -= 2;
498 11137100 : } while (y);
499 : }
500 : }
501 : else {
/* Remaining width is asserted to be 128: four 32-column segments per row,
 * s_256[row slot][segment]. */
502 : __m256i s_256[2][4];
503 :
504 0 : assert(w == 128);
505 :
506 0 : s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
507 0 : s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
508 0 : s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
509 0 : s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
510 :
511 0 : if (conv_params->do_average) {
512 0 : if (conv_params->use_jnt_comp_avg) {
513 : do {
514 0 : jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
515 : coeffs_256,
516 : factor_256,
517 : offset_comp_avg_256,
518 : s_256[0][0],
519 : &s_256[1][0],
520 : dst,
521 : dst8);
522 0 : jnt_y_comp_avg_2tap_32_avx2(
523 0 : src_ptr + src_stride + 1 * 32,
524 : coeffs_256,
525 : factor_256,
526 : offset_comp_avg_256,
527 : s_256[0][1],
528 : &s_256[1][1],
529 : dst + 1 * 32,
530 : dst8 + 1 * 32);
531 0 : jnt_y_comp_avg_2tap_32_avx2(
532 0 : src_ptr + src_stride + 2 * 32,
533 : coeffs_256,
534 : factor_256,
535 : offset_comp_avg_256,
536 : s_256[0][2],
537 : &s_256[1][2],
538 : dst + 2 * 32,
539 : dst8 + 2 * 32);
540 0 : jnt_y_comp_avg_2tap_32_avx2(
541 0 : src_ptr + src_stride + 3 * 32,
542 : coeffs_256,
543 : factor_256,
544 : offset_comp_avg_256,
545 : s_256[0][3],
546 : &s_256[1][3],
547 : dst + 3 * 32,
548 : dst8 + 3 * 32);
549 0 : jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
550 : coeffs_256,
551 : factor_256,
552 : offset_comp_avg_256,
553 : s_256[1][0],
554 : &s_256[0][0],
555 0 : dst + dst_stride,
556 : dst8 + dst8_stride);
557 0 : jnt_y_comp_avg_2tap_32_avx2(
558 0 : src_ptr + 2 * src_stride + 1 * 32,
559 : coeffs_256,
560 : factor_256,
561 : offset_comp_avg_256,
562 : s_256[1][1],
563 : &s_256[0][1],
564 0 : dst + dst_stride + 1 * 32,
565 0 : dst8 + dst8_stride + 1 * 32);
566 0 : jnt_y_comp_avg_2tap_32_avx2(
567 0 : src_ptr + 2 * src_stride + 2 * 32,
568 : coeffs_256,
569 : factor_256,
570 : offset_comp_avg_256,
571 : s_256[1][2],
572 : &s_256[0][2],
573 0 : dst + dst_stride + 2 * 32,
574 0 : dst8 + dst8_stride + 2 * 32);
575 0 : jnt_y_comp_avg_2tap_32_avx2(
576 0 : src_ptr + 2 * src_stride + 3 * 32,
577 : coeffs_256,
578 : factor_256,
579 : offset_comp_avg_256,
580 : s_256[1][3],
581 : &s_256[0][3],
582 0 : dst + dst_stride + 3 * 32,
583 0 : dst8 + dst8_stride + 3 * 32);
584 :
585 0 : src_ptr += 2 * src_stride;
586 0 : dst += 2 * dst_stride;
587 0 : dst8 += 2 * dst8_stride;
588 0 : y -= 2;
589 0 : } while (y);
590 : }
591 : else {
592 : do {
593 0 : jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
594 : coeffs_256,
595 : offset_avg_256,
596 : s_256[0][0],
597 : &s_256[1][0],
598 : dst,
599 : dst8);
600 0 : jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
601 : coeffs_256,
602 : offset_avg_256,
603 : s_256[0][1],
604 : &s_256[1][1],
605 0 : dst + 1 * 32,
606 : dst8 + 1 * 32);
607 0 : jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
608 : coeffs_256,
609 : offset_avg_256,
610 : s_256[0][2],
611 : &s_256[1][2],
612 0 : dst + 2 * 32,
613 : dst8 + 2 * 32);
614 0 : jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
615 : coeffs_256,
616 : offset_avg_256,
617 : s_256[0][3],
618 : &s_256[1][3],
619 0 : dst + 3 * 32,
620 : dst8 + 3 * 32);
621 0 : jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
622 : coeffs_256,
623 : offset_avg_256,
624 : s_256[1][0],
625 : &s_256[0][0],
626 0 : dst + dst_stride,
627 : dst8 + dst8_stride);
628 0 : jnt_y_avg_2tap_32_avx2(
629 0 : src_ptr + 2 * src_stride + 1 * 32,
630 : coeffs_256,
631 : offset_avg_256,
632 : s_256[1][1],
633 : &s_256[0][1],
634 0 : dst + dst_stride + 1 * 32,
635 0 : dst8 + dst8_stride + 1 * 32);
636 0 : jnt_y_avg_2tap_32_avx2(
637 0 : src_ptr + 2 * src_stride + 2 * 32,
638 : coeffs_256,
639 : offset_avg_256,
640 : s_256[1][2],
641 : &s_256[0][2],
642 0 : dst + dst_stride + 2 * 32,
643 0 : dst8 + dst8_stride + 2 * 32);
644 0 : jnt_y_avg_2tap_32_avx2(
645 0 : src_ptr + 2 * src_stride + 3 * 32,
646 : coeffs_256,
647 : offset_avg_256,
648 : s_256[1][3],
649 : &s_256[0][3],
650 0 : dst + dst_stride + 3 * 32,
651 0 : dst8 + dst8_stride + 3 * 32);
652 :
653 0 : src_ptr += 2 * src_stride;
654 0 : dst += 2 * dst_stride;
655 0 : dst8 += 2 * dst8_stride;
656 0 : y -= 2;
657 0 : } while (y);
658 : }
659 : }
660 : else {
661 : do {
662 0 : jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
663 : coeffs_256,
664 : offset_no_avg_256,
665 : s_256[0][0],
666 : &s_256[1][0],
667 : dst);
668 0 : jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
669 : coeffs_256,
670 : offset_no_avg_256,
671 : s_256[0][1],
672 : &s_256[1][1],
673 : dst + 1 * 32);
674 0 : jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
675 : coeffs_256,
676 : offset_no_avg_256,
677 : s_256[0][2],
678 : &s_256[1][2],
679 : dst + 2 * 32);
680 0 : jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
681 : coeffs_256,
682 : offset_no_avg_256,
683 : s_256[0][3],
684 : &s_256[1][3],
685 : dst + 3 * 32);
686 0 : jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
687 : coeffs_256,
688 : offset_no_avg_256,
689 : s_256[1][0],
690 : &s_256[0][0],
691 0 : dst + dst_stride);
692 0 : jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32,
693 : coeffs_256,
694 : offset_no_avg_256,
695 : s_256[1][1],
696 : &s_256[0][1],
697 0 : dst + dst_stride + 1 * 32);
698 0 : jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32,
699 : coeffs_256,
700 : offset_no_avg_256,
701 : s_256[1][2],
702 : &s_256[0][2],
703 0 : dst + dst_stride + 2 * 32);
704 0 : jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32,
705 : coeffs_256,
706 : offset_no_avg_256,
707 : s_256[1][3],
708 : &s_256[0][3],
709 0 : dst + dst_stride + 3 * 32);
710 :
711 0 : src_ptr += 2 * src_stride;
712 0 : dst += 2 * dst_stride;
713 0 : y -= 2;
714 0 : } while (y);
715 : }
716 : }
717 : }
718 20276000 : }
719 :
/*
 * Vertical 4-tap compound convolution.
 *
 * Handles w == 2 and w == 4 with 128-bit helpers and w == 8 with 256-bit
 * helpers; any other width is asserted to be 16. The store mode is picked
 * exactly as in the 2-tap function:
 *   do_average && use_jnt_comp_avg : jnt_comp_avg_* helpers (factor, dst, dst8)
 *   do_average only                : jnt_avg_* helpers (dst, dst8)
 *   otherwise                      : jnt_no_avg_* helpers (dst only)
 *
 * Reading starts one row above src (src - src_stride). Three rows are
 * loaded and interleaved before the loop; each iteration then advances
 * src_ptr by two rows, calls the width-specific convolve helper, stores
 * two output rows and carries the second interleaved vector over into
 * slot 0 for the next iteration.
 *
 * Loops stop when y reaches exactly 0 after stepping by 2, so h has to be
 * a positive even number.
 */
720 143763 : static void jnt_convolve_y_4tap_avx2(
721 : const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
722 : const int32_t dst8_stride, const int32_t w, const int32_t h,
723 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
724 : const ConvolveParams *const conv_params) {
725 143763 : const uint8_t *src_ptr = src - src_stride;
726 143763 : const int32_t dst_stride = conv_params->dst_stride;
/* Same constant derivation as the 2-tap function: round_0 and bd are fixed
 * at 3 and 8. */
727 143763 : const int32_t round_0 = 3;
728 143763 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
729 143763 : const int32_t bits = FILTER_BITS - round_0;
730 143763 : const int32_t bd = 8;
731 143763 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
732 143763 : const int32_t round_offset =
733 143763 : (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
734 143763 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
735 143763 : const int32_t offset_comp_avg =
736 143763 : round_offset * conv_params->bck_offset +
737 143763 : (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
738 143763 : (round_offset << DIST_PRECISION_BITS);
739 143763 : const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
740 143763 : (1 << (round_1 - bits - 2)) -
741 143763 : (round_offset << (round_1 - bits - 1));
742 143763 : const int16_t offset_no_avg =
743 143763 : (round_offset << (round_1 - bits - 1)) + (1 << (round_1 - bits - 2));
/* fwd_offset in the low 16 bits, bck_offset in the high 16 bits. */
744 143763 : const int32_t factor =
745 143763 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
746 143763 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
747 143763 : const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
748 287526 : const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
749 143763 : const __m128i factor_128 = _mm_set1_epi32(factor);
750 143763 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
751 143763 : const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
752 287526 : const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
753 143763 : const __m256i factor_256 = _mm256_set1_epi32(factor);
754 143763 : ConvBufType *dst = conv_params->dst;
755 143763 : int32_t y = h;
756 : __m128i coeffs_128[4];
757 : __m256i coeffs_256[4];
758 :
/* Narrow blocks (w <= 4) use the 128-bit coefficient set and helpers. */
759 143763 : if (w <= 4) {
760 62618 : prepare_half_coeffs_4tap_ssse3(
761 : filter_params_y, subpel_y_q4, coeffs_128);
762 :
763 62618 : if (w == 2) {
764 : __m128i s_16[4], ss_128[2];
765 :
/* Preload rows 0..2 relative to src_ptr.
 * NOTE(review): each load reads two bytes through an int16_t pointer cast
 * from const uint8_t * (type-punned, alignment unchecked, const dropped). */
766 0 : s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
767 0 : s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
768 0 : s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
769 :
770 0 : const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
771 0 : const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
772 :
773 0 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
774 :
775 0 : if (conv_params->do_average) {
776 0 : if (conv_params->use_jnt_comp_avg) {
777 : do {
778 0 : src_ptr += 2 * src_stride;
779 0 : const __m128i res = y_convolve_4tap_2x2_ssse3(
780 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
781 0 : jnt_comp_avg_round_store_2x2_sse2(res,
782 : factor_128,
783 : offset_comp_avg_128,
784 : dst,
785 : dst_stride,
786 : dst8,
787 : dst8_stride);
788 0 : ss_128[0] = ss_128[1];
789 0 : dst += 2 * dst_stride;
790 0 : dst8 += 2 * dst8_stride;
791 0 : y -= 2;
792 0 : } while (y);
793 : }
794 : else {
795 : do {
796 0 : src_ptr += 2 * src_stride;
797 0 : const __m128i res = y_convolve_4tap_2x2_ssse3(
798 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
799 0 : jnt_avg_round_store_2x2_sse2(res,
800 : offset_avg_128,
801 : dst,
802 : dst_stride,
803 : dst8,
804 : dst8_stride);
805 0 : ss_128[0] = ss_128[1];
806 0 : dst += 2 * dst_stride;
807 0 : dst8 += 2 * dst8_stride;
808 0 : y -= 2;
809 0 : } while (y);
810 : }
811 : }
812 : else {
813 : do {
814 0 : src_ptr += 2 * src_stride;
815 0 : const __m128i res = y_convolve_4tap_2x2_ssse3(
816 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
817 0 : jnt_no_avg_round_store_2x2_sse2(
818 : res, offset_no_avg_128, dst, dst_stride);
819 0 : ss_128[0] = ss_128[1];
820 0 : dst += 2 * dst_stride;
821 0 : y -= 2;
822 0 : } while (y);
823 : }
824 : }
825 : else {
826 : __m128i s_32[4], ss_128[2];
827 :
828 62618 : assert(w == 4);
829 :
/* Preload rows 0..2; same type-punned load pattern as above, four bytes
 * at a time through an int32_t pointer. */
830 62618 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
831 62618 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
832 62618 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
833 :
834 62618 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
835 125236 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
836 :
837 62618 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
838 :
839 62618 : if (conv_params->do_average) {
840 26716 : if (conv_params->use_jnt_comp_avg) {
841 : do {
842 19232 : src_ptr += 2 * src_stride;
843 19232 : const __m128i res = y_convolve_4tap_4x2_ssse3(
844 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
845 19232 : jnt_comp_avg_round_store_4x2_sse2(res,
846 : factor_128,
847 : offset_comp_avg_128,
848 : dst,
849 : dst_stride,
850 : dst8,
851 : dst8_stride);
852 19232 : ss_128[0] = ss_128[1];
853 19232 : dst += 2 * dst_stride;
854 19232 : dst8 += 2 * dst8_stride;
855 19232 : y -= 2;
856 19232 : } while (y);
857 : }
858 : else {
859 : do {
860 34200 : src_ptr += 2 * src_stride;
861 34200 : const __m128i res = y_convolve_4tap_4x2_ssse3(
862 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
863 34200 : jnt_avg_round_store_4x2_sse2(res,
864 : offset_avg_128,
865 : dst,
866 : dst_stride,
867 : dst8,
868 : dst8_stride);
869 34200 : ss_128[0] = ss_128[1];
870 34200 : dst += 2 * dst_stride;
871 34200 : dst8 += 2 * dst8_stride;
872 34200 : y -= 2;
873 34200 : } while (y);
874 : }
875 : }
876 : else {
877 : do {
878 71804 : src_ptr += 2 * src_stride;
879 71804 : const __m128i res = y_convolve_4tap_4x2_ssse3(
880 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
881 71804 : jnt_no_avg_round_store_4x2_sse2(
882 : res, offset_no_avg_128, dst, dst_stride);
883 71804 : ss_128[0] = ss_128[1];
884 71804 : dst += 2 * dst_stride;
885 71804 : y -= 2;
886 71804 : } while (y);
887 : }
888 : }
889 : }
890 : else {
/* Wider blocks (8 or 16 columns) use the 256-bit coefficient set. */
891 81145 : prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
892 :
893 81146 : if (w == 8) {
894 : __m128i s_64[4];
895 : __m256i ss_256[2];
896 :
897 45428 : s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
898 45428 : s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
899 45428 : s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
900 :
901 : // Load lines a and b. Line a to lower 128, line b to upper
902 : // 128
903 45428 : const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
904 90856 : const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
905 :
906 45428 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
907 :
908 45428 : if (conv_params->do_average) {
909 19472 : if (conv_params->use_jnt_comp_avg) {
910 : do {
911 14528 : src_ptr += 2 * src_stride;
912 14528 : const __m256i res = y_convolve_4tap_8x2_avx2(
913 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
914 14528 : jnt_comp_avg_round_store_8x2_avx2(res,
915 : factor_256,
916 : offset_comp_avg_256,
917 : dst,
918 : dst_stride,
919 : dst8,
920 : dst8_stride);
921 14528 : ss_256[0] = ss_256[1];
922 14528 : dst += 2 * dst_stride;
923 14528 : dst8 += 2 * dst8_stride;
924 14528 : y -= 2;
925 14528 : } while (y);
926 : }
927 : else {
928 : do {
929 24416 : src_ptr += 2 * src_stride;
930 24416 : const __m256i res = y_convolve_4tap_8x2_avx2(
931 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
/* NOTE(review): _sse2-named helper receiving 256-bit arguments, as in the
 * 2-tap w == 8 path. */
932 24416 : jnt_avg_round_store_8x2_sse2(res,
933 : offset_avg_256,
934 : dst,
935 : dst_stride,
936 : dst8,
937 : dst8_stride);
938 24416 : ss_256[0] = ss_256[1];
939 24416 : dst += 2 * dst_stride;
940 24416 : dst8 += 2 * dst8_stride;
941 24416 : y -= 2;
942 24416 : } while (y);
943 : }
944 : }
945 : else {
946 : do {
947 51912 : src_ptr += 2 * src_stride;
948 51912 : const __m256i res = y_convolve_4tap_8x2_avx2(
949 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
950 51912 : jnt_no_avg_round_store_8x2_avx2(
951 : res, offset_no_avg_256, dst, dst_stride);
952 51912 : ss_256[0] = ss_256[1];
953 51912 : dst += 2 * dst_stride;
954 51912 : y -= 2;
955 51912 : } while (y);
956 : }
957 : }
958 : else {
959 : __m128i s_128[4];
960 : __m256i ss_256[4], r[2];
961 :
962 35718 : assert(w == 16);
963 :
964 35718 : s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
965 35718 : s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
966 35718 : s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
967 :
968 : // Load lines a and b. Line a to lower 128, line b to upper
969 : // 128
970 35718 : const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
971 71436 : const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
972 :
/* Slots 0/1 hold the low-half interleave and slots 2/3 the high-half
 * interleave; 1 -> 0 and 3 -> 2 are carried over after every iteration. */
973 35718 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
974 35718 : ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
975 :
976 35718 : if (conv_params->do_average) {
977 15706 : if (conv_params->use_jnt_comp_avg) {
978 : do {
979 12316 : src_ptr += 2 * src_stride;
980 12316 : y_convolve_4tap_16x2_avx2(
981 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
982 : jnt_comp_avg_round_store_16x2_avx2(r,
983 : factor_256,
984 : offset_comp_avg_256,
985 : dst,
986 : dst_stride,
987 : dst8,
988 : dst8_stride);
989 12316 : ss_256[0] = ss_256[1];
990 12316 : ss_256[2] = ss_256[3];
991 12316 : dst += 2 * dst_stride;
992 12316 : dst8 += 2 * dst8_stride;
993 12316 : y -= 2;
994 12316 : } while (y);
995 : }
996 : else {
997 : do {
998 19096 : src_ptr += 2 * src_stride;
999 19096 : y_convolve_4tap_16x2_avx2(
1000 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1001 19096 : jnt_avg_round_store_16x2_avx2(r,
1002 : offset_avg_256,
1003 : dst,
1004 : dst_stride,
1005 : dst8,
1006 : dst8_stride);
1007 19096 : ss_256[0] = ss_256[1];
1008 19096 : ss_256[2] = ss_256[3];
1009 19096 : dst += 2 * dst_stride;
1010 19096 : dst8 += 2 * dst8_stride;
1011 19096 : y -= 2;
1012 19096 : } while (y);
1013 : }
1014 : }
1015 : else {
1016 : do {
1017 40023 : src_ptr += 2 * src_stride;
1018 40023 : y_convolve_4tap_16x2_avx2(
1019 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1020 40024 : jnt_no_avg_round_store_16x2_avx2(
1021 : r, offset_no_avg_256, dst, dst_stride);
1022 40023 : ss_256[0] = ss_256[1];
1023 40023 : ss_256[2] = ss_256[3];
1024 40023 : dst += 2 * dst_stride;
1025 40023 : y -= 2;
1026 40023 : } while (y);
1027 : }
1028 : }
1029 : }
1030 143764 : }
1031 :
1032 7752470 : static void jnt_convolve_y_6tap_avx2(
1033 : const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
1034 : const int32_t dst8_stride, const int32_t w, const int32_t h,
1035 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
1036 : const ConvolveParams *const conv_params) {
1037 7752470 : const uint8_t *src_ptr = src - 2 * src_stride;
1038 7752470 : const int32_t dst_stride = conv_params->dst_stride;
1039 7752470 : const int32_t round_0 = 3;
1040 7752470 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
1041 7752470 : const int32_t bits = FILTER_BITS - round_0;
1042 7752470 : const int32_t bd = 8;
1043 7752470 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
1044 7752470 : const int32_t round_offset =
1045 7752470 : (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
1046 7752470 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
1047 7752470 : const int32_t offset_comp_avg =
1048 7752470 : round_offset * conv_params->bck_offset +
1049 7752470 : (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
1050 7752470 : (round_offset << DIST_PRECISION_BITS);
1051 7752470 : const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
1052 7752470 : (1 << (round_1 - bits - 2)) -
1053 7752470 : (round_offset << (round_1 - bits - 1));
1054 7752470 : const int16_t offset_no_avg =
1055 7752470 : (round_offset << (round_1 - bits - 1)) + (1 << (round_1 - bits - 2));
1056 7752470 : const int32_t factor =
1057 7752470 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
1058 7752470 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
1059 7752470 : const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
1060 15504900 : const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
1061 7752470 : const __m128i factor_128 = _mm_set1_epi32(factor);
1062 7752470 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1063 7752470 : const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
1064 15504900 : const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
1065 7752470 : const __m256i factor_256 = _mm256_set1_epi32(factor);
1066 7752470 : ConvBufType *dst = conv_params->dst;
1067 : int32_t x;
1068 7752470 : int32_t y = h;
1069 : __m128i coeffs_128[4];
1070 : __m256i coeffs_256[4];
1071 :
1072 7752470 : if (w <= 4) {
1073 82460 : prepare_half_coeffs_6tap_ssse3(
1074 : filter_params_y, subpel_y_q4, coeffs_128);
1075 :
1076 82460 : y = h;
1077 :
1078 82460 : if (w == 2) {
1079 : __m128i s_16[6], ss_128[3];
1080 :
1081 0 : s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1082 0 : s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1083 0 : s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1084 0 : s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
1085 0 : s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
1086 :
1087 0 : const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1088 0 : const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1089 0 : const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1090 0 : const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
1091 :
1092 0 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1093 0 : ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1094 :
1095 0 : if (conv_params->do_average) {
1096 0 : if (conv_params->use_jnt_comp_avg) {
1097 : do {
1098 0 : src_ptr += 2 * src_stride;
1099 0 : const __m128i res = y_convolve_6tap_2x2_ssse3(
1100 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
1101 0 : jnt_comp_avg_round_store_2x2_sse2(res,
1102 : factor_128,
1103 : offset_comp_avg_128,
1104 : dst,
1105 : dst_stride,
1106 : dst8,
1107 : dst8_stride);
1108 0 : ss_128[0] = ss_128[1];
1109 0 : ss_128[1] = ss_128[2];
1110 0 : dst += 2 * dst_stride;
1111 0 : dst8 += 2 * dst8_stride;
1112 0 : y -= 2;
1113 0 : } while (y);
1114 : }
1115 : else {
1116 : do {
1117 0 : src_ptr += 2 * src_stride;
1118 0 : const __m128i res = y_convolve_6tap_2x2_ssse3(
1119 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
1120 0 : jnt_avg_round_store_2x2_sse2(res,
1121 : offset_avg_128,
1122 : dst,
1123 : dst_stride,
1124 : dst8,
1125 : dst8_stride);
1126 0 : ss_128[0] = ss_128[1];
1127 0 : ss_128[1] = ss_128[2];
1128 0 : dst += 2 * dst_stride;
1129 0 : dst8 += 2 * dst8_stride;
1130 0 : y -= 2;
1131 0 : } while (y);
1132 : }
1133 : }
1134 : else {
1135 : do {
1136 0 : src_ptr += 2 * src_stride;
1137 0 : const __m128i res = y_convolve_6tap_2x2_ssse3(
1138 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
1139 0 : jnt_no_avg_round_store_2x2_sse2(
1140 : res, offset_no_avg_128, dst, dst_stride);
1141 0 : ss_128[0] = ss_128[1];
1142 0 : ss_128[1] = ss_128[2];
1143 0 : dst += 2 * dst_stride;
1144 0 : y -= 2;
1145 0 : } while (y);
1146 : }
1147 : }
1148 : else {
1149 : __m128i s_32[6], ss_128[3];
1150 :
1151 82460 : assert(w == 4);
1152 :
1153 82460 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1154 82460 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1155 82460 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1156 82460 : s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
1157 82460 : s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
1158 :
1159 82460 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1160 82460 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1161 82460 : const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1162 164920 : const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1163 :
1164 82460 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1165 82460 : ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1166 :
1167 82460 : if (conv_params->do_average) {
1168 36452 : if (conv_params->use_jnt_comp_avg) {
1169 : do {
1170 74704 : src_ptr += 2 * src_stride;
1171 74704 : const __m128i res = y_convolve_6tap_4x2_ssse3(
1172 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
1173 74704 : jnt_comp_avg_round_store_4x2_sse2(res,
1174 : factor_128,
1175 : offset_comp_avg_128,
1176 : dst,
1177 : dst_stride,
1178 : dst8,
1179 : dst8_stride);
1180 74704 : ss_128[0] = ss_128[1];
1181 74704 : ss_128[1] = ss_128[2];
1182 74704 : dst += 2 * dst_stride;
1183 74704 : dst8 += 2 * dst8_stride;
1184 74704 : y -= 2;
1185 74704 : } while (y);
1186 : }
1187 : else {
1188 : do {
1189 122920 : src_ptr += 2 * src_stride;
1190 122920 : const __m128i res = y_convolve_6tap_4x2_ssse3(
1191 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
1192 122920 : jnt_avg_round_store_4x2_sse2(res,
1193 : offset_avg_128,
1194 : dst,
1195 : dst_stride,
1196 : dst8,
1197 : dst8_stride);
1198 122920 : ss_128[0] = ss_128[1];
1199 122920 : ss_128[1] = ss_128[2];
1200 122920 : dst += 2 * dst_stride;
1201 122920 : dst8 += 2 * dst8_stride;
1202 122920 : y -= 2;
1203 122920 : } while (y);
1204 : }
1205 : }
1206 : else {
1207 : do {
1208 248624 : src_ptr += 2 * src_stride;
1209 248624 : const __m128i res = y_convolve_6tap_4x2_ssse3(
1210 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
1211 248624 : jnt_no_avg_round_store_4x2_sse2(
1212 : res, offset_no_avg_128, dst, dst_stride);
1213 248624 : ss_128[0] = ss_128[1];
1214 248624 : ss_128[1] = ss_128[2];
1215 248624 : dst += 2 * dst_stride;
1216 248624 : y -= 2;
1217 248624 : } while (y);
1218 : }
1219 : }
1220 : }
1221 : else {
1222 7670010 : prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1223 :
1224 7671280 : if (w == 8) {
1225 : __m128i s_64[6];
1226 : __m256i ss_256[3];
1227 :
1228 3378780 : s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1229 3378780 : s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1230 3378780 : s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1231 3378780 : s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1232 3378780 : s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1233 :
1234 : // Load lines a and b. Line a to lower 128, line b to upper
1235 : // 128
1236 3378780 : const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1237 3378780 : const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1238 3378780 : const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1239 6757560 : const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1240 :
1241 3378780 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1242 3378780 : ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1243 :
1244 3378780 : y = h;
1245 :
1246 3378780 : if (conv_params->do_average) {
1247 1347710 : if (conv_params->use_jnt_comp_avg) {
1248 : do {
1249 4385580 : src_ptr += 2 * src_stride;
1250 4385580 : const __m256i res = y_convolve_6tap_8x2_avx2(
1251 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
1252 4385540 : jnt_comp_avg_round_store_8x2_avx2(res,
1253 : factor_256,
1254 : offset_comp_avg_256,
1255 : dst,
1256 : dst_stride,
1257 : dst8,
1258 : dst8_stride);
1259 4385510 : ss_256[0] = ss_256[1];
1260 4385510 : ss_256[1] = ss_256[2];
1261 4385510 : dst += 2 * dst_stride;
1262 4385510 : dst8 += 2 * dst8_stride;
1263 4385510 : y -= 2;
1264 4385510 : } while (y);
1265 : }
1266 : else {
1267 : do {
1268 5559460 : src_ptr += 2 * src_stride;
1269 5559460 : const __m256i res = y_convolve_6tap_8x2_avx2(
1270 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
1271 5559510 : jnt_avg_round_store_8x2_sse2(res,
1272 : offset_avg_256,
1273 : dst,
1274 : dst_stride,
1275 : dst8,
1276 : dst8_stride);
1277 5559300 : ss_256[0] = ss_256[1];
1278 5559300 : ss_256[1] = ss_256[2];
1279 5559300 : dst += 2 * dst_stride;
1280 5559300 : dst8 += 2 * dst8_stride;
1281 5559300 : y -= 2;
1282 5559300 : } while (y);
1283 : }
1284 : }
1285 : else {
1286 : do {
1287 15063000 : src_ptr += 2 * src_stride;
1288 15063000 : const __m256i res = y_convolve_6tap_8x2_avx2(
1289 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
1290 15063600 : jnt_no_avg_round_store_8x2_avx2(
1291 : res, offset_no_avg_256, dst, dst_stride);
1292 15063100 : ss_256[0] = ss_256[1];
1293 15063100 : ss_256[1] = ss_256[2];
1294 15063100 : dst += 2 * dst_stride;
1295 15063100 : y -= 2;
1296 15063100 : } while (y);
1297 : }
1298 : }
1299 4292500 : else if (w == 16) {
1300 : __m128i s_128[6];
1301 : __m256i ss_256[6], r[2];
1302 :
1303 2423560 : s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1304 2423560 : s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1305 2423560 : s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1306 2423560 : s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1307 2423560 : s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1308 :
1309 : // Load lines a and b. Line a to lower 128, line b to upper
1310 : // 128
1311 2423560 : const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1312 2423560 : const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1313 2423560 : const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1314 4847120 : const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1315 :
1316 2423560 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1317 2423560 : ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1318 :
1319 2423560 : ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
1320 2423560 : ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
1321 :
1322 2423560 : y = h;
1323 :
1324 2423560 : if (conv_params->do_average) {
1325 953053 : if (conv_params->use_jnt_comp_avg) {
1326 : do {
1327 3759680 : src_ptr += 2 * src_stride;
1328 3759680 : y_convolve_6tap_16x2_avx2(
1329 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1330 : jnt_comp_avg_round_store_16x2_avx2(r,
1331 : factor_256,
1332 : offset_comp_avg_256,
1333 : dst,
1334 : dst_stride,
1335 : dst8,
1336 : dst8_stride);
1337 3759650 : ss_256[0] = ss_256[1];
1338 3759650 : ss_256[1] = ss_256[2];
1339 3759650 : ss_256[3] = ss_256[4];
1340 3759650 : ss_256[4] = ss_256[5];
1341 3759650 : dst += 2 * dst_stride;
1342 3759650 : dst8 += 2 * dst8_stride;
1343 3759650 : y -= 2;
1344 3759650 : } while (y);
1345 : }
1346 : else {
1347 : do {
1348 5641710 : src_ptr += 2 * src_stride;
1349 5641710 : y_convolve_6tap_16x2_avx2(
1350 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1351 5641570 : jnt_avg_round_store_16x2_avx2(r,
1352 : offset_avg_256,
1353 : dst,
1354 : dst_stride,
1355 : dst8,
1356 : dst8_stride);
1357 5641580 : ss_256[0] = ss_256[1];
1358 5641580 : ss_256[1] = ss_256[2];
1359 5641580 : ss_256[3] = ss_256[4];
1360 5641580 : ss_256[4] = ss_256[5];
1361 5641580 : dst += 2 * dst_stride;
1362 5641580 : dst8 += 2 * dst8_stride;
1363 5641580 : y -= 2;
1364 5641580 : } while (y);
1365 : }
1366 : }
1367 : else {
1368 : do {
1369 14654200 : src_ptr += 2 * src_stride;
1370 14654200 : y_convolve_6tap_16x2_avx2(
1371 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1372 14655500 : jnt_no_avg_round_store_16x2_avx2(
1373 : r, offset_no_avg_256, dst, dst_stride);
1374 14654300 : ss_256[0] = ss_256[1];
1375 14654300 : ss_256[1] = ss_256[2];
1376 14654300 : ss_256[3] = ss_256[4];
1377 14654300 : ss_256[4] = ss_256[5];
1378 14654300 : dst += 2 * dst_stride;
1379 14654300 : y -= 2;
1380 14654300 : } while (y);
1381 : }
1382 : }
1383 : else {
1384 : __m256i s_256[6], ss_256[6], tt_256[6], r[4];
1385 :
1386 1868940 : assert(!(w % 32));
1387 :
1388 1868940 : x = 0;
1389 : do {
1390 2304440 : const uint8_t *s = src_ptr + x;
1391 2304440 : ConvBufType *d = dst + x;
1392 2304440 : uint8_t *d8 = dst8 + x;
1393 :
1394 2304440 : s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1395 2304440 : s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1396 2304440 : s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1397 2304440 : s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1398 2304440 : s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1399 :
1400 2304440 : ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1401 2304440 : ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1402 2304440 : ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1403 2304440 : ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1404 :
1405 2304440 : tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
1406 2304440 : tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
1407 2304440 : tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
1408 2304440 : tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
1409 :
1410 2304440 : y = h;
1411 :
1412 2304440 : if (conv_params->do_average) {
1413 904976 : if (conv_params->use_jnt_comp_avg) {
1414 : do {
1415 4191810 : s += 2 * src_stride;
1416 4191810 : y_convolve_6tap_32x2_avx2(s,
1417 : src_stride,
1418 : coeffs_256,
1419 : s_256,
1420 : ss_256,
1421 : tt_256,
1422 : r);
1423 : jnt_comp_avg_round_store_32_avx2(
1424 : r, factor_256, offset_comp_avg_256, d, d8);
1425 4191900 : jnt_comp_avg_round_store_32_avx2(
1426 : r + 2,
1427 : factor_256,
1428 : offset_comp_avg_256,
1429 4191900 : d + dst_stride,
1430 : d8 + dst8_stride);
1431 :
1432 4191800 : ss_256[0] = ss_256[1];
1433 4191800 : ss_256[1] = ss_256[2];
1434 4191800 : ss_256[3] = ss_256[4];
1435 4191800 : ss_256[4] = ss_256[5];
1436 :
1437 4191800 : tt_256[0] = tt_256[1];
1438 4191800 : tt_256[1] = tt_256[2];
1439 4191800 : tt_256[3] = tt_256[4];
1440 4191800 : tt_256[4] = tt_256[5];
1441 4191800 : d += 2 * dst_stride;
1442 4191800 : d8 += 2 * dst8_stride;
1443 4191800 : y -= 2;
1444 4191800 : } while (y);
1445 : }
1446 : else {
1447 : do {
1448 6978040 : s += 2 * src_stride;
1449 6978040 : y_convolve_6tap_32x2_avx2(s,
1450 : src_stride,
1451 : coeffs_256,
1452 : s_256,
1453 : ss_256,
1454 : tt_256,
1455 : r);
1456 6978550 : jnt_avg_round_store_32_avx2(
1457 : r, offset_avg_256, d, d8);
1458 6977890 : jnt_avg_round_store_32_avx2(r + 2,
1459 : offset_avg_256,
1460 6977890 : d + dst_stride,
1461 : d8 + dst8_stride);
1462 :
1463 6977700 : ss_256[0] = ss_256[1];
1464 6977700 : ss_256[1] = ss_256[2];
1465 6977700 : ss_256[3] = ss_256[4];
1466 6977700 : ss_256[4] = ss_256[5];
1467 :
1468 6977700 : tt_256[0] = tt_256[1];
1469 6977700 : tt_256[1] = tt_256[2];
1470 6977700 : tt_256[3] = tt_256[4];
1471 6977700 : tt_256[4] = tt_256[5];
1472 6977700 : d += 2 * dst_stride;
1473 6977700 : d8 += 2 * dst8_stride;
1474 6977700 : y -= 2;
1475 6977700 : } while (y);
1476 : }
1477 : }
1478 : else {
1479 : do {
1480 17348300 : s += 2 * src_stride;
1481 17348300 : y_convolve_6tap_32x2_avx2(s,
1482 : src_stride,
1483 : coeffs_256,
1484 : s_256,
1485 : ss_256,
1486 : tt_256,
1487 : r);
1488 17352000 : jnt_no_avg_round_store_32_avx2(r, offset_no_avg_256, d);
1489 17351000 : jnt_no_avg_round_store_32_avx2(
1490 17351000 : r + 2, offset_no_avg_256, d + dst_stride);
1491 :
1492 17349400 : ss_256[0] = ss_256[1];
1493 17349400 : ss_256[1] = ss_256[2];
1494 17349400 : ss_256[3] = ss_256[4];
1495 17349400 : ss_256[4] = ss_256[5];
1496 :
1497 17349400 : tt_256[0] = tt_256[1];
1498 17349400 : tt_256[1] = tt_256[2];
1499 17349400 : tt_256[3] = tt_256[4];
1500 17349400 : tt_256[4] = tt_256[5];
1501 17349400 : d += 2 * dst_stride;
1502 17349400 : y -= 2;
1503 17349400 : } while (y);
1504 : }
1505 :
1506 2305160 : x += 32;
1507 2305160 : } while (x < w);
1508 : }
1509 : }
1510 7754270 : }
1511 :
1512 3388010 : static void jnt_convolve_y_8tap_avx2(
1513 : const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
1514 : const int32_t dst8_stride, const int32_t w, const int32_t h,
1515 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
1516 : const ConvolveParams *const conv_params) {
1517 3388010 : const uint8_t *src_ptr = src - 3 * src_stride;
1518 3388010 : const int32_t dst_stride = conv_params->dst_stride;
1519 3388010 : const int32_t round_0 = 3;
1520 3388010 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
1521 3388010 : const int32_t bits = FILTER_BITS - round_0;
1522 3388010 : const int32_t bd = 8;
1523 3388010 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
1524 3388010 : const int32_t round_offset =
1525 3388010 : (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
1526 3388010 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
1527 3388010 : const int32_t offset_comp_avg =
1528 3388010 : round_offset * conv_params->bck_offset +
1529 3388010 : (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
1530 3388010 : (round_offset << DIST_PRECISION_BITS);
1531 3388010 : const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
1532 3388010 : (1 << (round_1 - bits - 2)) -
1533 3388010 : (round_offset << (round_1 - bits - 1));
1534 3388010 : const int16_t offset_no_avg =
1535 3388010 : (round_offset << (round_1 - bits - 1)) + (1 << (round_1 - bits - 2));
1536 3388010 : const int32_t factor =
1537 3388010 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
1538 3388010 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
1539 3388010 : const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
1540 6776020 : const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
1541 3388010 : const __m128i factor_128 = _mm_set1_epi32(factor);
1542 3388010 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1543 3388010 : const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
1544 6776020 : const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
1545 3388010 : const __m256i factor_256 = _mm256_set1_epi32(factor);
1546 3388010 : ConvBufType *dst = conv_params->dst;
1547 : int32_t x;
1548 3388010 : int32_t y = h;
1549 : __m128i coeffs_128[4];
1550 : __m256i coeffs_256[4];
1551 :
1552 3388010 : if (w <= 4) {
1553 11132 : prepare_half_coeffs_8tap_ssse3(
1554 : filter_params_y, subpel_y_q4, coeffs_128);
1555 :
1556 11132 : y = h;
1557 :
1558 11132 : if (w == 2) {
1559 : __m128i s_16[8], ss_128[4];
1560 :
1561 0 : s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1562 0 : s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1563 0 : s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1564 0 : s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
1565 0 : s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
1566 0 : s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
1567 0 : s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));
1568 :
1569 0 : const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1570 0 : const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1571 0 : const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1572 0 : const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
1573 0 : const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1574 0 : const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
1575 :
1576 0 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1577 0 : ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1578 0 : ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1579 :
1580 0 : if (conv_params->do_average) {
1581 0 : if (conv_params->use_jnt_comp_avg) {
1582 : do {
1583 0 : const __m128i res = y_convolve_8tap_2x2_ssse3(
1584 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
1585 0 : jnt_comp_avg_round_store_2x2_sse2(res,
1586 : factor_128,
1587 : offset_comp_avg_128,
1588 : dst,
1589 : dst_stride,
1590 : dst8,
1591 : dst8_stride);
1592 0 : ss_128[0] = ss_128[1];
1593 0 : ss_128[1] = ss_128[2];
1594 0 : ss_128[2] = ss_128[3];
1595 0 : src_ptr += 2 * src_stride;
1596 0 : dst += 2 * dst_stride;
1597 0 : dst8 += 2 * dst8_stride;
1598 0 : y -= 2;
1599 0 : } while (y);
1600 : }
1601 : else {
1602 : do {
1603 0 : const __m128i res = y_convolve_8tap_2x2_ssse3(
1604 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
1605 0 : jnt_avg_round_store_2x2_sse2(res,
1606 : offset_avg_128,
1607 : dst,
1608 : dst_stride,
1609 : dst8,
1610 : dst8_stride);
1611 0 : ss_128[0] = ss_128[1];
1612 0 : ss_128[1] = ss_128[2];
1613 0 : ss_128[2] = ss_128[3];
1614 0 : src_ptr += 2 * src_stride;
1615 0 : dst += 2 * dst_stride;
1616 0 : dst8 += 2 * dst8_stride;
1617 0 : y -= 2;
1618 0 : } while (y);
1619 : }
1620 : }
1621 : else {
1622 : do {
1623 0 : const __m128i res = y_convolve_8tap_2x2_ssse3(
1624 : src_ptr, src_stride, coeffs_128, s_16, ss_128);
1625 0 : jnt_no_avg_round_store_2x2_sse2(
1626 : res, offset_no_avg_128, dst, dst_stride);
1627 0 : ss_128[0] = ss_128[1];
1628 0 : ss_128[1] = ss_128[2];
1629 0 : ss_128[2] = ss_128[3];
1630 0 : src_ptr += 2 * src_stride;
1631 0 : dst += 2 * dst_stride;
1632 0 : y -= 2;
1633 0 : } while (y);
1634 : }
1635 : }
1636 : else {
1637 : __m128i s_32[8], ss_128[4];
1638 :
1639 11132 : assert(w == 4);
1640 :
1641 11132 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1642 11132 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1643 11132 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1644 11132 : s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
1645 11132 : s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
1646 11132 : s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
1647 11132 : s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));
1648 :
1649 11132 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1650 11132 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1651 11132 : const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1652 11132 : const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1653 11132 : const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1654 22264 : const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
1655 :
1656 11132 : ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1657 11132 : ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1658 11132 : ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1659 :
1660 11132 : if (conv_params->do_average) {
1661 4458 : if (conv_params->use_jnt_comp_avg) {
1662 : do {
1663 13464 : const __m128i res = y_convolve_8tap_4x2_ssse3(
1664 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
1665 13464 : jnt_comp_avg_round_store_4x2_sse2(res,
1666 : factor_128,
1667 : offset_comp_avg_128,
1668 : dst,
1669 : dst_stride,
1670 : dst8,
1671 : dst8_stride);
1672 13464 : ss_128[0] = ss_128[1];
1673 13464 : ss_128[1] = ss_128[2];
1674 13464 : ss_128[2] = ss_128[3];
1675 13464 : src_ptr += 2 * src_stride;
1676 13464 : dst += 2 * dst_stride;
1677 13464 : dst8 += 2 * dst8_stride;
1678 13464 : y -= 2;
1679 13464 : } while (y);
1680 : }
1681 : else {
1682 : do {
1683 12984 : const __m128i res = y_convolve_8tap_4x2_ssse3(
1684 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
1685 12984 : jnt_avg_round_store_4x2_sse2(res,
1686 : offset_avg_128,
1687 : dst,
1688 : dst_stride,
1689 : dst8,
1690 : dst8_stride);
1691 12984 : ss_128[0] = ss_128[1];
1692 12984 : ss_128[1] = ss_128[2];
1693 12984 : ss_128[2] = ss_128[3];
1694 12984 : src_ptr += 2 * src_stride;
1695 12984 : dst += 2 * dst_stride;
1696 12984 : dst8 += 2 * dst8_stride;
1697 12984 : y -= 2;
1698 12984 : } while (y);
1699 : }
1700 : }
1701 : else {
1702 : do {
1703 39664 : const __m128i res = y_convolve_8tap_4x2_ssse3(
1704 : src_ptr, src_stride, coeffs_128, s_32, ss_128);
1705 39664 : jnt_no_avg_round_store_4x2_sse2(
1706 : res, offset_no_avg_128, dst, dst_stride);
1707 39664 : ss_128[0] = ss_128[1];
1708 39664 : ss_128[1] = ss_128[2];
1709 39664 : ss_128[2] = ss_128[3];
1710 39664 : src_ptr += 2 * src_stride;
1711 39664 : dst += 2 * dst_stride;
1712 39664 : y -= 2;
1713 39664 : } while (y);
1714 : }
1715 : }
1716 : }
1717 : else {
1718 3376880 : prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1719 :
1720 3377070 : if (w == 8) {
1721 : __m128i s_64[8];
1722 : __m256i ss_256[4];
1723 :
1724 1412110 : s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1725 1412110 : s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1726 1412110 : s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1727 1412110 : s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1728 1412110 : s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1729 1412110 : s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
1730 1412110 : s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
1731 :
1732 : // Load lines a and b. Line a to lower 128, line b to upper
1733 : // 128
1734 1412110 : const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1735 1412110 : const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1736 1412110 : const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1737 1412110 : const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1738 1412110 : const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1739 2824220 : const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
1740 :
1741 1412110 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1742 1412110 : ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1743 1412110 : ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1744 :
1745 1412110 : y = h;
1746 :
1747 1412110 : if (conv_params->do_average) {
1748 576251 : if (conv_params->use_jnt_comp_avg) {
1749 : do {
1750 1927600 : const __m256i res = y_convolve_8tap_8x2_avx2(
1751 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
1752 1927600 : jnt_comp_avg_round_store_8x2_avx2(res,
1753 : factor_256,
1754 : offset_comp_avg_256,
1755 : dst,
1756 : dst_stride,
1757 : dst8,
1758 : dst8_stride);
1759 1927580 : ss_256[0] = ss_256[1];
1760 1927580 : ss_256[1] = ss_256[2];
1761 1927580 : ss_256[2] = ss_256[3];
1762 1927580 : src_ptr += 2 * src_stride;
1763 1927580 : dst += 2 * dst_stride;
1764 1927580 : dst8 += 2 * dst8_stride;
1765 1927580 : y -= 2;
1766 1927580 : } while (y);
1767 : }
1768 : else {
1769 : do {
1770 2456560 : const __m256i res = y_convolve_8tap_8x2_avx2(
1771 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
1772 2456570 : jnt_avg_round_store_8x2_sse2(res,
1773 : offset_avg_256,
1774 : dst,
1775 : dst_stride,
1776 : dst8,
1777 : dst8_stride);
1778 2456550 : ss_256[0] = ss_256[1];
1779 2456550 : ss_256[1] = ss_256[2];
1780 2456550 : ss_256[2] = ss_256[3];
1781 2456550 : src_ptr += 2 * src_stride;
1782 2456550 : dst += 2 * dst_stride;
1783 2456550 : dst8 += 2 * dst8_stride;
1784 2456550 : y -= 2;
1785 2456550 : } while (y);
1786 : }
1787 : }
1788 : else {
1789 : do {
1790 6360280 : const __m256i res = y_convolve_8tap_8x2_avx2(
1791 : src_ptr, src_stride, coeffs_256, s_64, ss_256);
1792 6360390 : jnt_no_avg_round_store_8x2_avx2(
1793 : res, offset_no_avg_256, dst, dst_stride);
1794 6360300 : ss_256[0] = ss_256[1];
1795 6360300 : ss_256[1] = ss_256[2];
1796 6360300 : ss_256[2] = ss_256[3];
1797 6360300 : src_ptr += 2 * src_stride;
1798 6360300 : dst += 2 * dst_stride;
1799 6360300 : y -= 2;
1800 6360300 : } while (y);
1801 : }
1802 : }
1803 1964960 : else if (w == 16) {
1804 : __m128i s_128[8];
1805 : __m256i ss_256[8], r[2];
1806 :
1807 1054100 : s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1808 1054100 : s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1809 1054100 : s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1810 1054100 : s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1811 1054100 : s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1812 1054100 : s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
1813 1054100 : s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
1814 :
1815 : // Load lines a and b. Line a to lower 128, line b to upper
1816 : // 128
1817 1054100 : const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1818 1054100 : const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1819 1054100 : const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1820 1054100 : const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1821 1054100 : const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1822 2108190 : const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
1823 :
1824 1054100 : ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1825 1054100 : ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1826 1054100 : ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1827 :
1828 1054100 : ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
1829 1054100 : ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
1830 1054100 : ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
1831 :
1832 1054100 : y = h;
1833 :
1834 1054100 : if (conv_params->do_average) {
1835 424222 : if (conv_params->use_jnt_comp_avg) {
1836 : do {
1837 1742750 : y_convolve_8tap_16x2_avx2(
1838 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1839 : jnt_comp_avg_round_store_16x2_avx2(r,
1840 : factor_256,
1841 : offset_comp_avg_256,
1842 : dst,
1843 : dst_stride,
1844 : dst8,
1845 : dst8_stride);
1846 1742760 : ss_256[0] = ss_256[1];
1847 1742760 : ss_256[1] = ss_256[2];
1848 1742760 : ss_256[2] = ss_256[3];
1849 1742760 : ss_256[4] = ss_256[5];
1850 1742760 : ss_256[5] = ss_256[6];
1851 1742760 : ss_256[6] = ss_256[7];
1852 1742760 : src_ptr += 2 * src_stride;
1853 1742760 : dst += 2 * dst_stride;
1854 1742760 : dst8 += 2 * dst8_stride;
1855 1742760 : y -= 2;
1856 1742760 : } while (y);
1857 : }
1858 : else {
1859 : do {
1860 2625770 : y_convolve_8tap_16x2_avx2(
1861 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1862 2625800 : jnt_avg_round_store_16x2_avx2(r,
1863 : offset_avg_256,
1864 : dst,
1865 : dst_stride,
1866 : dst8,
1867 : dst8_stride);
1868 2625760 : ss_256[0] = ss_256[1];
1869 2625760 : ss_256[1] = ss_256[2];
1870 2625760 : ss_256[2] = ss_256[3];
1871 2625760 : ss_256[4] = ss_256[5];
1872 2625760 : ss_256[5] = ss_256[6];
1873 2625760 : ss_256[6] = ss_256[7];
1874 2625760 : src_ptr += 2 * src_stride;
1875 2625760 : dst += 2 * dst_stride;
1876 2625760 : dst8 += 2 * dst8_stride;
1877 2625760 : y -= 2;
1878 2625760 : } while (y);
1879 : }
1880 : }
1881 : else {
1882 : do {
1883 6517960 : y_convolve_8tap_16x2_avx2(
1884 : src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1885 6518300 : jnt_no_avg_round_store_16x2_avx2(
1886 : r, offset_no_avg_256, dst, dst_stride);
1887 6517960 : ss_256[0] = ss_256[1];
1888 6517960 : ss_256[1] = ss_256[2];
1889 6517960 : ss_256[2] = ss_256[3];
1890 6517960 : ss_256[4] = ss_256[5];
1891 6517960 : ss_256[5] = ss_256[6];
1892 6517960 : ss_256[6] = ss_256[7];
1893 6517960 : src_ptr += 2 * src_stride;
1894 6517960 : dst += 2 * dst_stride;
1895 6517960 : y -= 2;
1896 6517960 : } while (y);
1897 : }
1898 : }
1899 : else {
1900 : __m256i s_256[8], ss_256[8], tt_256[8], r[4];
1901 :
1902 910864 : assert(!(w % 32));
1903 :
1904 910864 : x = 0;
1905 : do {
1906 1142520 : const uint8_t *s = src_ptr + x;
1907 1142520 : ConvBufType *d = dst + x;
1908 1142520 : uint8_t *d8 = dst8 + x;
1909 :
1910 1142520 : s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1911 1142520 : s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1912 1142520 : s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1913 1142520 : s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1914 1142520 : s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1915 1142520 : s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
1916 1142520 : s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
1917 :
1918 1142520 : ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1919 1142520 : ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1920 1142520 : ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1921 1142520 : ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1922 1142520 : ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1923 1142520 : ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1924 :
1925 1142520 : tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
1926 1142520 : tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
1927 1142520 : tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
1928 1142520 : tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
1929 1142520 : tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
1930 1142520 : tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
1931 :
1932 1142520 : y = h;
1933 :
1934 1142520 : if (conv_params->do_average) {
1935 460332 : if (conv_params->use_jnt_comp_avg) {
1936 : do {
1937 2229770 : y_convolve_8tap_32x2_avx2(s,
1938 : src_stride,
1939 : coeffs_256,
1940 : s_256,
1941 : ss_256,
1942 : tt_256,
1943 : r);
1944 : jnt_comp_avg_round_store_32_avx2(
1945 : r, factor_256, offset_comp_avg_256, d, d8);
1946 2229750 : jnt_comp_avg_round_store_32_avx2(
1947 : r + 2,
1948 : factor_256,
1949 : offset_comp_avg_256,
1950 2229750 : d + dst_stride,
1951 : d8 + dst8_stride);
1952 :
1953 2229770 : ss_256[0] = ss_256[1];
1954 2229770 : ss_256[1] = ss_256[2];
1955 2229770 : ss_256[2] = ss_256[3];
1956 2229770 : ss_256[4] = ss_256[5];
1957 2229770 : ss_256[5] = ss_256[6];
1958 2229770 : ss_256[6] = ss_256[7];
1959 :
1960 2229770 : tt_256[0] = tt_256[1];
1961 2229770 : tt_256[1] = tt_256[2];
1962 2229770 : tt_256[2] = tt_256[3];
1963 2229770 : tt_256[4] = tt_256[5];
1964 2229770 : tt_256[5] = tt_256[6];
1965 2229770 : tt_256[6] = tt_256[7];
1966 2229770 : s += 2 * src_stride;
1967 2229770 : d += 2 * dst_stride;
1968 2229770 : d8 += 2 * dst8_stride;
1969 2229770 : y -= 2;
1970 2229770 : } while (y);
1971 : }
1972 : else {
1973 : do {
1974 3744240 : y_convolve_8tap_32x2_avx2(s,
1975 : src_stride,
1976 : coeffs_256,
1977 : s_256,
1978 : ss_256,
1979 : tt_256,
1980 : r);
1981 3744290 : jnt_avg_round_store_32_avx2(
1982 : r, offset_avg_256, d, d8);
1983 3744170 : jnt_avg_round_store_32_avx2(r + 2,
1984 : offset_avg_256,
1985 3744170 : d + dst_stride,
1986 : d8 + dst8_stride);
1987 :
1988 3744130 : ss_256[0] = ss_256[1];
1989 3744130 : ss_256[1] = ss_256[2];
1990 3744130 : ss_256[2] = ss_256[3];
1991 3744130 : ss_256[4] = ss_256[5];
1992 3744130 : ss_256[5] = ss_256[6];
1993 3744130 : ss_256[6] = ss_256[7];
1994 :
1995 3744130 : tt_256[0] = tt_256[1];
1996 3744130 : tt_256[1] = tt_256[2];
1997 3744130 : tt_256[2] = tt_256[3];
1998 3744130 : tt_256[4] = tt_256[5];
1999 3744130 : tt_256[5] = tt_256[6];
2000 3744130 : tt_256[6] = tt_256[7];
2001 3744130 : s += 2 * src_stride;
2002 3744130 : d += 2 * dst_stride;
2003 3744130 : d8 += 2 * dst8_stride;
2004 3744130 : y -= 2;
2005 3744130 : } while (y);
2006 : }
2007 : }
2008 : else {
2009 : do {
2010 8907200 : y_convolve_8tap_32x2_avx2(s,
2011 : src_stride,
2012 : coeffs_256,
2013 : s_256,
2014 : ss_256,
2015 : tt_256,
2016 : r);
2017 8908280 : jnt_no_avg_round_store_32_avx2(r, offset_no_avg_256, d);
2018 8907840 : jnt_no_avg_round_store_32_avx2(
2019 8907840 : r + 2, offset_no_avg_256, d + dst_stride);
2020 :
2021 8907480 : ss_256[0] = ss_256[1];
2022 8907480 : ss_256[1] = ss_256[2];
2023 8907480 : ss_256[2] = ss_256[3];
2024 8907480 : ss_256[4] = ss_256[5];
2025 8907480 : ss_256[5] = ss_256[6];
2026 8907480 : ss_256[6] = ss_256[7];
2027 :
2028 8907480 : tt_256[0] = tt_256[1];
2029 8907480 : tt_256[1] = tt_256[2];
2030 8907480 : tt_256[2] = tt_256[3];
2031 8907480 : tt_256[4] = tt_256[5];
2032 8907480 : tt_256[5] = tt_256[6];
2033 8907480 : tt_256[6] = tt_256[7];
2034 8907480 : s += 2 * src_stride;
2035 8907480 : d += 2 * dst_stride;
2036 8907480 : y -= 2;
2037 8907480 : } while (y);
2038 : }
2039 :
2040 1142700 : x += 32;
2041 1142700 : } while (x < w);
2042 : }
2043 : }
2044 3388370 : }
2045 :
2046 : typedef void(*jnt_convolve_y_tap_func)(
2047 : const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
2048 : const int32_t dst8_stride, const int32_t w, const int32_t h,
2049 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
2050 : const ConvolveParams *const conv_params);
2051 :
2052 31524800 : void eb_av1_jnt_convolve_y_avx2(const uint8_t *src, int32_t src_stride,
2053 : uint8_t *dst8, int32_t dst8_stride, int32_t w,
2054 : int32_t h, InterpFilterParams *filter_params_x,
2055 : InterpFilterParams *filter_params_y,
2056 : const int32_t subpel_x_q4,
2057 : const int32_t subpel_y_q4,
2058 : ConvolveParams *conv_params) {
2059 : static const jnt_convolve_y_tap_func
2060 : jnt_convolve_y_tap_func_table[MAX_FILTER_TAP + 1] = {
2061 : NULL,
2062 : NULL,
2063 : jnt_convolve_y_2tap_avx2,
2064 : NULL,
2065 : jnt_convolve_y_4tap_avx2,
2066 : NULL,
2067 : jnt_convolve_y_6tap_avx2,
2068 : NULL,
2069 : jnt_convolve_y_8tap_avx2 };
2070 31524800 : const int32_t tap_y = get_convolve_tap(filter_params_y->filter_ptr);
2071 :
2072 : (void)filter_params_x;
2073 : (void)subpel_x_q4;
2074 :
2075 31521300 : assert(conv_params->round_0 == 3);
2076 31522000 : assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);
2077 :
2078 31522000 : jnt_convolve_y_tap_func_table[tap_y](src,
2079 : src_stride,
2080 : dst8,
2081 : dst8_stride,
2082 : w,
2083 : h,
2084 : filter_params_y,
2085 : subpel_y_q4,
2086 : conv_params);
2087 31544900 : }
2088 :
2089 : // =============================================================================
2090 :
2091 12106500 : static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
2092 12106500 : const int32_t w0 = conv_params->fwd_offset;
2093 12106500 : const int32_t w1 = conv_params->bck_offset;
2094 12106500 : const __m256i wt0 = _mm256_set1_epi16(w0);
2095 24213100 : const __m256i wt1 = _mm256_set1_epi16(w1);
2096 12106500 : const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
2097 12106500 : return wt;
2098 : }
2099 :
2100 11208500 : static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
2101 44834200 : return _mm256_permute2x128_si256(
2102 : _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)),
2103 : _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
2104 : }
2105 :
2106 12107200 : void eb_av1_jnt_convolve_2d_copy_avx2(
2107 : const uint8_t *src, int32_t src_stride, uint8_t *dst0, int32_t dst_stride0,
2108 : int32_t w, int32_t h, InterpFilterParams *filter_params_x,
2109 : InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
2110 : const int32_t subpel_y_q4, ConvolveParams *conv_params) {
2111 12107200 : const int32_t bd = 8;
2112 12107200 : ConvBufType *dst = conv_params->dst;
2113 12107200 : int32_t dst_stride = conv_params->dst_stride;
2114 : (void)filter_params_x;
2115 : (void)filter_params_y;
2116 : (void)subpel_x_q4;
2117 : (void)subpel_y_q4;
2118 :
2119 12107200 : const int32_t bits =
2120 12107200 : FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
2121 12107200 : const __m128i left_shift = _mm_cvtsi32_si128(bits);
2122 12107200 : const int32_t do_average = conv_params->do_average;
2123 12107200 : const int32_t use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
2124 12107200 : const __m256i wt = unpack_weights_avx2(conv_params);
2125 12109500 : const __m256i zero = _mm256_setzero_si256();
2126 :
2127 12109500 : const int32_t offset_0 =
2128 12109500 : bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
2129 12109500 : const int32_t offset = (1 << offset_0) + (1 << (offset_0 - 1));
2130 12109500 : const __m256i offset_const = _mm256_set1_epi16(offset);
2131 12109500 : const int32_t rounding_shift =
2132 12109500 : 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
2133 12109500 : const __m256i rounding_const =
2134 12109500 : _mm256_set1_epi16((1 << rounding_shift) >> 1);
2135 : int32_t i, j;
2136 :
2137 12109500 : if (!(w % 16)) {
2138 183824000 : for (i = 0; i < h; i += 1) {
2139 507682000 : for (j = 0; j < w; j += 16) {
2140 662560000 : const __m256i src_16bit = _mm256_cvtepu8_epi16(
2141 331280000 : _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])));
2142 :
2143 331280000 : const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
2144 331280000 : const __m256i res_unsigned =
2145 331280000 : _mm256_add_epi16(res, offset_const);
2146 :
2147 331280000 : if (do_average) {
2148 117196000 : const __m256i data_ref_0 = _mm256_loadu_si256(
2149 117196000 : (__m256i *)(&dst[i * dst_stride + j]));
2150 :
2151 117196000 : const __m256i comp_avg_res = comp_avg(
2152 : &data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
2153 :
2154 : const __m256i round_result =
2155 117185000 : convolve_rounding(&comp_avg_res,
2156 : &offset_const,
2157 : &rounding_const,
2158 : rounding_shift);
2159 :
2160 : const __m256i res_8 =
2161 117145000 : _mm256_packus_epi16(round_result, round_result);
2162 117145000 : const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8);
2163 :
2164 117145000 : _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
2165 : _mm256_castsi256_si128(res_0));
2166 : }
2167 : else {
2168 214084000 : _mm256_storeu_si256((__m256i *)(&dst[i * dst_stride + j]),
2169 : res_unsigned);
2170 : }
2171 : }
2172 : }
2173 : }
2174 4688120 : else if (!(w % 4)) {
2175 38177000 : for (i = 0; i < h; i += 2) {
2176 66975300 : for (j = 0; j < w; j += 8) {
2177 : const __m128i src_row_0 =
2178 33488200 : _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
2179 33488200 : const __m128i src_row_1 = _mm_loadl_epi64(
2180 33488200 : (__m128i *)(&src[i * src_stride + j + src_stride]));
2181 : // since not all compilers yet support _mm256_set_m128i()
2182 66976400 : const __m256i src_10 = _mm256_insertf128_si256(
2183 : _mm256_castsi128_si256(src_row_0), src_row_1, 1);
2184 :
2185 33488200 : const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero);
2186 :
2187 33488200 : const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
2188 :
2189 33488200 : const __m256i res_unsigned =
2190 33488200 : _mm256_add_epi16(res, offset_const);
2191 :
2192 : // Accumulate values into the destination buffer
2193 33488200 : if (do_average) {
2194 11208600 : const __m256i data_ref_0 =
2195 11208400 : load_line2_avx2(&dst[i * dst_stride + j],
2196 11208400 : &dst[i * dst_stride + j + dst_stride]);
2197 :
2198 11208600 : const __m256i comp_avg_res = comp_avg(
2199 : &data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
2200 :
2201 : const __m256i round_result =
2202 11208100 : convolve_rounding(&comp_avg_res,
2203 : &offset_const,
2204 : &rounding_const,
2205 : rounding_shift);
2206 :
2207 : const __m256i res_8 =
2208 11207600 : _mm256_packus_epi16(round_result, round_result);
2209 11207600 : const __m128i res_0 = _mm256_castsi256_si128(res_8);
2210 11207600 : const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
2211 :
2212 11207600 : if (w > 4) {
2213 11181500 : _mm_storel_epi64(
2214 11181500 : (__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
2215 11181500 : _mm_storel_epi64(
2216 : (__m128i *)((
2217 11181500 : &dst0[i * dst_stride0 + j + dst_stride0])),
2218 : res_1);
2219 : }
2220 : else {
2221 26189 : *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
2222 26189 : _mm_cvtsi128_si32(res_0);
2223 : *(uint32_t
2224 26189 : *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
2225 26189 : _mm_cvtsi128_si32(res_1);
2226 : }
2227 : }
2228 : else {
2229 22279800 : const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
2230 22279800 : _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]),
2231 : res_0);
2232 :
2233 : const __m128i res_1 =
2234 22279800 : _mm256_extracti128_si256(res_unsigned, 1);
2235 : _mm_storeu_si128(
2236 22279800 : (__m128i *)(&dst[i * dst_stride + j + dst_stride]),
2237 : res_1);
2238 : }
2239 : }
2240 : }
2241 : }
2242 12057600 : }
2243 :
2244 : // =============================================================================
2245 :
2246 : SIMD_INLINE void jnt_x_comp_avg_2tap_32_avx2(
2247 : const uint8_t *const src, const __m256i *const coeffs, const __m256i factor,
2248 : const __m256i offset, ConvBufType *const dst, uint8_t *const dst8) {
2249 : __m256i r[2];
2250 :
2251 17989200 : x_convolve_2tap_32_avx2(src, coeffs, r);
2252 : jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
2253 23975500 : }
2254 :
2255 23960600 : static INLINE void jnt_x_avg_2tap_32_avx2(const uint8_t *const src,
2256 : const __m256i *const coeffs,
2257 : const __m256i offset,
2258 : const ConvBufType *const dst,
2259 : uint8_t *const dst8) {
2260 : __m256i r[2];
2261 :
2262 23960600 : x_convolve_2tap_32_avx2(src, coeffs, r);
2263 23961800 : jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
2264 23959800 : }
2265 :
2266 109224000 : static INLINE void jnt_x_no_avg_2tap_32_avx2(const uint8_t *const src,
2267 : const __m256i *const coeffs,
2268 : const __m256i offset,
2269 : ConvBufType *const dst) {
2270 : __m256i r[2];
2271 :
2272 109224000 : x_convolve_2tap_32_avx2(src, coeffs, r);
2273 109219000 : jnt_no_avg_round_store_32_avx2(r, offset, dst);
2274 109217000 : }
2275 :
2276 : SIMD_INLINE void jnt_x_comp_avg_6tap_16x2_avx2(
2277 : const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
2278 : const __m256i *const filt, const __m256i factor, const __m256i offset,
2279 : ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
2280 : const int32_t dst8_stride) {
2281 : __m256i r[2];
2282 :
2283 5716750 : x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2284 : jnt_comp_avg_round_store_16x2_avx2(
2285 : r, factor, offset, dst, dst_stride, dst8, dst8_stride);
2286 7573690 : }
2287 :
2288 : SIMD_INLINE void jnt_x_avg_6tap_16x2_avx2(
2289 : const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
2290 : const __m256i *const filt, const __m256i offset, ConvBufType *const dst,
2291 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
2292 : __m256i r[2];
2293 :
2294 8824300 : x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2295 12040100 : jnt_avg_round_store_16x2_avx2(
2296 : r, offset, dst, dst_stride, dst8, dst8_stride);
2297 12039900 : }
2298 :
2299 : SIMD_INLINE void jnt_x_no_avg_6tap_16x2_avx2(
2300 : const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
2301 : const __m256i *const filt, const __m256i offset, ConvBufType *const dst,
2302 : const int32_t dst_stride) {
2303 : __m256i r[2];
2304 :
2305 23049400 : x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2306 31148500 : jnt_no_avg_round_store_16x2_avx2(r, offset, dst, dst_stride);
2307 31147000 : }
2308 :
2309 4622440 : static INLINE void jnt_x_comp_avg_6tap_32_avx2(
2310 : const uint8_t *const src, const __m256i coeffs[3],
2311 : const __m256i *const filt, const __m256i factor, const __m256i offset,
2312 : ConvBufType *const dst, uint8_t *const dst8) {
2313 : __m256i r[2];
2314 :
2315 4622440 : x_convolve_6tap_16x2_avx2(src, 16, coeffs, filt, r);
2316 : jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
2317 4622420 : }
2318 :
2319 7053250 : static INLINE void jnt_x_avg_6tap_32_avx2(const uint8_t *const src,
2320 : const __m256i coeffs[3],
2321 : const __m256i *const filt,
2322 : const __m256i offset,
2323 : ConvBufType *const dst,
2324 : uint8_t *const dst8) {
2325 : __m256i r[2];
2326 :
2327 7053250 : x_convolve_6tap_16x2_avx2(src, 16, coeffs, filt, r);
2328 7053590 : jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
2329 7053190 : }
2330 :
2331 19119200 : static INLINE void jnt_x_no_avg_6tap_32_avx2(const uint8_t *const src,
2332 : const __m256i coeffs[3],
2333 : const __m256i *const filt,
2334 : const __m256i offset,
2335 : ConvBufType *const dst) {
2336 : __m256i r[2];
2337 :
2338 19119200 : x_convolve_6tap_16x2_avx2(src, 16, coeffs, filt, r);
2339 19120300 : jnt_no_avg_round_store_32_avx2(r, offset, dst);
2340 19119500 : }
2341 :
2342 1690010 : static INLINE void jnt_x_comp_avg_8tap_16x2_avx2(
2343 : const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
2344 : const __m256i *const filt, const __m256i factor, const __m256i offset,
2345 : ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
2346 : const int32_t dst8_stride) {
2347 : __m256i r[2];
2348 :
2349 : x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2350 : jnt_comp_avg_round_store_16x2_avx2(
2351 : r, factor, offset, dst, dst_stride, dst8, dst8_stride);
2352 1690020 : }
2353 :
2354 2376430 : static INLINE void jnt_x_avg_8tap_16x2_avx2(
2355 : const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
2356 : const __m256i *const filt, const __m256i offset, ConvBufType *const dst,
2357 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
2358 : __m256i r[2];
2359 :
2360 : x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2361 2376440 : jnt_avg_round_store_16x2_avx2(
2362 : r, offset, dst, dst_stride, dst8, dst8_stride);
2363 2376460 : }
2364 :
2365 6335660 : static INLINE void jnt_x_no_avg_8tap_16x2_avx2(
2366 : const uint8_t *const src, const int32_t src_stride, const __m256i coeffs[3],
2367 : const __m256i *const filt, const __m256i offset, ConvBufType *const dst,
2368 : const int32_t dst_stride) {
2369 : __m256i r[2];
2370 :
2371 : x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2372 6335660 : jnt_no_avg_round_store_16x2_avx2(r, offset, dst, dst_stride);
2373 6335540 : }
2374 :
2375 : SIMD_INLINE void jnt_x_comp_avg_8tap_32_avx2(
2376 : const uint8_t *const src, const __m256i coeffs[4],
2377 : const __m256i *const filt, const __m256i factor, const __m256i offset,
2378 : ConvBufType *const dst, uint8_t *const dst8) {
2379 : __m256i r[2];
2380 :
2381 : x_convolve_8tap_16x2_avx2(src, 16, coeffs, filt, r);
2382 : jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
2383 3960570 : }
2384 :
2385 : SIMD_INLINE void jnt_x_avg_8tap_32_avx2(const uint8_t *const src,
2386 : const __m256i coeffs[4],
2387 : const __m256i *const filt,
2388 : const __m256i offset,
2389 : ConvBufType *const dst,
2390 : uint8_t *const dst8) {
2391 : __m256i r[2];
2392 :
2393 : x_convolve_8tap_16x2_avx2(src, 16, coeffs, filt, r);
2394 6115990 : jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
2395 6115870 : }
2396 :
2397 : SIMD_INLINE void jnt_x_no_avg_8tap_32_avx2(const uint8_t *const src,
2398 : const __m256i coeffs[4],
2399 : const __m256i *const filt,
2400 : const __m256i offset,
2401 : ConvBufType *const dst) {
2402 : __m256i r[2];
2403 :
2404 : x_convolve_8tap_16x2_avx2(src, 16, coeffs, filt, r);
2405 16299200 : jnt_no_avg_round_store_32_avx2(r, offset, dst);
2406 16299100 : }
2407 :
2408 20591400 : static void jnt_convolve_x_2tap_avx2(
2409 : const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
2410 : const int32_t dst8_stride, const int32_t w, const int32_t h,
2411 : const InterpFilterParams *const filter_params_x, const int32_t subpel_x_q4,
2412 : const ConvolveParams *const conv_params) {
2413 20591400 : const uint8_t *src_ptr = src;
2414 20591400 : const int32_t dst_stride = conv_params->dst_stride;
2415 20591400 : const int32_t round_0 = 3;
2416 20591400 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
2417 20591400 : const int32_t bits = FILTER_BITS - round_1;
2418 20591400 : const int32_t bd = 8;
2419 20591400 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
2420 20591400 : const int32_t round_offset =
2421 20591400 : (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
2422 20591400 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
2423 20591400 : const int32_t offset_comp_avg =
2424 20591400 : round_offset * conv_params->bck_offset +
2425 20591400 : (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
2426 20591400 : (round_offset << DIST_PRECISION_BITS);
2427 20591400 : const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
2428 20591400 : (1 << (round_0 - bits - 2)) -
2429 20591400 : (round_offset << (round_0 - bits - 1));
2430 20591400 : const int16_t offset_no_avg =
2431 20591400 : (round_offset << (round_0 - bits - 1)) + (1 << (round_0 - bits - 2));
2432 20591400 : const int32_t factor =
2433 20591400 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
2434 20591400 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
2435 20591400 : const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
2436 41182700 : const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
2437 20591400 : const __m128i factor_128 = _mm_set1_epi32(factor);
2438 20591400 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
2439 20591400 : const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
2440 41182700 : const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
2441 20591400 : const __m256i factor_256 = _mm256_set1_epi32(factor);
2442 20591400 : ConvBufType *dst = conv_params->dst;
2443 20591400 : int32_t y = h;
2444 : __m128i coeffs_128[4];
2445 : __m256i coeffs_256[4];
2446 :
2447 20591400 : if (w <= 4) {
2448 0 : prepare_half_coeffs_2tap_ssse3(
2449 : filter_params_x, subpel_x_q4, coeffs_128);
2450 :
2451 0 : if (w == 2) {
2452 0 : if (conv_params->do_average) {
2453 0 : if (conv_params->use_jnt_comp_avg) {
2454 : do {
2455 0 : const __m128i res = x_convolve_2tap_2x2_sse4_1(
2456 : src_ptr, src_stride, coeffs_128);
2457 0 : jnt_comp_avg_round_store_2x2_sse2(res,
2458 : factor_128,
2459 : offset_comp_avg_128,
2460 : dst,
2461 : dst_stride,
2462 : dst8,
2463 : dst8_stride);
2464 0 : src_ptr += 2 * src_stride;
2465 0 : dst += 2 * dst_stride;
2466 0 : dst8 += 2 * dst8_stride;
2467 0 : y -= 2;
2468 0 : } while (y);
2469 : }
2470 : else {
2471 : do {
2472 0 : const __m128i res = x_convolve_2tap_2x2_sse4_1(
2473 : src_ptr, src_stride, coeffs_128);
2474 0 : jnt_avg_round_store_2x2_sse2(res,
2475 : offset_avg_128,
2476 : dst,
2477 : dst_stride,
2478 : dst8,
2479 : dst8_stride);
2480 0 : src_ptr += 2 * src_stride;
2481 0 : dst += 2 * dst_stride;
2482 0 : dst8 += 2 * dst8_stride;
2483 0 : y -= 2;
2484 0 : } while (y);
2485 : }
2486 : }
2487 : else {
2488 : do {
2489 0 : const __m128i res = x_convolve_2tap_2x2_sse4_1(
2490 : src_ptr, src_stride, coeffs_128);
2491 0 : jnt_no_avg_round_store_2x2_sse2(
2492 : res, offset_no_avg_128, dst, dst_stride);
2493 0 : src_ptr += 2 * src_stride;
2494 0 : dst += 2 * dst_stride;
2495 0 : y -= 2;
2496 0 : } while (y);
2497 : }
2498 : }
2499 0 : else if (w == 4) {
2500 0 : assert(w == 4);
2501 :
2502 0 : if (conv_params->do_average) {
2503 0 : if (conv_params->use_jnt_comp_avg) {
2504 : do {
2505 0 : const __m128i res = x_convolve_2tap_4x2_ssse3(
2506 : src_ptr, src_stride, coeffs_128);
2507 0 : jnt_comp_avg_round_store_4x2_sse2(res,
2508 : factor_128,
2509 : offset_comp_avg_128,
2510 : dst,
2511 : dst_stride,
2512 : dst8,
2513 : dst8_stride);
2514 0 : src_ptr += 2 * src_stride;
2515 0 : dst += 2 * dst_stride;
2516 0 : dst8 += 2 * dst8_stride;
2517 0 : y -= 2;
2518 0 : } while (y);
2519 : }
2520 : else {
2521 : do {
2522 0 : const __m128i res = x_convolve_2tap_4x2_ssse3(
2523 : src_ptr, src_stride, coeffs_128);
2524 0 : jnt_avg_round_store_4x2_sse2(res,
2525 : offset_avg_128,
2526 : dst,
2527 : dst_stride,
2528 : dst8,
2529 : dst8_stride);
2530 0 : src_ptr += 2 * src_stride;
2531 0 : dst += 2 * dst_stride;
2532 0 : dst8 += 2 * dst8_stride;
2533 0 : y -= 2;
2534 0 : } while (y);
2535 : }
2536 : }
2537 : else {
2538 : do {
2539 0 : const __m128i res = x_convolve_2tap_4x2_ssse3(
2540 : src_ptr, src_stride, coeffs_128);
2541 0 : jnt_no_avg_round_store_4x2_sse2(
2542 : res, offset_no_avg_128, dst, dst_stride);
2543 0 : src_ptr += 2 * src_stride;
2544 0 : dst += 2 * dst_stride;
2545 0 : y -= 2;
2546 0 : } while (y);
2547 : }
2548 : }
2549 : else {
2550 : }
2551 : }
2552 : else {
2553 : __m256i r[2];
2554 :
2555 20591400 : prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
2556 :
2557 20600800 : if (w == 8) {
2558 8883610 : if (conv_params->do_average) {
2559 2415100 : if (conv_params->use_jnt_comp_avg) {
2560 : do {
2561 8673410 : const __m256i res = x_convolve_2tap_8x2_avx2(
2562 : src_ptr, src_stride, coeffs_256);
2563 8673080 : jnt_comp_avg_round_store_8x2_avx2(res,
2564 : factor_256,
2565 : offset_comp_avg_256,
2566 : dst,
2567 : dst_stride,
2568 : dst8,
2569 : dst8_stride);
2570 8673030 : src_ptr += 2 * src_stride;
2571 8673030 : dst += 2 * dst_stride;
2572 8673030 : dst8 += 2 * dst8_stride;
2573 8673030 : y -= 2;
2574 8673030 : } while (y);
2575 : }
2576 : else {
2577 : do {
2578 8673460 : const __m256i res = x_convolve_2tap_8x2_avx2(
2579 : src_ptr, src_stride, coeffs_256);
2580 8673230 : jnt_avg_round_store_8x2_sse2(res,
2581 : offset_avg_256,
2582 : dst,
2583 : dst_stride,
2584 : dst8,
2585 : dst8_stride);
2586 8673150 : src_ptr += 2 * src_stride;
2587 8673150 : dst += 2 * dst_stride;
2588 8673150 : dst8 += 2 * dst8_stride;
2589 8673150 : y -= 2;
2590 8673150 : } while (y);
2591 : }
2592 : }
2593 : else {
2594 : do {
2595 44150100 : const __m256i res = x_convolve_2tap_8x2_avx2(
2596 : src_ptr, src_stride, coeffs_256);
2597 44150900 : jnt_no_avg_round_store_8x2_avx2(
2598 : res, offset_no_avg_256, dst, dst_stride);
2599 44150100 : src_ptr += 2 * src_stride;
2600 44150100 : dst += 2 * dst_stride;
2601 44150100 : y -= 2;
2602 44150100 : } while (y);
2603 : }
2604 : }
2605 11717200 : else if (w == 16) {
2606 7093380 : if (conv_params->do_average) {
2607 1897170 : if (conv_params->use_jnt_comp_avg) {
2608 : do {
2609 9339940 : x_convolve_2tap_16x2_avx2(
2610 : src_ptr, src_stride, coeffs_256, r);
2611 : jnt_comp_avg_round_store_16x2_avx2(r,
2612 : factor_256,
2613 : offset_comp_avg_256,
2614 : dst,
2615 : dst_stride,
2616 : dst8,
2617 : dst8_stride);
2618 9339590 : src_ptr += 2 * src_stride;
2619 9339590 : dst += 2 * dst_stride;
2620 9339590 : dst8 += 2 * dst8_stride;
2621 9339590 : y -= 2;
2622 9339590 : } while (y);
2623 : }
2624 : else {
2625 : do {
2626 9339930 : x_convolve_2tap_16x2_avx2(
2627 : src_ptr, src_stride, coeffs_256, r);
2628 9339370 : jnt_avg_round_store_16x2_avx2(r,
2629 : offset_avg_256,
2630 : dst,
2631 : dst_stride,
2632 : dst8,
2633 : dst8_stride);
2634 9339620 : src_ptr += 2 * src_stride;
2635 9339620 : dst += 2 * dst_stride;
2636 9339620 : dst8 += 2 * dst8_stride;
2637 9339620 : y -= 2;
2638 9339620 : } while (y);
2639 : }
2640 : }
2641 : else {
2642 : do {
2643 48314700 : x_convolve_2tap_16x2_avx2(
2644 : src_ptr, src_stride, coeffs_256, r);
2645 48310900 : jnt_no_avg_round_store_16x2_avx2(
2646 : r, offset_no_avg_256, dst, dst_stride);
2647 48304900 : src_ptr += 2 * src_stride;
2648 48304900 : dst += 2 * dst_stride;
2649 48304900 : y -= 2;
2650 48304900 : } while (y);
2651 : }
2652 : }
2653 4623800 : else if (w == 32) {
2654 3565750 : if (conv_params->do_average) {
2655 1038530 : if (conv_params->use_jnt_comp_avg) {
2656 : do {
2657 : jnt_x_comp_avg_2tap_32_avx2(src_ptr,
2658 : coeffs_256,
2659 : factor_256,
2660 : offset_comp_avg_256,
2661 : dst,
2662 : dst8);
2663 12002100 : src_ptr += src_stride;
2664 12002100 : dst += dst_stride;
2665 12002100 : dst8 += dst8_stride;
2666 12002100 : } while (--y);
2667 : }
2668 : else {
2669 : do {
2670 12001300 : jnt_x_avg_2tap_32_avx2(
2671 : src_ptr, coeffs_256, offset_avg_256, dst, dst8);
2672 12000900 : src_ptr += src_stride;
2673 12000900 : dst += dst_stride;
2674 12000900 : dst8 += dst8_stride;
2675 12000900 : } while (--y);
2676 : }
2677 : }
2678 : else {
2679 : do {
2680 59306200 : jnt_x_no_avg_2tap_32_avx2(
2681 : src_ptr, coeffs_256, offset_no_avg_256, dst);
2682 59299300 : src_ptr += src_stride;
2683 59299300 : dst += dst_stride;
2684 59299300 : } while (--y);
2685 : }
2686 : }
2687 1058050 : else if (w == 64) {
2688 1058280 : if (conv_params->do_average) {
2689 342144 : if (conv_params->use_jnt_comp_avg) {
2690 : do {
2691 : jnt_x_comp_avg_2tap_32_avx2(src_ptr,
2692 : coeffs_256,
2693 : factor_256,
2694 : offset_comp_avg_256,
2695 : dst,
2696 : dst8);
2697 5986580 : jnt_x_comp_avg_2tap_32_avx2(src_ptr + 32,
2698 : coeffs_256,
2699 : factor_256,
2700 : offset_comp_avg_256,
2701 : dst + 32,
2702 : dst8 + 32);
2703 5986790 : src_ptr += src_stride;
2704 5986790 : dst += dst_stride;
2705 5986790 : dst8 += dst8_stride;
2706 5986790 : } while (--y);
2707 : }
2708 : else {
2709 : do {
2710 5985770 : jnt_x_avg_2tap_32_avx2(
2711 : src_ptr, coeffs_256, offset_avg_256, dst, dst8);
2712 5985690 : jnt_x_avg_2tap_32_avx2(src_ptr + 32,
2713 : coeffs_256,
2714 : offset_avg_256,
2715 5985690 : dst + 32,
2716 : dst8 + 32);
2717 5985680 : src_ptr += src_stride;
2718 5985680 : dst += dst_stride;
2719 5985680 : dst8 += dst8_stride;
2720 5985680 : } while (--y);
2721 : }
2722 : }
2723 : else {
2724 : do {
2725 25043100 : jnt_x_no_avg_2tap_32_avx2(
2726 : src_ptr, coeffs_256, offset_no_avg_256, dst);
2727 25042500 : jnt_x_no_avg_2tap_32_avx2(
2728 : src_ptr + 32, coeffs_256, offset_no_avg_256, dst + 32);
2729 25042200 : src_ptr += src_stride;
2730 25042200 : dst += dst_stride;
2731 25042200 : } while (--y);
2732 : }
2733 : }
2734 : else {
2735 0 : assert(w == 128);
2736 :
2737 0 : if (conv_params->do_average) {
2738 0 : if (conv_params->use_jnt_comp_avg) {
2739 : do {
2740 : jnt_x_comp_avg_2tap_32_avx2(src_ptr,
2741 : coeffs_256,
2742 : factor_256,
2743 : offset_comp_avg_256,
2744 : dst,
2745 : dst8);
2746 0 : jnt_x_comp_avg_2tap_32_avx2(src_ptr + 1 * 32,
2747 : coeffs_256,
2748 : factor_256,
2749 : offset_comp_avg_256,
2750 : dst + 1 * 32,
2751 : dst8 + 1 * 32);
2752 0 : jnt_x_comp_avg_2tap_32_avx2(src_ptr + 2 * 32,
2753 : coeffs_256,
2754 : factor_256,
2755 : offset_comp_avg_256,
2756 : dst + 2 * 32,
2757 : dst8 + 2 * 32);
2758 0 : jnt_x_comp_avg_2tap_32_avx2(src_ptr + 3 * 32,
2759 : coeffs_256,
2760 : factor_256,
2761 : offset_comp_avg_256,
2762 : dst + 3 * 32,
2763 : dst8 + 3 * 32);
2764 0 : src_ptr += src_stride;
2765 0 : dst += dst_stride;
2766 0 : dst8 += dst8_stride;
2767 0 : } while (--y);
2768 : }
2769 : else {
2770 : do {
2771 0 : jnt_x_avg_2tap_32_avx2(
2772 : src_ptr, coeffs_256, offset_avg_256, dst, dst8);
2773 0 : jnt_x_avg_2tap_32_avx2(src_ptr + 1 * 32,
2774 : coeffs_256,
2775 : offset_avg_256,
2776 0 : dst + 1 * 32,
2777 : dst8 + 1 * 32);
2778 0 : jnt_x_avg_2tap_32_avx2(src_ptr + 2 * 32,
2779 : coeffs_256,
2780 : offset_avg_256,
2781 0 : dst + 2 * 32,
2782 : dst8 + 2 * 32);
2783 0 : jnt_x_avg_2tap_32_avx2(src_ptr + 3 * 32,
2784 : coeffs_256,
2785 : offset_avg_256,
2786 0 : dst + 3 * 32,
2787 : dst8 + 3 * 32);
2788 0 : src_ptr += src_stride;
2789 0 : dst += dst_stride;
2790 0 : dst8 += dst8_stride;
2791 0 : } while (--y);
2792 : }
2793 : }
2794 : else {
2795 : do {
2796 0 : jnt_x_no_avg_2tap_32_avx2(
2797 : src_ptr, coeffs_256, offset_no_avg_256, dst);
2798 0 : jnt_x_no_avg_2tap_32_avx2(src_ptr + 1 * 32,
2799 : coeffs_256,
2800 : offset_no_avg_256,
2801 : dst + 1 * 32);
2802 0 : jnt_x_no_avg_2tap_32_avx2(src_ptr + 2 * 32,
2803 : coeffs_256,
2804 : offset_no_avg_256,
2805 : dst + 2 * 32);
2806 0 : jnt_x_no_avg_2tap_32_avx2(src_ptr + 3 * 32,
2807 : coeffs_256,
2808 : offset_no_avg_256,
2809 : dst + 3 * 32);
2810 0 : src_ptr += src_stride;
2811 0 : dst += dst_stride;
2812 0 : } while (--y);
2813 : }
2814 : }
2815 : }
2816 20602600 : }
2817 :
2818 165000 : static void jnt_convolve_x_4tap_ssse3(
2819 : const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
2820 : const int32_t dst8_stride, const int32_t w, const int32_t h,
2821 : const InterpFilterParams *const filter_params_x, const int32_t subpel_x_q4,
2822 : const ConvolveParams *const conv_params) {
2823 165000 : const uint8_t *src_ptr = src - 1;
2824 165000 : const int32_t dst_stride = conv_params->dst_stride;
2825 165000 : const int32_t round_0 = 3;
2826 165000 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
2827 165000 : const int32_t bits = FILTER_BITS - round_1;
2828 165000 : const int32_t bd = 8;
2829 165000 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
2830 165000 : const int32_t round_offset =
2831 165000 : (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
2832 165000 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
2833 165000 : const int32_t offset_comp_avg =
2834 165000 : round_offset * conv_params->bck_offset +
2835 165000 : (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
2836 165000 : (round_offset << DIST_PRECISION_BITS);
2837 165000 : const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
2838 165000 : (1 << (round_0 - bits - 2)) -
2839 165000 : (round_offset << (round_0 - bits - 1));
2840 165000 : const int16_t offset_no_avg =
2841 165000 : (round_offset << (round_0 - bits - 1)) + (1 << (round_0 - bits - 2));
2842 165000 : const int32_t factor =
2843 165000 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
2844 165000 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
2845 165000 : const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
2846 330000 : const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
2847 165000 : const __m128i factor_128 = _mm_set1_epi32(factor);
2848 165000 : ConvBufType *dst = conv_params->dst;
2849 165000 : int32_t y = h;
2850 : __m128i coeffs_128[4];
2851 :
2852 165000 : prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
2853 :
2854 165002 : if (w == 2) {
2855 1 : if (conv_params->do_average) {
2856 0 : if (conv_params->use_jnt_comp_avg) {
2857 : do {
2858 0 : const __m128i res = x_convolve_4tap_2x2_ssse3(
2859 : src_ptr, src_stride, coeffs_128);
2860 0 : jnt_comp_avg_round_store_2x2_sse2(res,
2861 : factor_128,
2862 : offset_comp_avg_128,
2863 : dst,
2864 : dst_stride,
2865 : dst8,
2866 : dst8_stride);
2867 0 : src_ptr += 2 * src_stride;
2868 0 : dst += 2 * dst_stride;
2869 0 : dst8 += 2 * dst8_stride;
2870 0 : y -= 2;
2871 0 : } while (y);
2872 : }
2873 : else {
2874 : do {
2875 0 : const __m128i res = x_convolve_4tap_2x2_ssse3(
2876 : src_ptr, src_stride, coeffs_128);
2877 0 : jnt_avg_round_store_2x2_sse2(res,
2878 : offset_avg_128,
2879 : dst,
2880 : dst_stride,
2881 : dst8,
2882 : dst8_stride);
2883 0 : src_ptr += 2 * src_stride;
2884 0 : dst += 2 * dst_stride;
2885 0 : dst8 += 2 * dst8_stride;
2886 0 : y -= 2;
2887 0 : } while (y);
2888 : }
2889 : }
2890 : else {
2891 : do {
2892 : const __m128i res =
2893 1 : x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
2894 0 : jnt_no_avg_round_store_2x2_sse2(
2895 : res, offset_no_avg_128, dst, dst_stride);
2896 0 : src_ptr += 2 * src_stride;
2897 0 : dst += 2 * dst_stride;
2898 0 : y -= 2;
2899 0 : } while (y);
2900 : }
2901 : }
2902 : else {
2903 165001 : assert(w == 4);
2904 :
2905 165001 : if (conv_params->do_average) {
2906 73890 : if (conv_params->use_jnt_comp_avg) {
2907 : do {
2908 117812 : const __m128i res = x_convolve_4tap_4x2_ssse3(
2909 : src_ptr, src_stride, coeffs_128);
2910 117812 : jnt_comp_avg_round_store_4x2_sse2(res,
2911 : factor_128,
2912 : offset_comp_avg_128,
2913 : dst,
2914 : dst_stride,
2915 : dst8,
2916 : dst8_stride);
2917 117812 : src_ptr += 2 * src_stride;
2918 117812 : dst += 2 * dst_stride;
2919 117812 : dst8 += 2 * dst8_stride;
2920 117812 : y -= 2;
2921 117812 : } while (y);
2922 : }
2923 : else {
2924 : do {
2925 181511 : const __m128i res = x_convolve_4tap_4x2_ssse3(
2926 : src_ptr, src_stride, coeffs_128);
2927 181510 : jnt_avg_round_store_4x2_sse2(res,
2928 : offset_avg_128,
2929 : dst,
2930 : dst_stride,
2931 : dst8,
2932 : dst8_stride);
2933 181511 : src_ptr += 2 * src_stride;
2934 181511 : dst += 2 * dst_stride;
2935 181511 : dst8 += 2 * dst8_stride;
2936 181511 : y -= 2;
2937 181511 : } while (y);
2938 : }
2939 : }
2940 : else {
2941 : do {
2942 : const __m128i res =
2943 373179 : x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2944 373184 : jnt_no_avg_round_store_4x2_sse2(
2945 : res, offset_no_avg_128, dst, dst_stride);
2946 373179 : src_ptr += 2 * src_stride;
2947 373179 : dst += 2 * dst_stride;
2948 373179 : y -= 2;
2949 373179 : } while (y);
2950 : }
2951 : }
2952 165001 : }
2953 :
2954 7846150 : static void jnt_convolve_x_6tap_avx2(
2955 : const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
2956 : const int32_t dst8_stride, const int32_t w, const int32_t h,
2957 : const InterpFilterParams *const filter_params_x, const int32_t subpel_x_q4,
2958 : const ConvolveParams *const conv_params) {
2959 7846150 : const uint8_t *src_ptr = src - 2;
2960 7846150 : const int32_t dst_stride = conv_params->dst_stride;
2961 7846150 : const int32_t round_0 = 3;
2962 7846150 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
2963 7846150 : const int32_t bits = FILTER_BITS - round_1;
2964 7846150 : const int32_t bd = 8;
2965 7846150 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
2966 7846150 : const int32_t round_offset =
2967 7846150 : (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
2968 7846150 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
2969 7846150 : const int32_t offset_comp_avg =
2970 7846150 : round_offset * conv_params->bck_offset +
2971 7846150 : (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
2972 7846150 : (round_offset << DIST_PRECISION_BITS);
2973 7846150 : const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
2974 7846150 : (1 << (round_0 - bits - 2)) -
2975 7846150 : (round_offset << (round_0 - bits - 1));
2976 7846150 : const int16_t offset_no_avg =
2977 7846150 : (round_offset << (round_0 - bits - 1)) + (1 << (round_0 - bits - 2));
2978 7846150 : const int32_t factor =
2979 7846150 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
2980 7846150 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
2981 7846150 : const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
2982 15692300 : const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
2983 7846150 : const __m256i factor_256 = _mm256_set1_epi32(factor);
2984 7846150 : ConvBufType *dst = conv_params->dst;
2985 7846150 : int32_t y = h;
2986 : __m256i coeffs_256[4], filt_256[4];
2987 :
2988 7846150 : filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
2989 7846150 : filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
2990 7846150 : filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
2991 :
2992 7846150 : prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
2993 :
2994 7847420 : if (w == 8) {
2995 3505330 : if (conv_params->do_average) {
2996 1412780 : if (conv_params->use_jnt_comp_avg) {
2997 : do {
2998 4458300 : const __m256i res = x_convolve_6tap_8x2_avx2(
2999 : src_ptr, src_stride, coeffs_256, filt_256);
3000 4458270 : jnt_comp_avg_round_store_8x2_avx2(res,
3001 : factor_256,
3002 : offset_comp_avg_256,
3003 : dst,
3004 : dst_stride,
3005 : dst8,
3006 : dst8_stride);
3007 4458190 : src_ptr += 2 * src_stride;
3008 4458190 : dst += 2 * dst_stride;
3009 4458190 : dst8 += 2 * dst8_stride;
3010 4458190 : y -= 2;
3011 4458190 : } while (y);
3012 : }
3013 : else {
3014 : do {
3015 5683300 : const __m256i res = x_convolve_6tap_8x2_avx2(
3016 : src_ptr, src_stride, coeffs_256, filt_256);
3017 5683310 : jnt_avg_round_store_8x2_sse2(res,
3018 : offset_avg_256,
3019 : dst,
3020 : dst_stride,
3021 : dst8,
3022 : dst8_stride);
3023 5683100 : src_ptr += 2 * src_stride;
3024 5683100 : dst += 2 * dst_stride;
3025 5683100 : dst8 += 2 * dst8_stride;
3026 5683100 : y -= 2;
3027 5683100 : } while (y);
3028 : }
3029 : }
3030 : else {
3031 : do {
3032 15283700 : const __m256i res = x_convolve_6tap_8x2_avx2(
3033 : src_ptr, src_stride, coeffs_256, filt_256);
3034 15284100 : jnt_no_avg_round_store_8x2_avx2(
3035 : res, offset_no_avg_256, dst, dst_stride);
3036 15283700 : src_ptr += 2 * src_stride;
3037 15283700 : dst += 2 * dst_stride;
3038 15283700 : y -= 2;
3039 15283700 : } while (y);
3040 : }
3041 : }
3042 4342080 : else if (w == 16) {
3043 2489280 : if (conv_params->do_average) {
3044 976559 : if (conv_params->use_jnt_comp_avg) {
3045 : do {
3046 : jnt_x_comp_avg_6tap_16x2_avx2(src_ptr,
3047 : src_stride,
3048 : coeffs_256,
3049 : filt_256,
3050 : factor_256,
3051 : offset_comp_avg_256,
3052 : dst,
3053 : dst_stride,
3054 : dst8,
3055 : dst8_stride);
3056 3859710 : src_ptr += 2 * src_stride;
3057 3859710 : dst += 2 * dst_stride;
3058 3859710 : dst8 += 2 * dst8_stride;
3059 3859710 : y -= 2;
3060 3859710 : } while (y);
3061 : }
3062 : else {
3063 : do {
3064 : jnt_x_avg_6tap_16x2_avx2(src_ptr,
3065 : src_stride,
3066 : coeffs_256,
3067 : filt_256,
3068 : offset_avg_256,
3069 : dst,
3070 : dst_stride,
3071 : dst8,
3072 : dst8_stride);
3073 5608480 : src_ptr += 2 * src_stride;
3074 5608480 : dst += 2 * dst_stride;
3075 5608480 : dst8 += 2 * dst8_stride;
3076 5608480 : y -= 2;
3077 5608480 : } while (y);
3078 : }
3079 : }
3080 : else {
3081 : do {
3082 : jnt_x_no_avg_6tap_16x2_avx2(src_ptr,
3083 : src_stride,
3084 : coeffs_256,
3085 : filt_256,
3086 : offset_no_avg_256,
3087 : dst,
3088 : dst_stride);
3089 14950000 : src_ptr += 2 * src_stride;
3090 14950000 : dst += 2 * dst_stride;
3091 14950000 : y -= 2;
3092 14950000 : } while (y);
3093 : }
3094 : }
3095 1852800 : else if (w == 32) {
3096 1435280 : if (conv_params->do_average) {
3097 547917 : if (conv_params->use_jnt_comp_avg) {
3098 : do {
3099 4622450 : jnt_x_comp_avg_6tap_32_avx2(src_ptr,
3100 : coeffs_256,
3101 : filt_256,
3102 : factor_256,
3103 : offset_comp_avg_256,
3104 : dst,
3105 : dst8);
3106 4622430 : src_ptr += src_stride;
3107 4622430 : dst += dst_stride;
3108 4622430 : dst8 += dst8_stride;
3109 4622430 : } while (--y);
3110 : }
3111 : else {
3112 : do {
3113 7053320 : jnt_x_avg_6tap_32_avx2(src_ptr,
3114 : coeffs_256,
3115 : filt_256,
3116 : offset_avg_256,
3117 : dst,
3118 : dst8);
3119 7053200 : src_ptr += src_stride;
3120 7053200 : dst += dst_stride;
3121 7053200 : dst8 += dst8_stride;
3122 7053200 : } while (--y);
3123 : }
3124 : }
3125 : else {
3126 : do {
3127 19120200 : jnt_x_no_avg_6tap_32_avx2(
3128 : src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
3129 19119000 : src_ptr += src_stride;
3130 19119000 : dst += dst_stride;
3131 19119000 : } while (--y);
3132 : }
3133 : }
3134 417526 : else if (w == 64) {
3135 417546 : if (conv_params->do_average) {
3136 159491 : if (conv_params->use_jnt_comp_avg) {
3137 : do {
3138 : jnt_x_comp_avg_6tap_16x2_avx2(src_ptr,
3139 : 16,
3140 : coeffs_256,
3141 : filt_256,
3142 : factor_256,
3143 : offset_comp_avg_256,
3144 : dst,
3145 : 16,
3146 : dst8,
3147 : 16);
3148 1856980 : jnt_x_comp_avg_6tap_16x2_avx2(src_ptr + 32,
3149 : 16,
3150 : coeffs_256,
3151 : filt_256,
3152 : factor_256,
3153 : offset_comp_avg_256,
3154 : dst + 32,
3155 : 16,
3156 : dst8 + 32,
3157 : 16);
3158 1856990 : src_ptr += src_stride;
3159 1856990 : dst += dst_stride;
3160 1856990 : dst8 += dst8_stride;
3161 1856990 : } while (--y);
3162 : }
3163 : else {
3164 : do {
3165 : jnt_x_avg_6tap_16x2_avx2(src_ptr,
3166 : 16,
3167 : coeffs_256,
3168 : filt_256,
3169 : offset_avg_256,
3170 : dst,
3171 : 16,
3172 : dst8,
3173 : 16);
3174 3215680 : jnt_x_avg_6tap_16x2_avx2(src_ptr + 32,
3175 : 16,
3176 : coeffs_256,
3177 : filt_256,
3178 : offset_avg_256,
3179 : dst + 32,
3180 : 16,
3181 : dst8 + 32,
3182 : 16);
3183 3215700 : src_ptr += src_stride;
3184 3215700 : dst += dst_stride;
3185 3215700 : dst8 += dst8_stride;
3186 3215700 : } while (--y);
3187 : }
3188 : }
3189 : else {
3190 : do {
3191 : jnt_x_no_avg_6tap_16x2_avx2(src_ptr,
3192 : 16,
3193 : coeffs_256,
3194 : filt_256,
3195 : offset_no_avg_256,
3196 : dst,
3197 : 16);
3198 8098490 : jnt_x_no_avg_6tap_16x2_avx2(src_ptr + 32,
3199 : 16,
3200 : coeffs_256,
3201 : filt_256,
3202 : offset_no_avg_256,
3203 : dst + 32,
3204 : 16);
3205 8098480 : src_ptr += src_stride;
3206 8098480 : dst += dst_stride;
3207 8098480 : } while (--y);
3208 : }
3209 : }
3210 : else {
3211 0 : assert(w == 128);
3212 :
3213 0 : if (conv_params->do_average) {
3214 0 : if (conv_params->use_jnt_comp_avg) {
3215 : do {
3216 : jnt_x_comp_avg_6tap_16x2_avx2(src_ptr,
3217 : 16,
3218 : coeffs_256,
3219 : filt_256,
3220 : factor_256,
3221 : offset_comp_avg_256,
3222 : dst,
3223 : 16,
3224 : dst8,
3225 : 16);
3226 0 : jnt_x_comp_avg_6tap_16x2_avx2(src_ptr + 1 * 32,
3227 : 16,
3228 : coeffs_256,
3229 : filt_256,
3230 : factor_256,
3231 : offset_comp_avg_256,
3232 : dst + 1 * 32,
3233 : 16,
3234 : dst8 + 1 * 32,
3235 : 16);
3236 0 : jnt_x_comp_avg_6tap_16x2_avx2(src_ptr + 2 * 32,
3237 : 16,
3238 : coeffs_256,
3239 : filt_256,
3240 : factor_256,
3241 : offset_comp_avg_256,
3242 : dst + 2 * 32,
3243 : 16,
3244 : dst8 + 2 * 32,
3245 : 16);
3246 0 : jnt_x_comp_avg_6tap_16x2_avx2(src_ptr + 3 * 32,
3247 : 16,
3248 : coeffs_256,
3249 : filt_256,
3250 : factor_256,
3251 : offset_comp_avg_256,
3252 : dst + 3 * 32,
3253 : 16,
3254 : dst8 + 3 * 32,
3255 : 16);
3256 0 : src_ptr += src_stride;
3257 0 : dst += dst_stride;
3258 0 : dst8 += dst8_stride;
3259 0 : } while (--y);
3260 : }
3261 : else {
3262 : do {
3263 : jnt_x_avg_6tap_16x2_avx2(src_ptr,
3264 : 16,
3265 : coeffs_256,
3266 : filt_256,
3267 : offset_avg_256,
3268 : dst,
3269 : 16,
3270 : dst8,
3271 : 16);
3272 0 : jnt_x_avg_6tap_16x2_avx2(src_ptr + 1 * 32,
3273 : 16,
3274 : coeffs_256,
3275 : filt_256,
3276 : offset_avg_256,
3277 : dst + 1 * 32,
3278 : 16,
3279 : dst8 + 1 * 32,
3280 : 16);
3281 0 : jnt_x_avg_6tap_16x2_avx2(src_ptr + 2 * 32,
3282 : 16,
3283 : coeffs_256,
3284 : filt_256,
3285 : offset_avg_256,
3286 : dst + 2 * 32,
3287 : 16,
3288 : dst8 + 2 * 32,
3289 : 16);
3290 0 : jnt_x_avg_6tap_16x2_avx2(src_ptr + 3 * 32,
3291 : 16,
3292 : coeffs_256,
3293 : filt_256,
3294 : offset_avg_256,
3295 : dst + 3 * 32,
3296 : 16,
3297 : dst8 + 3 * 32,
3298 : 16);
3299 0 : src_ptr += src_stride;
3300 0 : dst += dst_stride;
3301 0 : dst8 += dst8_stride;
3302 0 : } while (--y);
3303 : }
3304 : }
3305 : else {
3306 : do {
3307 : jnt_x_no_avg_6tap_16x2_avx2(src_ptr,
3308 : 16,
3309 : coeffs_256,
3310 : filt_256,
3311 : offset_no_avg_256,
3312 : dst,
3313 : 16);
3314 0 : jnt_x_no_avg_6tap_16x2_avx2(src_ptr + 1 * 32,
3315 : 16,
3316 : coeffs_256,
3317 : filt_256,
3318 : offset_no_avg_256,
3319 : dst + 1 * 32,
3320 : 16);
3321 0 : jnt_x_no_avg_6tap_16x2_avx2(src_ptr + 2 * 32,
3322 : 16,
3323 : coeffs_256,
3324 : filt_256,
3325 : offset_no_avg_256,
3326 : dst + 2 * 32,
3327 : 16);
3328 0 : jnt_x_no_avg_6tap_16x2_avx2(src_ptr + 3 * 32,
3329 : 16,
3330 : coeffs_256,
3331 : filt_256,
3332 : offset_no_avg_256,
3333 : dst + 3 * 32,
3334 : 16);
3335 0 : src_ptr += src_stride;
3336 0 : dst += dst_stride;
3337 0 : } while (--y);
3338 : }
3339 : }
3340 7844520 : }
3341 :
3342 3278940 : static void jnt_convolve_x_8tap_avx2(
3343 : const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
3344 : const int32_t dst8_stride, const int32_t w, const int32_t h,
3345 : const InterpFilterParams *const filter_params_x, const int32_t subpel_x_q4,
3346 : const ConvolveParams *const conv_params) {
3347 3278940 : const uint8_t *src_ptr = src - 3;
3348 3278940 : const int32_t dst_stride = conv_params->dst_stride;
3349 3278940 : const int32_t round_0 = 3;
3350 3278940 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
3351 3278940 : const int32_t bits = FILTER_BITS - round_1;
3352 3278940 : const int32_t bd = 8;
3353 3278940 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0;
3354 3278940 : const int32_t round_offset =
3355 3278940 : (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
3356 3278940 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
3357 3278940 : const int32_t offset_comp_avg =
3358 3278940 : round_offset * conv_params->bck_offset +
3359 3278940 : (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
3360 3278940 : (round_offset << DIST_PRECISION_BITS);
3361 3278940 : const int16_t offset_avg = (1 << (FILTER_BITS - 1)) +
3362 3278940 : (1 << (round_0 - bits - 2)) -
3363 3278940 : (round_offset << (round_0 - bits - 1));
3364 3278940 : const int16_t offset_no_avg =
3365 3278940 : (round_offset << (round_0 - bits - 1)) + (1 << (round_0 - bits - 2));
3366 3278940 : const int32_t factor =
3367 3278940 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
3368 3278940 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
3369 3278940 : const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
3370 6557880 : const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
3371 3278940 : const __m256i factor_256 = _mm256_set1_epi32(factor);
3372 3278940 : ConvBufType *dst = conv_params->dst;
3373 3278940 : int32_t y = h;
3374 : __m256i coeffs_256[4], filt_256[4];
3375 :
3376 3278940 : filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
3377 3278940 : filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
3378 3278940 : filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
3379 3278940 : filt_256[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
3380 :
3381 3278940 : prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3382 :
3383 3279110 : if (w == 8) {
3384 1406050 : if (conv_params->do_average) {
3385 576810 : if (conv_params->use_jnt_comp_avg) {
3386 : do {
3387 1889450 : const __m256i res = x_convolve_8tap_8x2_avx2(
3388 : src_ptr, src_stride, coeffs_256, filt_256);
3389 1889450 : jnt_comp_avg_round_store_8x2_avx2(res,
3390 : factor_256,
3391 : offset_comp_avg_256,
3392 : dst,
3393 : dst_stride,
3394 : dst8,
3395 : dst8_stride);
3396 1889430 : src_ptr += 2 * src_stride;
3397 1889430 : dst += 2 * dst_stride;
3398 1889430 : dst8 += 2 * dst8_stride;
3399 1889430 : y -= 2;
3400 1889430 : } while (y);
3401 : }
3402 : else {
3403 : do {
3404 2383010 : const __m256i res = x_convolve_8tap_8x2_avx2(
3405 : src_ptr, src_stride, coeffs_256, filt_256);
3406 2383030 : jnt_avg_round_store_8x2_sse2(res,
3407 : offset_avg_256,
3408 : dst,
3409 : dst_stride,
3410 : dst8,
3411 : dst8_stride);
3412 2383000 : src_ptr += 2 * src_stride;
3413 2383000 : dst += 2 * dst_stride;
3414 2383000 : dst8 += 2 * dst8_stride;
3415 2383000 : y -= 2;
3416 2383000 : } while (y);
3417 : }
3418 : }
3419 : else {
3420 : do {
3421 6212910 : const __m256i res = x_convolve_8tap_8x2_avx2(
3422 : src_ptr, src_stride, coeffs_256, filt_256);
3423 6213000 : jnt_no_avg_round_store_8x2_avx2(
3424 : res, offset_no_avg_256, dst, dst_stride);
3425 6212950 : src_ptr += 2 * src_stride;
3426 6212950 : dst += 2 * dst_stride;
3427 6212950 : y -= 2;
3428 6212950 : } while (y);
3429 : }
3430 : }
3431 1873050 : else if (w == 16) {
3432 1034330 : if (conv_params->do_average) {
3433 409753 : if (conv_params->use_jnt_comp_avg) {
3434 : do {
3435 1690030 : jnt_x_comp_avg_8tap_16x2_avx2(src_ptr,
3436 : src_stride,
3437 : coeffs_256,
3438 : filt_256,
3439 : factor_256,
3440 : offset_comp_avg_256,
3441 : dst,
3442 : dst_stride,
3443 : dst8,
3444 : dst8_stride);
3445 1690020 : src_ptr += 2 * src_stride;
3446 1690020 : dst += 2 * dst_stride;
3447 1690020 : dst8 += 2 * dst8_stride;
3448 1690020 : y -= 2;
3449 1690020 : } while (y);
3450 : }
3451 : else {
3452 : do {
3453 2376470 : jnt_x_avg_8tap_16x2_avx2(src_ptr,
3454 : src_stride,
3455 : coeffs_256,
3456 : filt_256,
3457 : offset_avg_256,
3458 : dst,
3459 : dst_stride,
3460 : dst8,
3461 : dst8_stride);
3462 2376460 : src_ptr += 2 * src_stride;
3463 2376460 : dst += 2 * dst_stride;
3464 2376460 : dst8 += 2 * dst8_stride;
3465 2376460 : y -= 2;
3466 2376460 : } while (y);
3467 : }
3468 : }
3469 : else {
3470 : do {
3471 6335670 : jnt_x_no_avg_8tap_16x2_avx2(src_ptr,
3472 : src_stride,
3473 : coeffs_256,
3474 : filt_256,
3475 : offset_no_avg_256,
3476 : dst,
3477 : dst_stride);
3478 6335550 : src_ptr += 2 * src_stride;
3479 6335550 : dst += 2 * dst_stride;
3480 6335550 : y -= 2;
3481 6335550 : } while (y);
3482 : }
3483 : }
3484 838721 : else if (w == 32) {
3485 642749 : if (conv_params->do_average) {
3486 248229 : if (conv_params->use_jnt_comp_avg) {
3487 : do {
3488 : jnt_x_comp_avg_8tap_32_avx2(src_ptr,
3489 : coeffs_256,
3490 : filt_256,
3491 : factor_256,
3492 : offset_comp_avg_256,
3493 : dst,
3494 : dst8);
3495 2157830 : src_ptr += src_stride;
3496 2157830 : dst += dst_stride;
3497 2157830 : dst8 += dst8_stride;
3498 2157830 : } while (--y);
3499 : }
3500 : else {
3501 : do {
3502 : jnt_x_avg_8tap_32_avx2(src_ptr,
3503 : coeffs_256,
3504 : filt_256,
3505 : offset_avg_256,
3506 : dst,
3507 : dst8);
3508 3172140 : src_ptr += src_stride;
3509 3172140 : dst += dst_stride;
3510 3172140 : dst8 += dst8_stride;
3511 3172140 : } while (--y);
3512 : }
3513 : }
3514 : else {
3515 : do {
3516 : jnt_x_no_avg_8tap_32_avx2(
3517 : src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
3518 8687870 : src_ptr += src_stride;
3519 8687870 : dst += dst_stride;
3520 8687870 : } while (--y);
3521 : }
3522 : }
3523 195972 : else if (w == 64) {
3524 195979 : if (conv_params->do_average) {
3525 74962 : if (conv_params->use_jnt_comp_avg) {
3526 : do {
3527 : jnt_x_comp_avg_8tap_32_avx2(src_ptr,
3528 : coeffs_256,
3529 : filt_256,
3530 : factor_256,
3531 : offset_comp_avg_256,
3532 : dst,
3533 : dst8);
3534 901365 : jnt_x_comp_avg_8tap_32_avx2(src_ptr + 32,
3535 : coeffs_256,
3536 : filt_256,
3537 : factor_256,
3538 : offset_comp_avg_256,
3539 : dst + 32,
3540 : dst8 + 32);
3541 901378 : src_ptr += src_stride;
3542 901378 : dst += dst_stride;
3543 901378 : dst8 += dst8_stride;
3544 901378 : } while (--y);
3545 : }
3546 : else {
3547 : do {
3548 : jnt_x_avg_8tap_32_avx2(src_ptr,
3549 : coeffs_256,
3550 : filt_256,
3551 : offset_avg_256,
3552 : dst,
3553 : dst8);
3554 1471870 : jnt_x_avg_8tap_32_avx2(src_ptr + 32,
3555 : coeffs_256,
3556 : filt_256,
3557 : offset_avg_256,
3558 : dst + 32,
3559 : dst8 + 32);
3560 1471860 : src_ptr += src_stride;
3561 1471860 : dst += dst_stride;
3562 1471860 : dst8 += dst8_stride;
3563 1471860 : } while (--y);
3564 : }
3565 : }
3566 : else {
3567 : do {
3568 : jnt_x_no_avg_8tap_32_avx2(
3569 : src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
3570 3805590 : jnt_x_no_avg_8tap_32_avx2(src_ptr + 32,
3571 : coeffs_256,
3572 : filt_256,
3573 : offset_no_avg_256,
3574 : dst + 32);
3575 3805620 : src_ptr += src_stride;
3576 3805620 : dst += dst_stride;
3577 3805620 : } while (--y);
3578 : }
3579 : }
3580 : else {
3581 0 : assert(w == 128);
3582 :
3583 0 : if (conv_params->do_average) {
3584 0 : if (conv_params->use_jnt_comp_avg) {
3585 : do {
3586 : jnt_x_comp_avg_8tap_32_avx2(src_ptr,
3587 : coeffs_256,
3588 : filt_256,
3589 : factor_256,
3590 : offset_comp_avg_256,
3591 : dst,
3592 : dst8);
3593 0 : jnt_x_comp_avg_8tap_32_avx2(src_ptr + 1 * 32,
3594 : coeffs_256,
3595 : filt_256,
3596 : factor_256,
3597 : offset_comp_avg_256,
3598 : dst + 1 * 32,
3599 : dst8 + 1 * 32);
3600 0 : jnt_x_comp_avg_8tap_32_avx2(src_ptr + 2 * 32,
3601 : coeffs_256,
3602 : filt_256,
3603 : factor_256,
3604 : offset_comp_avg_256,
3605 : dst + 2 * 32,
3606 : dst8 + 2 * 32);
3607 0 : jnt_x_comp_avg_8tap_32_avx2(src_ptr + 3 * 32,
3608 : coeffs_256,
3609 : filt_256,
3610 : factor_256,
3611 : offset_comp_avg_256,
3612 : dst + 3 * 32,
3613 : dst8 + 3 * 32);
3614 0 : src_ptr += src_stride;
3615 0 : dst += dst_stride;
3616 0 : dst8 += dst8_stride;
3617 0 : } while (--y);
3618 : }
3619 : else {
3620 : do {
3621 : jnt_x_avg_8tap_32_avx2(src_ptr,
3622 : coeffs_256,
3623 : filt_256,
3624 : offset_avg_256,
3625 : dst,
3626 : dst8);
3627 0 : jnt_x_avg_8tap_32_avx2(src_ptr + 1 * 32,
3628 : coeffs_256,
3629 : filt_256,
3630 : offset_avg_256,
3631 : dst + 1 * 32,
3632 : dst8 + 1 * 32);
3633 0 : jnt_x_avg_8tap_32_avx2(src_ptr + 2 * 32,
3634 : coeffs_256,
3635 : filt_256,
3636 : offset_avg_256,
3637 : dst + 2 * 32,
3638 : dst8 + 2 * 32);
3639 0 : jnt_x_avg_8tap_32_avx2(src_ptr + 3 * 32,
3640 : coeffs_256,
3641 : filt_256,
3642 : offset_avg_256,
3643 : dst + 3 * 32,
3644 : dst8 + 3 * 32);
3645 0 : src_ptr += src_stride;
3646 0 : dst += dst_stride;
3647 0 : dst8 += dst8_stride;
3648 0 : } while (--y);
3649 : }
3650 : }
3651 : else {
3652 : do {
3653 : jnt_x_no_avg_8tap_32_avx2(
3654 : src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
3655 0 : jnt_x_no_avg_8tap_32_avx2(src_ptr + 1 * 32,
3656 : coeffs_256,
3657 : filt_256,
3658 : offset_no_avg_256,
3659 : dst + 1 * 32);
3660 0 : jnt_x_no_avg_8tap_32_avx2(src_ptr + 2 * 32,
3661 : coeffs_256,
3662 : filt_256,
3663 : offset_no_avg_256,
3664 : dst + 2 * 32);
3665 0 : jnt_x_no_avg_8tap_32_avx2(src_ptr + 3 * 32,
3666 : coeffs_256,
3667 : filt_256,
3668 : offset_no_avg_256,
3669 : dst + 3 * 32);
3670 0 : src_ptr += src_stride;
3671 0 : dst += dst_stride;
3672 0 : } while (--y);
3673 : }
3674 : }
3675 3279040 : }
3676 :
3677 : typedef void(*jnt_convolve_x_tap_func)(
3678 : const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
3679 : const int32_t dst8_stride, const int32_t w, const int32_t h,
3680 : const InterpFilterParams *const filter_params_x, const int32_t subpel_x_q4,
3681 : const ConvolveParams *const conv_params);
3682 :
3683 31862100 : void eb_av1_jnt_convolve_x_avx2(const uint8_t *src, int32_t src_stride,
3684 : uint8_t *dst8, int32_t dst8_stride, int32_t w,
3685 : int32_t h, InterpFilterParams *filter_params_x,
3686 : InterpFilterParams *filter_params_y,
3687 : const int32_t subpel_x_q4,
3688 : const int32_t subpel_y_q4,
3689 : ConvolveParams *conv_params) {
3690 : static const jnt_convolve_x_tap_func
3691 : jnt_convolve_x_tap_func_table[MAX_FILTER_TAP + 1] = {
3692 : NULL,
3693 : NULL,
3694 : jnt_convolve_x_2tap_avx2,
3695 : NULL,
3696 : jnt_convolve_x_4tap_ssse3,
3697 : NULL,
3698 : jnt_convolve_x_6tap_avx2,
3699 : NULL,
3700 : jnt_convolve_x_8tap_avx2 };
3701 31862100 : const int32_t tap_x = get_convolve_tap(filter_params_x->filter_ptr);
3702 :
3703 : (void)filter_params_y;
3704 : (void)subpel_y_q4;
3705 :
3706 31859200 : assert(conv_params->round_0 == 3);
3707 31860000 : assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);
3708 :
3709 31860000 : jnt_convolve_x_tap_func_table[tap_x](src,
3710 : src_stride,
3711 : dst8,
3712 : dst8_stride,
3713 : w,
3714 : h,
3715 : filter_params_x,
3716 : subpel_x_q4,
3717 : conv_params);
3718 31883400 : }
|