Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_
13 : #define AOM_DSP_X86_CONVOLVE_AVX2_H_
14 :
15 : #include "convolve.h"
16 : #include "EbDefinitions.h"
17 : #include "EbInterPrediction.h"
18 : #include "EbMemory_AVX2.h"
19 : #include "EbMemory_SSE4_1.h"
20 : #include "synonyms.h"
21 : #include "synonyms_avx2.h"
22 :
23 : // byte-shuffle masks used by the 16-pixel-wide horizontal filters below
24 : DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
25 : 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
26 : 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
27 :
28 : DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
29 : 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
30 : 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 };
31 :
32 : DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
33 : 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
34 : 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 };
35 :
36 : DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
37 : 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
38 : 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 };
39 :
40 732082200 : static INLINE EbBool is_convolve_2tap(const int16_t *const filter) {
41 732082200 : return (EbBool)((InterpKernel *)filter == bilinear_filters);
42 : }
43 :
44 461094000 : static INLINE EbBool is_convolve_4tap(const int16_t *const filter) {
45 461094000 : return (EbBool)(((InterpKernel *)filter == sub_pel_filters_4) ||
46 : ((InterpKernel *)filter == sub_pel_filters_4smooth));
47 : }
48 :
49 424751300 : static INLINE EbBool is_convolve_6tap(const int16_t *const filter) {
50 424751300 : return (EbBool)(((InterpKernel *)filter == sub_pel_filters_8) ||
51 : ((InterpKernel *)filter == sub_pel_filters_8smooth));
52 : }
53 :
54 668794200 : static INLINE int32_t get_convolve_tap(const int16_t *const filter) {
55 668794200 : if (is_convolve_2tap(filter))
56 266024100 : return 2;
57 402985700 : else if (is_convolve_4tap(filter))
58 31042641 : return 4;
59 374174300 : else if (is_convolve_6tap(filter))
60 314499300 : return 6;
61 : else
62 60109160 : return 8;
63 : }
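// Tap detection above works by pointer identity against the global filter
// tables, so it is O(1). A minimal dispatch sketch (hypothetical caller; the
// real entry points take InterpFilterParams and branch the same way):
static void convolve_tap_dispatch_sketch(const int16_t *const filter) {
    switch (get_convolve_tap(filter)) {
    case 2: /* bilinear */ break;
    case 4: /* 4-tap regular/smooth, small blocks */ break;
    case 6: /* 8-tap regular/smooth tables whose outer taps are zero */ break;
    default: /* full 8-tap, e.g. sharp */ break;
    }
}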
64 :
65 43479320 : static INLINE void prepare_half_coeffs_2tap_ssse3(
66 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
67 : __m128i *const coeffs /* [1] */) {
68 43479320 : const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
69 : *filter_params, subpel_q4 & SUBPEL_MASK);
70 86957930 : const __m128i coeffs_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
71 :
72 : // Right shift all filter coefficients by 1 to reduce the bits required.
73 : // This extra right shift will be taken care of at the end while rounding
74 : // the result.
75 : // Since all filter coefficients are even, this change will not affect the
76 : // end result.
77 173915900 : assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
78 : _mm_set1_epi16((short)0xffff)));
79 :
80 43478920 : const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
81 :
82 : // coeffs 3 4 3 4 3 4 3 4
83 43478920 : *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
84 43478920 : }
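// Why the halving above is both safe and necessary: _mm_maddubs_epi16
// computes u8*s8 + u8*s8 pairs with signed 16-bit saturation, and full Q7
// taps can overflow that. A scalar check with an illustrative adjacent-tap
// pair {126, 8} (an assumption for illustration, not one specific kernel):
static inline int32_t maddubs_pair_scalar(uint8_t a, uint8_t b, int8_t c0,
                                          int8_t c1) {
    return (int32_t)a * c0 + (int32_t)b * c1;
}
// maddubs_pair_scalar(255, 255, 126, 8)     == 34170 -> would saturate int16
// maddubs_pair_scalar(255, 255, 126/2, 8/2) == 17085 -> fits after halving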
85 :
86 5573454 : static INLINE void prepare_half_coeffs_4tap_ssse3(
87 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
88 : __m128i *const coeffs /* [2] */) {
89 5573454 : const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
90 : *filter_params, subpel_q4 & SUBPEL_MASK);
91 5573435 : const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
92 :
93 : // Right shift all filter coefficients by 1 to reduce the bits required.
94 : // This extra right shift will be taken care of at the end while rounding
95 : // the result.
96 : // Since all filter coefficients are even, this change will not affect the
97 : // end result.
98 22293710 : assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
99 : _mm_set1_epi16((short)0xffff)));
100 :
101 5573435 : const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
102 :
103 : // coeffs 2 3 2 3 2 3 2 3
104 11146860 : coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
105 : // coeffs 4 5 4 5 4 5 4 5
106 5573435 : coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
107 5573435 : }
108 :
109 2455100 : static INLINE void prepare_half_coeffs_6tap_ssse3(
110 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
111 : __m128i *const coeffs /* [3] */) {
112 2455100 : const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
113 : *filter_params, subpel_q4 & SUBPEL_MASK);
114 2455100 : const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
115 :
116 : // Right shift all filter coefficients by 1 to reduce the bits required.
117 : // This extra right shift will be taken care of at the end while rounding
118 : // the result.
119 : // Since all filter coefficients are even, this change will not affect the
120 : // end result.
121 9820400 : assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
122 : _mm_set1_epi16((short)0xffff)));
123 :
124 2455100 : const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
125 :
126 : // coeffs 1 2 1 2 1 2 1 2
127 4910200 : coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
128 : // coeffs 3 4 3 4 3 4 3 4
129 4910200 : coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
130 : // coeffs 5 6 5 6 5 6 5 6
131 2455100 : coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
132 2455100 : }
133 :
134 15090 : static INLINE void prepare_half_coeffs_8tap_ssse3(
135 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
136 : __m128i *const coeffs /* [4] */) {
137 15090 : const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
138 : *filter_params, subpel_q4 & SUBPEL_MASK);
139 15090 : const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
140 :
141 : // Right shift all filter coefficients by 1 to reduce the bits required.
142 : // This extra right shift will be taken care of at the end while rounding
143 : // the result.
144 : // Since all filter coefficients are even, this change will not affect the
145 : // end result.
146 60360 : assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
147 : _mm_set1_epi16((short)0xffff)));
148 :
149 15090 : const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
150 :
151 : // coeffs 0 1 0 1 0 1 0 1
152 30180 : coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
153 : // coeffs 2 3 2 3 2 3 2 3
154 30180 : coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
155 : // coeffs 4 5 4 5 4 5 4 5
156 30180 : coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
157 : // coeffs 6 7 6 7 6 7 6 7
158 15090 : coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
159 15090 : }
160 :
161 116163440 : static INLINE void prepare_half_coeffs_2tap_avx2(
162 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
163 : __m256i *const coeffs /* [1] */) {
164 116163440 : const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
165 : *filter_params, subpel_q4 & SUBPEL_MASK);
166 232343700 : const __m128i coeffs_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
167 116171850 : const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
168 :
169 : // Right shift all filter coefficients by 1 to reduce the bits required.
170 : // This extra right shift will be taken care of at the end while rounding
171 : // the result.
172 : // Since all filter coefficients are even, this change will not affect the
173 : // end result.
174 464687500 : assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
175 : _mm_set1_epi16((short)0xffff)));
176 :
177 116171850 : const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
178 :
179 : // coeffs 3 4 3 4 3 4 3 4
180 116171850 : *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
181 116171850 : }
182 :
183 2351436 : static INLINE void prepare_half_coeffs_4tap_avx2(
184 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
185 : __m256i *const coeffs /* [2] */) {
186 2351436 : const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
187 : *filter_params, subpel_q4 & SUBPEL_MASK);
188 2351426 : const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
189 2351426 : const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
190 :
191 : // Right shift all filter coefficients by 1 to reduce the bits required.
192 : // This extra right shift will be taken care of at the end while rounding
193 : // the result.
194 : // Since all filter coefficients are even, this change will not affect the
195 : // end result.
196 9405714 : assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
197 : _mm_set1_epi16((short)0xffff)));
198 :
199 2351426 : const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
200 :
201 : // coeffs 2 3 2 3 2 3 2 3
202 4702862 : coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
203 : // coeffs 4 5 4 5 4 5 4 5
204 2351426 : coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
205 2351426 : }
206 :
207 98102800 : static INLINE void prepare_half_coeffs_6tap_avx2(
208 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
209 : __m256i *const coeffs /* [3] */) {
210 98102800 : const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
211 : *filter_params, subpel_q4 & SUBPEL_MASK);
212 98099600 : const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
213 98099600 : const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
214 :
215 : // Right shift all filter coefficients by 1 to reduce the bits required.
216 : // This extra right shift will be taken care of at the end while rounding
217 : // the result.
218 : // Since all filter coefficients are even, this change will not affect the
219 : // end result.
220 392398000 : assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
221 : _mm_set1_epi16((short)0xffff)));
222 :
223 98099600 : const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
224 :
225 : // coeffs 1 2 1 2 1 2 1 2
226 196199200 : coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
227 : // coeffs 3 4 3 4 3 4 3 4
228 196199200 : coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
229 : // coeffs 5 6 5 6 5 6 5 6
230 98099600 : coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0C0Au));
231 98099600 : }
232 :
233 182187500 : static INLINE void prepare_half_coeffs_8tap_avx2(
234 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
235 : __m256i *const coeffs /* [4] */) {
236 182187500 : const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
237 : *filter_params, subpel_q4 & SUBPEL_MASK);
238 182148520 : const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
239 182148520 : const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
240 :
241 : // Right shift all filter coefficients by 1 to reduce the bits required.
242 : // This extra right shift will be taken care of at the end while rounding
243 : // the result.
244 : // Since all filter coefficients are even, this change will not affect the
245 : // end result.
246 728593000 : assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
247 : _mm_set1_epi16((short)0xffff)));
248 :
249 182148520 : const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
250 :
251 : // coeffs 0 1 0 1 0 1 0 1
252 364295900 : coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
253 : // coeffs 2 3 2 3 2 3 2 3
254 364295900 : coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
255 : // coeffs 4 5 4 5 4 5 4 5
256 364295900 : coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
257 : // coeffs 6 7 6 7 6 7 6 7
258 182148520 : coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
259 182148520 : }
260 :
261 0 : static INLINE void prepare_coeffs_2tap_sse2(
262 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
263 : __m128i *const coeffs /* [1] */) {
264 0 : const int16_t *filter = av1_get_interp_filter_subpel_kernel(
265 : *filter_params, subpel_q4 & SUBPEL_MASK);
266 :
267 0 : const __m128i coeff = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
268 :
269 : // coeffs 3 4 3 4 3 4 3 4
270 0 : coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
271 0 : }
272 :
273 0 : static INLINE void prepare_coeffs_4tap_sse2(
274 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
275 : __m128i *const coeffs /* [2] */) {
276 0 : const int16_t *filter = av1_get_interp_filter_subpel_kernel(
277 : *filter_params, subpel_q4 & SUBPEL_MASK);
278 :
279 0 : const __m128i coeff = _mm_load_si128((__m128i *)filter);
280 :
281 : // coeffs 2 3 2 3 2 3 2 3
282 0 : coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
283 : // coeffs 4 5 4 5 4 5 4 5
284 0 : coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
285 0 : }
286 :
287 0 : static INLINE void prepare_coeffs_6tap_ssse3(
288 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
289 : __m128i *const coeffs /* [3] */) {
290 0 : const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
291 : *filter_params, subpel_q4 & SUBPEL_MASK);
292 0 : const __m128i coeff = _mm_load_si128((__m128i *)filter);
293 :
294 : // coeffs 1 2 1 2 1 2 1 2
295 0 : coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
296 : // coeffs 3 4 3 4 3 4 3 4
297 0 : coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
298 : // coeffs 5 6 5 6 5 6 5 6
299 0 : coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
300 0 : }
301 :
302 0 : static INLINE void prepare_coeffs_8tap_sse2(
303 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
304 : __m128i *const coeffs /* [4] */) {
305 0 : const int16_t *filter = av1_get_interp_filter_subpel_kernel(
306 : *filter_params, subpel_q4 & SUBPEL_MASK);
307 :
308 0 : const __m128i coeff = _mm_load_si128((__m128i *)filter);
309 :
310 : // coeffs 0 1 0 1 0 1 0 1
311 0 : coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
312 : // coeffs 2 3 2 3 2 3 2 3
313 0 : coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
314 : // coeffs 4 5 4 5 4 5 4 5
315 0 : coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
316 : // coeffs 6 7 6 7 6 7 6 7
317 0 : coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
318 0 : }
319 :
320 94588800 : static INLINE void prepare_coeffs_2tap_avx2(
321 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
322 : __m256i *const coeffs /* [1] */) {
323 94588800 : const int16_t *filter = av1_get_interp_filter_subpel_kernel(
324 : *filter_params, subpel_q4 & SUBPEL_MASK);
325 :
326 189171300 : const __m128i coeff_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
327 94585600 : const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
328 :
329 : // coeffs 3 4 3 4 3 4 3 4
330 94585600 : coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
331 94585600 : }
332 :
333 1653570 : static INLINE void prepare_coeffs_4tap_avx2(
334 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
335 : __m256i *const coeffs /* [2] */) {
336 1653570 : const int16_t *filter = av1_get_interp_filter_subpel_kernel(
337 : *filter_params, subpel_q4 & SUBPEL_MASK);
338 :
339 1653560 : const __m128i coeff_8 = _mm_load_si128((__m128i *)filter);
340 1653560 : const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
341 :
342 : // coeffs 2 3 2 3 2 3 2 3
343 1653560 : coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
344 : // coeffs 4 5 4 5 4 5 4 5
345 1653560 : coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
346 1653560 : }
347 :
348 38615000 : static INLINE void prepare_coeffs_6tap_avx2(
349 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
350 : __m256i *const coeffs /* [3] */) {
351 38615000 : const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
352 : *filter_params, subpel_q4 & SUBPEL_MASK);
353 38612300 : const __m128i coeffs_8 = _mm_load_si128((__m128i *)filter);
354 38612300 : const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
355 :
356 : // coeffs 1 2 1 2 1 2 1 2
357 77224600 : coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
358 : // coeffs 3 4 3 4 3 4 3 4
359 77224600 : coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
360 : // coeffs 5 6 5 6 5 6 5 6
361 38612300 : coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
362 38612300 : }
363 :
364 171996300 : static INLINE void prepare_coeffs_8tap_avx2(
365 : const InterpFilterParams *const filter_params, const int32_t subpel_q4,
366 : __m256i *const coeffs /* [4] */) {
367 171996300 : const int16_t *filter = av1_get_interp_filter_subpel_kernel(
368 : *filter_params, subpel_q4 & SUBPEL_MASK);
369 :
370 171977800 : const __m128i coeff_8 = _mm_load_si128((__m128i *)filter);
371 171977800 : const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
372 :
373 : // coeffs 0 1 0 1 0 1 0 1
374 171977800 : coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
375 : // coeffs 2 3 2 3 2 3 2 3
376 171977800 : coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
377 : // coeffs 4 5 4 5 4 5 4 5
378 171977800 : coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
379 : // coeffs 6 7 6 7 6 7 6 7
380 171977800 : coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
381 171977800 : }
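// Two coefficient layouts coexist in this header: the prepare_half_coeffs_*
// helpers emit pre-halved 8-bit pairs for the maddubs paths that consume u8
// source pixels, while the prepare_coeffs_* helpers emit full 16-bit pairs
// for the madd paths that consume 16-bit intermediates. Either way, each
// output sample is the same weighted sum; a scalar reference sketch
// (hypothetical helper, Q7 taps summing to 1 << FILTER_BITS):
static inline int32_t convolve_8tap_scalar(const uint8_t *const src,
                                           const int16_t taps[8]) {
    int32_t sum = 0;
    for (int k = 0; k < 8; ++k)
        sum += (int32_t)src[k] * taps[k];
    return sum; // rounded and narrowed by the round helpers further below
}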
382 :
383 : static INLINE void load_16bit_5rows_avx2(const int16_t *const src,
384 : const int32_t stride, __m256i dst[5]) {
385 : dst[0] = _mm256_load_si256((__m256i *)(src + 0 * stride));
386 : dst[1] = _mm256_load_si256((__m256i *)(src + 1 * stride));
387 : dst[2] = _mm256_load_si256((__m256i *)(src + 2 * stride));
388 : dst[3] = _mm256_load_si256((__m256i *)(src + 3 * stride));
389 : dst[4] = _mm256_load_si256((__m256i *)(src + 4 * stride));
390 : }
391 :
392 15250300 : static INLINE void load_16bit_7rows_avx2(const int16_t *const src,
393 : const int32_t stride, __m256i dst[7]) {
394 15250300 : dst[0] = _mm256_load_si256((__m256i *)(src + 0 * stride));
395 15250300 : dst[1] = _mm256_load_si256((__m256i *)(src + 1 * stride));
396 15250300 : dst[2] = _mm256_load_si256((__m256i *)(src + 2 * stride));
397 15250300 : dst[3] = _mm256_load_si256((__m256i *)(src + 3 * stride));
398 15250300 : dst[4] = _mm256_load_si256((__m256i *)(src + 4 * stride));
399 15250300 : dst[5] = _mm256_load_si256((__m256i *)(src + 5 * stride));
400 15250300 : dst[6] = _mm256_load_si256((__m256i *)(src + 6 * stride));
401 15250300 : }
402 :
403 : SIMD_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
404 : const int32_t stride, __m256i dst[8]) {
405 0 : dst[0] = _mm256_load_si256((__m256i *)(src + 0 * stride));
406 0 : dst[1] = _mm256_load_si256((__m256i *)(src + 1 * stride));
407 0 : dst[2] = _mm256_load_si256((__m256i *)(src + 2 * stride));
408 0 : dst[3] = _mm256_load_si256((__m256i *)(src + 3 * stride));
409 0 : dst[4] = _mm256_load_si256((__m256i *)(src + 4 * stride));
410 0 : dst[5] = _mm256_load_si256((__m256i *)(src + 5 * stride));
411 0 : dst[6] = _mm256_load_si256((__m256i *)(src + 6 * stride));
412 0 : dst[7] = _mm256_load_si256((__m256i *)(src + 7 * stride));
413 0 : }
414 :
415 : SIMD_INLINE void loadu_unpack_16bit_5rows_avx2(const int16_t *const src,
416 : const int32_t stride,
417 : __m256i s_256[5],
418 : __m256i ss_256[5],
419 : __m256i tt_256[5]) {
420 21355300 : s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
421 21355300 : s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
422 21355300 : s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
423 21355300 : s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
424 21355300 : s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
425 :
426 21355300 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
427 21355300 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
428 21355300 : ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
429 21355300 : ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
430 :
431 21355300 : tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
432 21355300 : tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
433 21355300 : tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
434 21355300 : tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
435 21355300 : }
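// The ss/tt split above prepares two staggered vertical-filter windows so
// the y loops can emit two output rows per iteration (a sketch of the
// pairing, not a spec quote):
//   ss holds row pairs (r0,r1), (r2,r3), ... -> feeds the even output row
//   tt holds row pairs (r1,r2), (r3,r4), ... -> feeds the odd output row
// Each y_convolve_*x2 call below fills in the newest pair, and the window
// then slides down two source rows.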
436 :
437 : SIMD_INLINE void loadu_unpack_16bit_3rows_avx2(const int16_t *const src,
438 : const int32_t stride,
439 : __m256i s_256[3],
440 : __m256i ss_256[3],
441 : __m256i tt_256[3]) {
443 : s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
444 : s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
445 : s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
446 :
447 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
448 : ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
449 :
450 : tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
451 : tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
452 : }
453 :
455 37699500 : static INLINE void convolve_8tap_unpack_avx2(const __m256i s[6],
456 : __m256i ss[7]) {
457 37699500 : ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
458 37699500 : ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
459 37699500 : ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
460 37699500 : ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
461 37699500 : ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
462 37699500 : ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
463 37699500 : }
464 :
465 695131500 : static INLINE __m128i convolve_2tap_ssse3(const __m128i ss[1],
466 : const __m128i coeffs[1]) {
467 1390261000 : return _mm_maddubs_epi16(ss[0], coeffs[0]);
468 : }
469 :
470 30331937 : static INLINE __m128i convolve_4tap_ssse3(const __m128i ss[2],
471 : const __m128i coeffs[2]) {
472 30331937 : const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
473 60663870 : const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
474 30331937 : return _mm_add_epi16(res_23, res_45);
475 : }
476 :
477 15516945 : static INLINE __m128i convolve_6tap_ssse3(const __m128i ss[3],
478 : const __m128i coeffs[3]) {
479 15516945 : const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
480 15516945 : const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
481 31033890 : const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
482 15516945 : const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
483 15516945 : return _mm_add_epi16(res_1256, res_34);
484 : }
485 :
486 89584 : static INLINE __m128i convolve_8tap_ssse3(const __m128i ss[4],
487 : const __m128i coeffs[4]) {
488 89584 : const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
489 89584 : const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
490 89584 : const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
491 179168 : const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
492 89584 : const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
493 89584 : const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
494 89584 : return _mm_add_epi16(res_0145, res_2367);
495 : }
496 :
497 3591145700 : static INLINE __m256i convolve_2tap_avx2(const __m256i ss[1],
498 : const __m256i coeffs[1]) {
499 7182293000 : return _mm256_maddubs_epi16(ss[0], coeffs[0]);
500 : }
501 :
502 136633067 : static INLINE __m256i convolve_4tap_avx2(const __m256i ss[2],
503 : const __m256i coeffs[2]) {
504 136633067 : const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
505 273266154 : const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
506 136633067 : return _mm256_add_epi16(res_23, res_45);
507 : }
508 :
509 2353409000 : static INLINE __m256i convolve_6tap_avx2(const __m256i ss[3],
510 : const __m256i coeffs[3]) {
511 2353409000 : const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
512 2353409000 : const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
513 4706826000 : const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
514 2353409000 : const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
515 2353409000 : return _mm256_add_epi16(res_0145, res_23);
516 : }
517 :
518 3821450700 : static INLINE __m256i convolve_8tap_avx2(const __m256i ss[4],
519 : const __m256i coeffs[4]) {
520 3821450700 : const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
521 3821450700 : const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
522 3821450700 : const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
523 7642915700 : const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
524 3821450700 : const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
525 3821450700 : const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
526 3821450700 : return _mm256_add_epi16(res_0145, res_2367);
527 : }
528 :
529 0 : static INLINE __m128i convolve16_2tap_sse2(const __m128i ss[1],
530 : const __m128i coeffs[1]) {
531 0 : return _mm_madd_epi16(ss[0], coeffs[0]);
532 : }
533 :
534 0 : static INLINE __m128i convolve16_4tap_sse2(const __m128i ss[2],
535 : const __m128i coeffs[2]) {
536 0 : const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
537 0 : const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
538 0 : return _mm_add_epi32(res_01, res_23);
539 : }
540 :
541 0 : static INLINE __m128i convolve16_6tap_sse2(const __m128i ss[3],
542 : const __m128i coeffs[3]) {
543 0 : const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
544 0 : const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
545 0 : const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
546 0 : const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
547 0 : return _mm_add_epi32(res_0123, res_45);
548 : }
549 :
550 0 : static INLINE __m128i convolve16_8tap_sse2(const __m128i ss[4],
551 : const __m128i coeffs[4]) {
552 0 : const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
553 0 : const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
554 0 : const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
555 0 : const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
556 0 : const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
557 0 : const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
558 0 : return _mm_add_epi32(res_0123, res_4567);
559 : }
560 :
561 3359760000 : static INLINE __m256i convolve16_2tap_avx2(const __m256i ss[1],
562 : const __m256i coeffs[1]) {
563 6719510000 : return _mm256_madd_epi16(ss[0], coeffs[0]);
564 : }
565 :
566 7013640 : static INLINE __m256i convolve16_4tap_avx2(const __m256i ss[2],
567 : const __m256i coeffs[2]) {
568 7013640 : const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
569 14027300 : const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
570 7013640 : return _mm256_add_epi32(res_1, res_2);
571 : }
572 :
573 1659360000 : static INLINE __m256i convolve16_6tap_avx2(const __m256i ss[3],
574 : const __m256i coeffs[3]) {
575 1659360000 : const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
576 1659360000 : const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
577 3318730000 : const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
578 1659360000 : const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
579 1659360000 : return _mm256_add_epi32(res_0123, res_45);
580 : }
581 :
582 880097300 : static INLINE __m256i convolve16_8tap_avx2(const __m256i ss[4],
583 : const __m256i coeffs[4]) {
584 880097300 : const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
585 880097300 : const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
586 880097300 : const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
587 1760199000 : const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
588 880097300 : const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
589 880097300 : const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
590 880097300 : return _mm256_add_epi32(res_0123, res_4567);
591 : }
592 :
593 1708755000 : static INLINE __m256i x_convolve_6tap_avx2(const __m256i data,
594 : const __m256i coeffs[3],
595 : const __m256i *const filt) {
596 : __m256i ss[3];
597 :
598 1708755000 : ss[0] = _mm256_shuffle_epi8(data, filt[0]);
599 1708755000 : ss[1] = _mm256_shuffle_epi8(data, filt[1]);
600 1708755000 : ss[2] = _mm256_shuffle_epi8(data, filt[2]);
601 :
602 1708755000 : return convolve_6tap_avx2(ss, coeffs);
603 : }
604 :
605 706986300 : static INLINE __m256i x_convolve_8tap_avx2(const __m256i data,
606 : const __m256i coeffs[4],
607 : const __m256i *const filt) {
608 : __m256i ss[4];
609 :
610 706986300 : ss[0] = _mm256_shuffle_epi8(data, filt[0]);
611 706986300 : ss[1] = _mm256_shuffle_epi8(data, filt[1]);
612 706986300 : ss[2] = _mm256_shuffle_epi8(data, filt[2]);
613 706986300 : ss[3] = _mm256_shuffle_epi8(data, filt[3]);
614 :
615 706986300 : return convolve_8tap_avx2(ss, coeffs);
616 : }
617 :
618 556173000 : static INLINE __m256i sr_y_round_avx2(const __m256i src) {
619 556173000 : const __m256i round = _mm256_set1_epi16(32);
620 556173000 : const __m256i dst = _mm256_add_epi16(src, round);
621 556173000 : return _mm256_srai_epi16(dst, FILTER_BITS - 1);
622 : }
623 :
624 669909000 : static INLINE __m128i xy_x_round_sse2(const __m128i src) {
625 669909000 : const __m128i round = _mm_set1_epi16(2);
626 669909000 : const __m128i dst = _mm_add_epi16(src, round);
627 669909000 : return _mm_srai_epi16(dst, 2);
628 : }
629 :
630 3327560000 : static INLINE __m256i xy_x_round_avx2(const __m256i src) {
631 3327560000 : const __m256i round = _mm256_set1_epi16(2);
632 3327560000 : const __m256i dst = _mm256_add_epi16(src, round);
633 3327560000 : return _mm256_srai_epi16(dst, 2);
634 : }
635 :
636 0 : static INLINE void xy_x_round_store_2x2_sse2(const __m128i res,
637 : int16_t *const dst) {
638 0 : const __m128i d = xy_x_round_sse2(res);
639 0 : _mm_storel_epi64((__m128i *)dst, d);
640 0 : }
641 :
642 12750800 : static INLINE void xy_x_round_store_4x2_sse2(const __m128i res,
643 : int16_t *const dst) {
644 12750800 : const __m128i d = xy_x_round_sse2(res);
645 : _mm_store_si128((__m128i *)dst, d);
646 12750100 : }
647 :
648 330723000 : static INLINE void xy_x_round_store_8x2_sse2(const __m128i res[2],
649 : int16_t *const dst) {
650 : __m128i r[2];
651 :
652 330723000 : r[0] = xy_x_round_sse2(res[0]);
653 330286000 : r[1] = xy_x_round_sse2(res[1]);
654 330583000 : _mm_store_si128((__m128i *)dst, r[0]);
655 330583000 : _mm_store_si128((__m128i *)(dst + 8), r[1]);
656 330583000 : }
657 :
658 265329000 : static INLINE void xy_x_round_store_8x2_avx2(const __m256i res,
659 : int16_t *const dst) {
660 265329000 : const __m256i d = xy_x_round_avx2(res);
661 : _mm256_store_si256((__m256i *)dst, d);
662 265926000 : }
663 :
664 537305000 : static INLINE void xy_x_round_store_32_avx2(const __m256i res[2],
665 : int16_t *const dst) {
666 : __m256i r[2];
667 :
668 537305000 : r[0] = xy_x_round_avx2(res[0]);
669 540783000 : r[1] = xy_x_round_avx2(res[1]);
670 : const __m256i d0 =
671 543167000 : _mm256_inserti128_si256(r[0], _mm256_extracti128_si256(r[1], 0), 1);
672 : const __m256i d1 =
673 543167000 : _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
674 : _mm256_store_si256((__m256i *)dst, d0);
675 543167000 : _mm256_store_si256((__m256i *)(dst + 16), d1);
676 543167000 : }
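// Lane bookkeeping for the store above, taking the 2-tap 32-wide horizontal
// path as the example (the other 32-wide callers produce the same layout):
//   r[0] = { px  0..7  | px 16..23 }   (128-bit lanes)
//   r[1] = { px  8..15 | px 24..31 }
//   d0   = { r[0].lo | r[1].lo } = px  0..15
//   d1   = { r[0].hi | r[1].hi } = px 16..31
// so the 16-bit intermediate buffer ends up in plain row order.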
677 :
678 : static INLINE __m128i xy_y_round_sse2(const __m128i src) {
679 : const __m128i round = _mm_set1_epi32(1024);
680 : const __m128i dst = _mm_add_epi32(src, round);
681 : return _mm_srai_epi32(dst, 11);
682 : }
683 :
684 : static INLINE __m128i xy_y_round_half_pel_sse2(const __m128i src) {
685 : const __m128i round = _mm_set1_epi16(16);
686 : const __m128i dst = _mm_add_epi16(src, round);
687 : return _mm_srai_epi16(dst, 5);
688 : }
689 :
690 : static INLINE __m256i xy_y_round_avx2(const __m256i src) {
691 : const __m256i round = _mm256_set1_epi32(1024);
692 : const __m256i dst = _mm256_add_epi32(src, round);
693 : return _mm256_srai_epi32(dst, 11);
694 : }
695 :
696 : static INLINE __m256i xy_y_round_16_avx2(const __m256i r[2]) {
697 : const __m256i r0 = xy_y_round_avx2(r[0]);
698 : const __m256i r1 = xy_y_round_avx2(r[1]);
699 : return _mm256_packs_epi32(r0, r1);
700 : }
701 :
702 : static INLINE __m256i xy_y_round_half_pel_avx2(const __m256i src) {
703 : const __m256i round = _mm256_set1_epi16(16);
704 : const __m256i dst = _mm256_add_epi16(src, round);
705 : return _mm256_srai_epi16(dst, 5);
706 : }
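// Rounding accounting for the 8-bit paths, assuming FILTER_BITS == 7 and
// taps that sum to 128 in Q7 (a sketch of how the constants line up):
//   2D (xy) horizontal: halved taps (Q6) * u8 -> Q6; (+2) >> 2      -> Q4 buf
//   2D (xy) vertical:   full taps (Q7) * Q4  -> Q11; (+1024) >> 11  -> Q0 px
//   vertical-only (sr_y): halved taps -> Q6; (+32) >> (FILTER_BITS - 1) -> Q0
//   xy half-pel vertical adds two Q4 rows (implicit /2): (+16) >> 5 -> Q0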
707 :
708 225212 : static INLINE __m128i jnt_y_round_sse2(const __m128i src) {
709 225212 : const __m128i round = _mm_set1_epi16(2);
710 225212 : const __m128i dst = _mm_add_epi16(src, round);
711 225212 : return _mm_srai_epi16(dst, 2);
712 : }
713 :
714 231354000 : static INLINE __m256i jnt_y_round_avx2(const __m256i src) {
715 231354000 : const __m256i round = _mm256_set1_epi16(2);
716 231354000 : const __m256i dst = _mm256_add_epi16(src, round);
717 231354000 : return _mm256_srai_epi16(dst, 2);
718 : }
719 :
720 351612 : static INLINE __m128i jnt_avg_round_sse2(const __m128i src,
721 : const __m128i offset) {
722 351612 : const __m128i dst = _mm_add_epi16(src, offset);
723 351612 : return _mm_srai_epi16(dst, 2);
724 : }
725 :
726 276725000 : static INLINE __m256i jnt_avg_round_avx2(const __m256i src,
727 : const __m256i offset) {
728 276725000 : const __m256i dst = _mm256_add_epi16(src, offset);
729 276725000 : return _mm256_srai_epi16(dst, 2);
730 : }
731 :
732 733242 : static INLINE __m128i jnt_no_avg_round_sse2(const __m128i src,
733 : const __m128i offset) {
734 733242 : const __m128i dst = _mm_add_epi16(src, offset);
735 733242 : return _mm_srli_epi16(dst, 2);
736 : }
737 :
738 1024650000 : static INLINE __m256i jnt_no_avg_round_avx2(const __m256i src,
739 : const __m256i offset) {
740 1024650000 : const __m256i dst = _mm256_add_epi16(src, offset);
741 1024650000 : return _mm256_srli_epi16(dst, 2);
742 : }
743 :
744 197450 : static INLINE void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
745 : const int32_t stride) {
746 197450 : const __m128i d = _mm_packus_epi16(res, res);
747 197450 : *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
748 197450 : *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
749 197450 : }
750 :
751 35268622 : static INLINE void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
752 : const int32_t stride) {
753 35268622 : const __m128i d = _mm_packus_epi16(res, res);
754 : store_u8_4x2_sse2(d, dst, stride);
755 35268722 : }
756 :
757 : static INLINE void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
758 : const int32_t stride) {
759 : const __m256i d = _mm256_packus_epi16(res, res);
760 : const __m128i d0 = _mm256_castsi256_si128(d);
761 : const __m128i d1 = _mm256_extracti128_si256(d, 1);
762 : xx_storel_32(dst, d0);
763 : xx_storel_32(dst + stride, d1);
764 : }
765 :
766 366878800 : static INLINE void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
767 : const int32_t stride) {
768 366878800 : const __m256i d = _mm256_packus_epi16(res, res);
769 366878800 : const __m128i d0 = _mm256_castsi256_si128(d);
770 366878800 : const __m128i d1 = _mm256_extracti128_si256(d, 1);
771 366878800 : _mm_storel_epi64((__m128i *)dst, d0);
772 366878800 : _mm_storel_epi64((__m128i *)(dst + stride), d1);
773 366878800 : }
774 :
775 236666400 : static INLINE void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
776 : uint8_t *const dst,
777 : const int32_t stride) {
778 236666400 : const __m256i d = _mm256_packus_epi16(res0, res1);
779 236666400 : storeu_u8_16x2_avx2(d, dst, stride);
780 236536900 : }
781 :
782 148583000 : static INLINE void xy_y_pack_store_16x2_avx2(const __m256i res0,
783 : const __m256i res1,
784 : uint8_t *const dst,
785 : const int32_t stride) {
786 148583000 : const __m256i t = _mm256_packus_epi16(res0, res1);
787 148583000 : const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
788 148583000 : storeu_u8_16x2_avx2(d, dst, stride);
789 148638000 : }
790 :
791 : static INLINE void xy_y_round_store_2x2_sse2(const __m128i res,
792 : uint8_t *const dst,
793 : const int32_t stride) {
794 : const __m128i r = xy_y_round_sse2(res);
795 : const __m128i rr = _mm_packs_epi32(r, r);
796 : pack_store_2x2_sse2(rr, dst, stride);
797 : }
798 :
799 : static INLINE void xy_y_round_store_4x2_avx2(const __m256i res,
800 : uint8_t *const dst,
801 : const int32_t stride) {
802 : const __m256i r = xy_y_round_avx2(res);
803 : const __m256i rr = _mm256_packs_epi32(r, r);
804 : pack_store_4x2_avx2(rr, dst, stride);
805 : }
806 :
807 : static INLINE void xy_y_pack_store_32_avx2(const __m256i res0,
808 : const __m256i res1,
809 : uint8_t *const dst) {
810 : const __m256i d = _mm256_packus_epi16(res0, res1);
811 : // d = _mm256_permute4x64_epi64(d, 0xD8);
812 : _mm256_storeu_si256((__m256i *)dst, d);
813 : }
814 :
815 : static INLINE void xy_y_round_store_32_avx2(const __m256i r0[2],
816 : const __m256i r1[2],
817 : uint8_t *const dst) {
818 : const __m256i ra = xy_y_round_16_avx2(r0);
819 : const __m256i rb = xy_y_round_16_avx2(r1);
820 : xy_y_pack_store_32_avx2(ra, rb, dst);
821 : }
822 :
823 820942000 : static INLINE void convolve_store_32_avx2(const __m256i res0,
824 : const __m256i res1,
825 : uint8_t *const dst) {
826 820942000 : const __m256i d = _mm256_packus_epi16(res0, res1);
827 : _mm256_storeu_si256((__m256i *)dst, d);
828 820942000 : }
829 :
830 1122091000 : static INLINE void jnt_no_avg_store_16x2_avx2(const __m256i src0,
831 : const __m256i src1,
832 : ConvBufType *const dst,
833 : const int32_t stride) {
834 : const __m256i d0 =
835 1122091000 : _mm256_inserti128_si256(src0, _mm256_extracti128_si256(src1, 0), 1);
836 : const __m256i d1 =
837 1122091000 : _mm256_inserti128_si256(src1, _mm256_extracti128_si256(src0, 1), 0);
838 : _mm256_storeu_si256((__m256i *)dst, d0);
839 1122091000 : _mm256_storeu_si256((__m256i *)(dst + stride), d1);
840 1122091000 : }
841 :
842 0 : static INLINE __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
843 : const int32_t stride,
844 : const __m128i coeffs[1]) {
845 : const __m128i sfl =
846 0 : _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
847 0 : const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
848 0 : const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
849 0 : return convolve_2tap_ssse3(&ss, coeffs);
850 : }
851 :
852 1468830 : static INLINE __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
853 : const int32_t stride,
854 : const __m128i coeffs[1]) {
855 : const __m128i sfl =
856 1468830 : _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
857 1468830 : const __m128i s_128 = load_u8_8x2_sse2(src, stride);
858 1468840 : const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
859 1468840 : return convolve_2tap_ssse3(&ss, coeffs);
860 : }
861 :
862 338881390 : static INLINE void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
863 : const int32_t stride,
864 : const __m128i coeffs[1],
865 : __m128i r[2]) {
866 : __m128i ss[2];
867 338881390 : const __m128i s00 = _mm_loadu_si128((__m128i *)src);
868 338881390 : const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
869 338881390 : const __m128i s01 = _mm_srli_si128(s00, 1);
870 338881390 : const __m128i s11 = _mm_srli_si128(s10, 1);
871 338881390 : ss[0] = _mm_unpacklo_epi8(s00, s01);
872 338881390 : ss[1] = _mm_unpacklo_epi8(s10, s11);
873 :
874 338881390 : r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
875 339113580 : r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
876 338544480 : }
877 :
878 61452000 : static INLINE __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
879 : const int32_t stride,
880 : const __m256i coeffs[1]) {
881 : __m128i s_128[2][2];
882 : __m256i s_256[2];
883 :
884 61452000 : s_128[0][0] = _mm_loadu_si128((__m128i *)src);
885 61452000 : s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
886 61452000 : s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
887 61452000 : s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
888 61452000 : s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
889 61452000 : s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
890 61452000 : const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
891 61452000 : return convolve_2tap_avx2(&ss, coeffs);
892 : }
893 :
894 399526810 : static INLINE void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
895 : const int32_t stride,
896 : const __m256i coeffs[1],
897 : __m256i r[2]) {
898 : __m128i s_128[2][2];
899 : __m256i s_256[2];
900 :
901 399526810 : s_128[0][0] = _mm_loadu_si128((__m128i *)src);
902 399526810 : s_128[0][1] = _mm_loadu_si128((__m128i *)(src + 1));
903 399526810 : s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
904 399526810 : s_128[1][1] = _mm_loadu_si128((__m128i *)(src + stride + 1));
905 399526810 : s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
906 399526810 : s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
907 399526810 : const __m256i s0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
908 399526810 : const __m256i s1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
909 399526810 : r[0] = convolve_2tap_avx2(&s0, coeffs);
910 399250750 : r[1] = convolve_2tap_avx2(&s1, coeffs);
911 398219110 : }
912 :
913 171934100 : static INLINE void x_convolve_2tap_32_avx2(const uint8_t *const src,
914 : const __m256i coeffs[1],
915 : __m256i r[2]) {
916 171934100 : const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
917 343868100 : const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
918 171934100 : const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
919 171934100 : const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
920 :
921 171934100 : r[0] = convolve_2tap_avx2(&ss0, coeffs);
922 172021400 : r[1] = convolve_2tap_avx2(&ss1, coeffs);
923 171898200 : }
924 :
925 104946 : static INLINE __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
926 : const int32_t stride,
927 : const __m128i coeffs[2]) {
928 : const __m128i sfl0 =
929 104946 : _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
930 : const __m128i sfl1 =
931 104946 : _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
932 104946 : const __m128i s = load_u8_8x2_sse2(src, stride);
933 : __m128i ss[2];
934 :
935 104946 : ss[0] = _mm_shuffle_epi8(s, sfl0);
936 104946 : ss[1] = _mm_shuffle_epi8(s, sfl1);
937 104946 : return convolve_4tap_ssse3(ss, coeffs);
938 : }
939 :
940 28868090 : static INLINE __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
941 : const int32_t stride,
942 : const __m128i coeffs[2]) {
943 28868090 : const __m128i s = load_u8_8x2_sse2(src, stride);
944 : const __m128i sfl0 =
945 28868302 : _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
946 : const __m128i sfl1 =
947 28868302 : _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
948 : __m128i ss[2];
949 :
950 28868302 : ss[0] = _mm_shuffle_epi8(s, sfl0);
951 28868302 : ss[1] = _mm_shuffle_epi8(s, sfl1);
952 28868302 : return convolve_4tap_ssse3(ss, coeffs);
953 : }
954 :
955 275830700 : static INLINE __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
956 : const int32_t stride,
957 : const __m256i coeffs[3],
958 : const __m256i *const filt) {
959 275830700 : const __m128i s0 = _mm_loadu_si128((__m128i *)src);
960 275830700 : const __m128i s1 = _mm_loadu_si128((__m128i *)(src + stride));
961 275830700 : const __m256i s_256 = _mm256_setr_m128i(s0, s1);
962 275830700 : return x_convolve_6tap_avx2(s_256, coeffs, filt);
963 : }
964 :
965 727784700 : static INLINE void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
966 : const int32_t src_stride,
967 : const __m256i coeffs[3],
968 : const __m256i *const filt,
969 : __m256i r[2]) {
970 727784700 : const __m128i s0_128 = _mm_loadu_si128((__m128i *)src);
971 727784700 : const __m128i s1_128 = _mm_loadu_si128((__m128i *)(src + src_stride));
972 727784700 : const __m128i s2_128 = _mm_loadu_si128((__m128i *)(src + 8));
973 727784700 : const __m128i s3_128 = _mm_loadu_si128((__m128i *)(src + src_stride + 8));
974 727784700 : const __m256i s0_256 = _mm256_setr_m128i(s0_128, s1_128);
975 727784700 : const __m256i s1_256 = _mm256_setr_m128i(s2_128, s3_128);
976 :
977 727784700 : r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
978 727960000 : r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
979 726174700 : }
980 :
981 100278630 : static INLINE __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
982 : const int32_t stride,
983 : const __m256i coeffs[4],
984 : const __m256i *const filt) {
985 100278630 : const __m128i s0 = _mm_loadu_si128((__m128i *)src);
986 100278630 : const __m128i s1 = _mm_loadu_si128((__m128i *)(src + stride));
987 100278630 : const __m256i s_256 = _mm256_setr_m128i(s0, s1);
988 100278630 : return x_convolve_8tap_avx2(s_256, coeffs, filt);
989 : }
990 :
991 : SIMD_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
992 : const int32_t src_stride,
993 : const __m256i coeffs[4],
994 : const __m256i *const filt,
995 : __m256i r[2]) {
996 283220600 : const __m128i s0_128 = _mm_loadu_si128((__m128i *)src);
997 283220600 : const __m128i s1_128 = _mm_loadu_si128((__m128i *)(src + src_stride));
998 283220600 : const __m128i s2_128 = _mm_loadu_si128((__m128i *)(src + 8));
999 283220600 : const __m128i s3_128 = _mm_loadu_si128((__m128i *)(src + src_stride + 8));
1000 283220600 : const __m256i s0_256 = _mm256_setr_m128i(s0_128, s1_128);
1001 283220600 : const __m256i s1_256 = _mm256_setr_m128i(s2_128, s3_128);
1002 :
1003 283220600 : r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1004 283066200 : r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1005 282875100 : }
1006 :
1007 0 : static INLINE __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
1008 : const int32_t stride,
1009 : const __m128i coeffs[1],
1010 : __m128i s_16[2]) {
1011 : __m128i s_128[2];
1012 :
1013 0 : s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1014 0 : s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1015 0 : s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1016 0 : s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1017 0 : const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1018 0 : return convolve_2tap_ssse3(&ss, coeffs);
1019 : }
1020 :
1021 1559280 : static INLINE __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
1022 : const int32_t stride,
1023 : const __m128i coeffs[1],
1024 : __m128i s_32[2]) {
1025 : __m128i s_128[2];
1026 :
1027 1559280 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + stride));
1028 1559280 : s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1029 1559280 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride));
1030 1559280 : s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1031 1559280 : const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1032 1559280 : return convolve_2tap_ssse3(&ss, coeffs);
1033 : }
1034 :
1035 61018500 : static INLINE __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
1036 : const int32_t stride,
1037 : const __m256i coeffs[1],
1038 : __m128i s_64[2]) {
1039 : __m256i s_256[2];
1040 :
1041 61018500 : s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
1042 61018500 : s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1043 61018500 : s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1044 61018500 : s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
1045 61018500 : const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1046 61018500 : return convolve_2tap_avx2(&ss, coeffs);
1047 : }
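// Hypothetical usage of the ping-pong registers above (a sketch, not the
// file's actual kernel; h assumed even): s_64[0] and s_64[1] alternate
// between "oldest" and "newest" row, so every source row is loaded from
// memory exactly once even though two output rows are produced per pass.
static void sr_y_2tap_8xh_sketch(const uint8_t *src, const int32_t src_stride,
                                 const __m256i coeffs[1], uint8_t *dst,
                                 const int32_t dst_stride, int32_t h) {
    __m128i s_64[2];
    s_64[0] = _mm_loadl_epi64((__m128i *)src);
    do {
        const __m256i res =
            y_convolve_2tap_8x2_avx2(src, src_stride, coeffs, s_64);
        const __m256i r = sr_y_round_avx2(res);
        pack_store_8x2_avx2(r, dst, dst_stride);
        src += 2 * src_stride;
        dst += 2 * dst_stride;
        h -= 2;
    } while (h);
}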
1048 :
1049 73657410 : static INLINE void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
1050 : const int32_t stride,
1051 : const __m256i coeffs[1],
1052 : __m128i s_128[2], __m256i r[2]) {
1053 : __m256i s_256[2];
1054 :
1055 73657410 : s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1056 73657410 : s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1057 73657410 : s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1058 73657410 : s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1059 73657410 : const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1060 73657410 : const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1061 73657410 : r[0] = convolve_2tap_avx2(&ss0, coeffs);
1062 73684640 : r[1] = convolve_2tap_avx2(&ss1, coeffs);
1063 73674390 : }
1064 :
1065 162559100 : static INLINE void y_convolve_2tap_32_avx2(const uint8_t *const src,
1066 : const __m256i coeffs[1],
1067 : const __m256i s0, __m256i *const s1,
1068 : __m256i r[2]) {
1069 162559100 : *s1 = _mm256_loadu_si256((__m256i *)src);
1070 162559100 : const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1071 162559100 : const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1072 162559100 : r[0] = convolve_2tap_avx2(&ss0, coeffs);
1073 162621500 : r[1] = convolve_2tap_avx2(&ss1, coeffs);
1074 162516200 : }
1075 :
1076 26776 : static INLINE __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
1077 : const int32_t stride,
1078 : const __m128i coeffs[2],
1079 : __m128i s_16[4],
1080 : __m128i ss_128[2]) {
1081 26776 : s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1082 26776 : const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1083 26776 : s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1084 26776 : const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1085 26776 : ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1086 26776 : return convolve_4tap_ssse3(ss_128, coeffs);
1087 : }
1088 :
1089 1328956 : static INLINE __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
1090 : const int32_t stride,
1091 : const __m128i coeffs[2],
1092 : __m128i s_32[4],
1093 : __m128i ss_128[2]) {
1094 1328956 : s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src + stride));
1095 1328956 : const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1096 1328956 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride));
1097 1328956 : const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1098 1328956 : ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1099 1328956 : return convolve_4tap_ssse3(ss_128, coeffs);
1100 : }
1101 :
1102 1923966 : static INLINE __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
1103 : const int32_t stride,
1104 : const __m256i coeffs[2],
1105 : __m128i s_64[4],
1106 : __m256i ss_256[2]) {
1107 1923966 : s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1108 1923966 : const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1109 1923966 : s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1110 1923966 : const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1111 1923966 : ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1112 1923966 : return convolve_4tap_avx2(ss_256, coeffs);
1113 : }
1114 :
1115 2751104 : static INLINE void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
1116 : const int32_t stride,
1117 : const __m256i coeffs[2],
1118 : __m128i s_128[4],
1119 : __m256i ss_256[4], __m256i r[2]) {
1120 2751104 : s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1121 2751104 : const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1122 2751104 : s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1123 2751104 : const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1124 2751104 : ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1125 2751104 : ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1126 2751104 : r[0] = convolve_4tap_avx2(ss_256, coeffs);
1127 2751146 : r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1128 2751115 : }
1129 :
1130 65728 : static INLINE __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
1131 : const int32_t stride,
1132 : const __m128i coeffs[3],
1133 : __m128i s_16[6],
1134 : __m128i ss_128[3]) {
1135 65728 : s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src + 3 * stride));
1136 65728 : const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1137 65728 : s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src + 4 * stride));
1138 65728 : const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1139 65728 : ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1140 65728 : return convolve_6tap_ssse3(ss_128, coeffs);
1141 : }
1142 :
1143 11086 : static INLINE void y_convolve_4tap_32x2_avx2(
1144 : const uint8_t *const src, const int32_t stride, const __m256i coeffs[2],
1145 : __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1146 11086 : s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1147 11086 : ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1148 11086 : ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1149 11086 : s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1150 11086 : tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1151 11086 : tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1152 11086 : r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1153 11086 : r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1154 11086 : r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1155 11086 : r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1156 11086 : }
1157 :
1158 15450542 : static INLINE __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1159 : const int32_t stride,
1160 : const __m128i coeffs[3],
1161 : __m128i s_32[6],
1162 : __m128i ss_128[3]) {
1163 15450542 : s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src + 3 * stride));
1164 15450542 : const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1165 15450542 : s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src + 4 * stride));
1166 15450542 : const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1167 15450542 : ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1168 15450542 : return convolve_6tap_ssse3(ss_128, coeffs);
1169 : }
1170 :
1171 90980600 : static INLINE __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
1172 : const int32_t stride,
1173 : const __m256i coeffs[3],
1174 : __m128i s_64[6],
1175 : __m256i ss_256[3]) {
1176 90980600 : s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1177 90980600 : const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1178 90980600 : s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1179 90980600 : const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1180 90980600 : ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1181 90980600 : return convolve_6tap_avx2(ss_256, coeffs);
1182 : }
1183 :
1184 89805200 : static INLINE void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
1185 : const int32_t stride,
1186 : const __m256i coeffs[3],
1187 : __m128i s_128[6],
1188 : __m256i ss_256[6], __m256i r[2]) {
1189 89805200 : s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1190 89805200 : const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1191 89805200 : s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1192 89805200 : const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1193 89805200 : ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1194 89805200 : ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1195 89805200 : r[0] = convolve_6tap_avx2(ss_256, coeffs);
1196 89819500 : r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1197 89812000 : }
1198 :
1199 96092600 : static INLINE void y_convolve_6tap_32x2_avx2(
1200 : const uint8_t *const src, const int32_t stride, const __m256i coeffs[3],
1201 : __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1202 96092600 : s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1203 96092600 : ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1204 96092600 : ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1205 96092600 : s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1206 96092600 : tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1207 96092600 : tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1208 96092600 : r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1209 96100800 : r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1210 96077600 : r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1211 96061600 : r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1212 96066500 : }
1213 :
1214 0 : static INLINE __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
1215 : const int32_t stride,
1216 : const __m128i coeffs[4],
1217 : __m128i s_16[8],
1218 : __m128i ss_128[4]) {
1219 0 : s_16[7] = _mm_cvtsi32_si128(*(int16_t *)(src + 7 * stride));
1220 0 : const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1221 0 : s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src + 8 * stride));
1222 0 : const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1223 0 : ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1224 0 : return convolve_8tap_ssse3(ss_128, coeffs);
1225 : }
1226 :
1227 89584 : static INLINE __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
1228 : const int32_t stride,
1229 : const __m128i coeffs[4],
1230 : __m128i s_32[8],
1231 : __m128i ss_128[4]) {
1232 89584 : s_32[7] = _mm_cvtsi32_si128(*(int32_t *)(src + 7 * stride));
1233 89584 : const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1234 89584 : s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src + 8 * stride));
1235 89584 : const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1236 89584 : ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1237 89584 : return convolve_8tap_ssse3(ss_128, coeffs);
1238 : }
1239 :
1240 16576230 : static INLINE __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
1241 : const int32_t stride,
1242 : const __m256i coeffs[4],
1243 : __m128i s_64[8],
1244 : __m256i ss_256[4]) {
1245 16576230 : s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1246 16576230 : const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1247 16576230 : s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1248 16576230 : const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1249 16576230 : ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1250 16576230 : return convolve_8tap_avx2(ss_256, coeffs);
1251 : }
1252 :
1253 16636140 : static INLINE void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
1254 : const int32_t stride,
1255 : const __m256i coeffs[4],
1256 : __m128i s_128[8],
1257 :                                              __m256i ss_256[8], __m256i r[2]) {
1258 16636140 : s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1259 16636140 : const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1260 16636140 : s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1261 16636140 : const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1262 16636140 : ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1263 16636140 : ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1264 16636140 : r[0] = convolve_8tap_avx2(ss_256, coeffs);
1265 16637710 : r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1266 16637650 : }
1267 :
1268 20257350 : static INLINE void y_convolve_8tap_32x2_avx2(
1269 : const uint8_t *const src, const int32_t stride, const __m256i coeffs[4],
1270 : __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1271 20257350 : s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1272 20257350 : ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1273 20257350 : ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1274 20257350 : s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1275 20257350 : tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1276 20257350 : tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1277 20257350 : r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1278 20258850 : r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1279 20258370 : r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1280 20257420 : r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1281 20257030 : }
1282 :
1283 687460000 : static INLINE void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
1284 : const __m256i coeffs[1],
1285 : __m256i r[2]) {
1286 687460000 : const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1287 1374920000 : const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1288 687460000 : const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1289 687460000 : const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1290 :
1291 687460000 : r[0] = convolve_2tap_avx2(&ss0, coeffs);
1292 687282000 : r[1] = convolve_2tap_avx2(&ss1, coeffs);
1293 683459000 : }
1294 :
1295 688062000 : static INLINE void xy_x_2tap_32_avx2(const uint8_t *const src,
1296 : const __m256i coeffs[1],
1297 : int16_t *const dst) {
1298 : __m256i r[2];
1299 :
1300 688062000 : xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1301 683510000 : const __m256i d0 = xy_x_round_avx2(r[0]);
1302 687328000 : const __m256i d1 = xy_x_round_avx2(r[1]);
1303 : // d0 = _mm256_inserti128_si256(d0, _mm256_extracti128_si256(d1, 0), 1);
1304 : // d1 = _mm256_inserti128_si256(d1, _mm256_extracti128_si256(d0, 1), 0);
1305 : _mm256_store_si256((__m256i *)dst, d0);
1306 688865000 : _mm256_store_si256((__m256i *)(dst + 16), d1);
1307 688865000 : }
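
// xy_x_2tap_32_avx2 is the horizontal first pass of the 2D (xy) filter: it
// blends each pixel with its right neighbor and stores a rounded 16-bit
// intermediate for the vertical pass to consume. A hedged scalar model of the
// per-pixel arithmetic (the exact rounding is whatever xy_x_round_avx2
// applies):

static INLINE int32_t xy_x_2tap_scalar(const uint8_t *const src,
                                       const int16_t coeffs[2]) {
    return (int32_t)src[0] * coeffs[0] + (int32_t)src[1] * coeffs[1];
}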
1308 :
1309 299944000 : static INLINE void xy_x_6tap_32_avx2(const uint8_t *const src,
1310 : const int32_t src_stride,
1311 : const __m256i coeffs[3],
1312 : const __m256i *const filt,
1313 : int16_t *const dst) {
1314 : __m256i r[2];
1315 :
1316 299944000 : x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
1317 299404000 : const __m256i d0 = xy_x_round_avx2(r[0]);
1318 299989000 : const __m256i d1 = xy_x_round_avx2(r[1]);
1319 : _mm256_store_si256((__m256i *)dst, d0);
1320 300218000 : _mm256_store_si256((__m256i *)(dst + 16), d1);
1321 300218000 : }
1322 :
1323 160006000 : static INLINE void xy_x_8tap_32_avx2(const uint8_t *const src,
1324 : const int32_t src_stride,
1325 : const __m256i coeffs[4],
1326 : const __m256i *const filt,
1327 : int16_t *const dst) {
1328 : __m256i r[2];
1329 :
1330 : x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
1331 159728000 : const __m256i d0 = xy_x_round_avx2(r[0]);
1332 159874000 : const __m256i d1 = xy_x_round_avx2(r[1]);
1333 : _mm256_store_si256((__m256i *)dst, d0);
1334 160038000 : _mm256_store_si256((__m256i *)(dst + 16), d1);
1335 160038000 : }
1336 :
1337 0 : static INLINE __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
1338 : __m128i s_32[2],
1339 : const __m128i coeffs[1]) {
1340 : __m128i s_128[2];
1341 :
1342 0 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + 2));
1343 0 : s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1344 0 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * 2));
1345 0 : s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1346 0 : const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1347 0 : return convolve16_2tap_sse2(&ss, coeffs);
1348 : }
1349 :
1350 0 : static INLINE __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
1351 : const int16_t *const src, __m128i s_32[2]) {
1352 : __m128i s_128[2];
1353 :
1354 0 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + 2));
1355 0 : s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1356 0 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * 2));
1357 0 : s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1358 0 : return _mm_add_epi16(s_128[0], s_128[1]);
1359 : }
1360 :
1361 0 : static INLINE void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
1362 : __m128i s_64[2],
1363 : const __m128i coeffs[1],
1364 : __m128i r[2]) {
1365 : __m128i s_128[2];
1366 :
1367 0 : s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1368 0 : s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1369 0 : s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1370 0 : s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1371 0 : const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1372 0 : const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1373 0 : r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1374 0 : r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1375 0 : }
1376 :
1377 0 : static INLINE __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
1378 : const int16_t *const src, __m128i s_64[2]) {
1379 : __m128i s_128[2];
1380 :
1381 0 : s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1382 0 : s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1383 0 : s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1384 0 : s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1385 0 : return _mm_add_epi16(s_128[0], s_128[1]);
1386 : }
1387 :
1388 1704090000 : static INLINE void xy_y_convolve_2tap_16_avx2(const __m256i s0,
1389 : const __m256i s1,
1390 : const __m256i coeffs[1],
1391 : __m256i r[2]) {
1392 1704090000 : const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1393 1704090000 : const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1394 1704090000 : r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1395 1698140000 : r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1396 1684730000 : }
1397 :
1398 235131000 : static INLINE void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
1399 : __m128i s_128[2],
1400 : const __m256i coeffs[1],
1401 : __m256i r[2]) {
1402 : __m256i s_256[2];
1403 235131000 : s_128[1] = _mm_load_si128((__m128i *)(src + 8));
1404 235131000 : s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1405 235131000 : s_128[0] = _mm_load_si128((__m128i *)(src + 2 * 8));
1406 235131000 : s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1407 235131000 : xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1408 234854000 : }
1409 :
1410 56561000 : static INLINE __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
1411 : const int16_t *const src, __m128i s_128[2]) {
1412 : __m256i s_256[2];
1413 56561000 : s_128[1] = _mm_load_si128((__m128i *)(src + 8));
1414 56561000 : s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1415 56561000 : s_128[0] = _mm_load_si128((__m128i *)(src + 2 * 8));
1416 56561000 : s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1417 113122000 : return _mm256_add_epi16(s_256[0], s_256[1]);
1418 : }
1419 :
1420 59906500 : static INLINE void xy_y_convolve_2tap_16x2_half_pel_avx2(
1421 : const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1422 59906500 : s_256[1] = _mm256_load_si256((__m256i *)(src + 16));
1423 59906500 : r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1424 59906500 : s_256[0] = _mm256_load_si256((__m256i *)(src + 2 * 16));
1425 59906500 : r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1426 59906500 : }
1427 :
1428 : static INLINE void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
1429 : const int32_t stride) {
1430 : const __m256i t = _mm256_packus_epi16(r[0], r[1]);
1431 : const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
1432 : storeu_u8_16x2_avx2(d, dst, stride);
1433 : }
1434 :
1435 234788000 : static INLINE void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
1436 : __m256i s[2],
1437 : const __m256i coeffs[1],
1438 : __m256i r[4]) {
1439 234788000 : s[1] = _mm256_load_si256((__m256i *)(src + 16));
1440 234788000 : xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r);
1441 233845000 : s[0] = _mm256_load_si256((__m256i *)(src + 2 * 16));
1442 233845000 : xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1443 233143000 : }
1444 :
1445 532882000 : static INLINE void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
1446 : const __m256i s0[2],
1447 : __m256i s1[2],
1448 : const __m256i coeffs[1],
1449 : __m256i r[4]) {
1450 532882000 : s1[0] = _mm256_load_si256((__m256i *)src);
1451 532882000 : s1[1] = _mm256_load_si256((__m256i *)(src + 16));
1452 532882000 : xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1453 528235000 : xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1454 525035000 : }
1455 :
1456 : static INLINE void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
1457 : const __m256i s0[2],
1458 : __m256i s1[2],
1459 : const __m256i coeffs[1],
1460 : uint8_t *const dst) {
1461 : __m256i r[4];
1462 :
1463 : xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1464 : xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1465 : }
1466 :
1467 131852000 : static INLINE void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
1468 : const __m256i s0[2],
1469 : __m256i s1[2],
1470 : __m256i r[2]) {
1471 131852000 : s1[0] = _mm256_load_si256((__m256i *)src);
1472 131852000 : s1[1] = _mm256_load_si256((__m256i *)(src + 16));
1473 131852000 : r[0] = _mm256_add_epi16(s0[0], s1[0]);
1474 131852000 : r[1] = _mm256_add_epi16(s0[1], s1[1]);
1475 131852000 : }
1476 :
1477 : static INLINE void xy_y_convolve_2tap_half_pel_32_all_avx2(
1478 : const int16_t *const src, const __m256i s0[2], __m256i s1[2],
1479 : uint8_t *const dst) {
1480 : __m256i r[2];
1481 :
1482 : xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1483 : r[0] = xy_y_round_half_pel_avx2(r[0]);
1484 : r[1] = xy_y_round_half_pel_avx2(r[1]);
1485 : xy_y_pack_store_32_avx2(r[0], r[1], dst);
1486 : }
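
// In the half-pel case the two bilinear taps are equal, so the multiply
// disappears entirely: the vertical pass reduces to adding two intermediate
// rows plus a final rounding shift, which is why the half_pel variants above
// take no coefficients. Hedged scalar model (round/shift stand in for
// whatever xy_y_round_half_pel_avx2 uses):

static INLINE uint8_t xy_y_2tap_half_pel_scalar(const int16_t above,
                                                const int16_t below,
                                                const int32_t round,
                                                const int32_t shift) {
    const int32_t d = (above + below + round) >> shift;
    return (uint8_t)(d < 0 ? 0 : (d > 255 ? 255 : d)); // packus-style clamp
}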
1487 :
1488 0 : static INLINE __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
1489 : __m128i s_32[4],
1490 : __m128i ss_128[2],
1491 : const __m128i coeffs[1]) {
1492 0 : s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src + 3 * 2));
1493 0 : const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1494 0 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src + 4 * 2));
1495 0 : const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1496 0 : ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1497 0 : const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1498 0 : ss_128[0] = ss_128[1];
1499 0 : return r;
1500 : }
1501 :
1502 1338440 : static INLINE __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
1503 : __m128i s_64[4],
1504 : __m256i ss_256[2],
1505 : const __m256i coeffs[2]) {
1506 : __m256i s_256[2];
1507 1338440 : s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1508 1338440 : s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1509 1338440 : s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1510 1338440 : s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1511 1338440 : ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1512 1338440 : const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1513 1338450 : ss_256[0] = ss_256[1];
1514 1338450 : return r;
1515 : }
1516 :
1517 2837830 : static INLINE void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
1518 : const __m256i coeffs[2],
1519 : __m256i r[2]) {
1520 2837830 : r[0] = convolve16_4tap_avx2(ss, coeffs);
1521 2837800 : r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1522 2837770 : }
1523 :
1524 1099400 : static INLINE void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
1525 : __m256i ss_256[4],
1526 : const __m256i coeffs[2],
1527 : __m256i r[2]) {
1528 : __m256i s_256[2];
1529 1099400 : s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1530 1099400 : s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1531 1099400 : ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1532 1099400 : ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1533 1099400 : xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1534 1099400 : ss_256[0] = ss_256[1];
1535 1099400 : ss_256[2] = ss_256[3];
1536 1099400 : }
1537 :
1538 : static INLINE void xy_y_convolve_4tap_8x2_half_pel_avx2(
1539 : const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
1540 : __m256i r[2]) {
1541 : __m256i a_256[2];
1542 : s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1543 : s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1544 : a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1545 : a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1546 : xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1547 : s_256[0] = s_256[2];
1548 : s_256[1] = s_256[3];
1549 : }
1550 :
1551 869257 : static INLINE void xy_y_convolve_4tap_16x2_avx2(
1552 : const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
1553 : __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1554 869257 : s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1555 869257 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1556 869257 : ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1557 869257 : s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1558 869257 : tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1559 869257 : tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1560 869257 : xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1561 869250 : xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1562 869246 : ss_256[0] = ss_256[1];
1563 869246 : ss_256[2] = ss_256[3];
1564 869246 : tt_256[0] = tt_256[1];
1565 869246 : tt_256[2] = tt_256[3];
1566 869246 : }
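
// The ss/tt copies at the end of the *_16x2 helpers implement a rotating
// window: the oldest interleaved row pair is dropped, the remaining pairs
// slide down one slot, and each two-row step loads only the rows that are
// new. A hedged scalar analogue of that structure for a single column:

static INLINE void xy_y_4tap_column_scalar(const int16_t *const src,
                                           const int32_t stride,
                                           const int16_t f[4], const int32_t h,
                                           int32_t *const out) {
    int32_t win[4] = { src[0 * stride], src[1 * stride], src[2 * stride], 0 };
    for (int32_t y = 0; y < h; ++y) {
        win[3] = src[(y + 3) * stride]; // the only new load per output row
        out[y] = win[0] * f[0] + win[1] * f[1] + win[2] * f[2] + win[3] * f[3];
        win[0] = win[1]; // rotate the window down one slot
        win[1] = win[2];
        win[2] = win[3];
    }
}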
1567 :
1568 :  static INLINE void xy_y_convolve_4tap_32x2_avx2(
1569 :      const int16_t *const src, const int32_t stride,
1570 :      __m256i s_256[4], __m256i ss_256[4],
1571 :      __m256i tt_256[4], const __m256i coeffs[2],
1572 :      __m256i r[4]) {
1573 : s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1574 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1575 : ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1576 : s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1577 : tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1578 : tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1579 : xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1580 : xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1581 : ss_256[0] = ss_256[1];
1582 : ss_256[2] = ss_256[3];
1583 : tt_256[0] = tt_256[1];
1584 : tt_256[2] = tt_256[3];
1585 : }
1586 :
1587 :  static INLINE void xy_y_convolve_4tap_16x2_half_pel_avx2(
1588 : const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
1589 : __m256i r[4]) {
1590 : __m256i a_256[2];
1591 :
1592 : s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1593 : s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1594 :
1595 : a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1596 : a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1597 : xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1598 :
1599 : a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1600 : a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1601 : xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1602 :
1603 : s_256[0] = s_256[2];
1604 : s_256[1] = s_256[3];
1605 : s_256[2] = s_256[4];
1606 : }
1607 :
1608 0 : static INLINE __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
1609 : __m128i s_32[6],
1610 : __m128i ss_128[3],
1611 : const __m128i coeffs[3]) {
1612 0 : s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src + 5 * 2));
1613 0 : const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1614 0 : s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src + 6 * 2));
1615 0 : const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1616 0 : ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1617 0 : const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1618 0 : ss_128[0] = ss_128[1];
1619 0 : ss_128[1] = ss_128[2];
1620 0 : return r;
1621 : }
1622 :
1623 5727240 : static INLINE __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
1624 : __m128i s_64[6],
1625 : __m256i ss_256[3],
1626 : const __m256i coeffs[3]) {
1627 : __m256i s_256[2];
1628 5727240 : s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1629 5727240 : s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1630 5727240 : s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1631 5727240 : s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1632 5727240 : ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1633 5727240 : const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1634 5727260 : ss_256[0] = ss_256[1];
1635 5727260 : ss_256[1] = ss_256[2];
1636 5727260 : return r;
1637 : }
1638 :
1639 837617000 : static INLINE void xy_y_convolve_6tap_16_avx2(const __m256i *const ss,
1640 : const __m256i coeffs[3],
1641 : __m256i r[2]) {
1642 837617000 : r[0] = convolve16_6tap_avx2(ss, coeffs);
1643 834291000 : r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1644 833598000 : }
1645 :
1646 129073000 : static INLINE void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
1647 : __m256i ss_256[6],
1648 : const __m256i coeffs[3],
1649 : __m256i r[2]) {
1650 : __m256i s_256[2];
1651 129073000 : s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1652 129073000 : s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1653 129073000 : ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1654 129073000 : ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1655 129073000 : xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1656 129033000 : ss_256[0] = ss_256[1];
1657 129033000 : ss_256[1] = ss_256[2];
1658 129033000 : ss_256[3] = ss_256[4];
1659 129033000 : ss_256[4] = ss_256[5];
1660 129033000 : }
1661 :
1662 : static INLINE void xy_y_convolve_6tap_8x2_half_pel_avx2(
1663 :      const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
1664 : __m256i r[2]) {
1665 : __m256i a_256[2], ss_256[4];
1666 : s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1667 : s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1668 : a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1669 : a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1670 : ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1671 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1672 : ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1673 : ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1674 : xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1675 : s_256[0] = s_256[2];
1676 : s_256[1] = s_256[3];
1677 : s_256[2] = s_256[4];
1678 : s_256[3] = s_256[5];
1679 : }
1680 :
1681 359157000 : static INLINE void xy_y_convolve_6tap_16x2_avx2(
1682 : const int16_t *const src, const int32_t stride, __m256i s_256[6],
1683 : __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
1684 : __m256i r[4]) {
1685 359157000 : s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1686 359157000 : ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1687 359157000 : ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1688 359157000 : s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1689 359157000 : tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1690 359157000 : tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1691 :
1692 359157000 : xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1693 358881000 : xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1694 :
1695 357852000 : ss_256[0] = ss_256[1];
1696 357852000 : ss_256[1] = ss_256[2];
1697 357852000 : ss_256[3] = ss_256[4];
1698 357852000 : ss_256[4] = ss_256[5];
1699 :
1700 357852000 : tt_256[0] = tt_256[1];
1701 357852000 : tt_256[1] = tt_256[2];
1702 357852000 : tt_256[3] = tt_256[4];
1703 357852000 : tt_256[4] = tt_256[5];
1704 357852000 : }
1705 :
1706 : static INLINE void xy_y_convolve_6tap_16x2_half_pel_avx2(
1707 : const int16_t *const src, const int32_t stride, __m256i s_256[6],
1708 : __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1709 : __m256i a_256[2];
1710 :
1711 : s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1712 : a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1713 : a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1714 : ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1715 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1716 : ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1717 : ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1718 : xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1719 :
1720 : a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1721 : s_256[0] = s_256[2];
1722 : s_256[2] = s_256[4];
1723 : s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1724 : a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1725 : s_256[1] = s_256[3];
1726 : s_256[3] = s_256[5];
1727 : ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1728 : ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1729 : ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1730 : ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1731 : xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1732 : }
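
// A half-pel 6-tap kernel is symmetric (f[0] == f[5], f[1] == f[4],
// f[2] == f[3]), so the helper above pre-adds mirrored rows and feeds the
// folded sums through the cheaper 4-tap path. Hedged scalar equivalent:

static INLINE int32_t xy_y_6tap_half_pel_scalar(const int16_t s[6],
                                                const int16_t f[6]) {
    return (s[0] + s[5]) * f[0] + (s[1] + s[4]) * f[1] + s[2] * f[2] +
           s[3] * f[3]; // equals the full 6-tap sum when the taps mirror
}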
1733 :
1734 0 : static INLINE __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
1735 : __m128i s_32[8],
1736 : __m128i ss_128[4],
1737 : const __m128i coeffs[4]) {
1738 0 : s_32[7] = _mm_cvtsi32_si128(*(int32_t *)(src + 7 * 2));
1739 0 : const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1740 0 : s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src + 8 * 2));
1741 0 : const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1742 0 : ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1743 0 : const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1744 0 : ss_128[0] = ss_128[1];
1745 0 : ss_128[1] = ss_128[2];
1746 0 : ss_128[2] = ss_128[3];
1747 0 : return r;
1748 : }
1749 :
1750 708165 : static INLINE __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
1751 : __m128i s_64[8],
1752 : __m256i ss_256[4],
1753 : const __m256i coeffs[4]) {
1754 : __m256i s_256[2];
1755 708165 : s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1756 708165 : s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1757 708165 : s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1758 708165 : s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1759 708165 : ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1760 708165 : const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1761 708166 : ss_256[0] = ss_256[1];
1762 708166 : ss_256[1] = ss_256[2];
1763 708166 : ss_256[2] = ss_256[3];
1764 708166 : return r;
1765 : }
1766 :
1767 402514000 : static INLINE void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
1768 : const __m256i coeffs[4],
1769 : __m256i r[2]) {
1770 402514000 : r[0] = convolve16_8tap_avx2(ss, coeffs);
1771 401863000 : r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1772 400963000 : }
1773 :
1774 55040100 : static INLINE void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
1775 : __m256i ss_256[8],
1776 : const __m256i coeffs[4],
1777 : __m256i r[2]) {
1778 : __m256i s_256[2];
1779 55040100 : s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1780 55040100 : s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1781 55040100 : ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1782 55040100 : ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1783 55040100 : xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1784 55041900 : ss_256[0] = ss_256[1];
1785 55041900 : ss_256[1] = ss_256[2];
1786 55041900 : ss_256[2] = ss_256[3];
1787 55041900 : ss_256[4] = ss_256[5];
1788 55041900 : ss_256[5] = ss_256[6];
1789 55041900 : ss_256[6] = ss_256[7];
1790 55041900 : }
1791 :
1792 : static INLINE void xy_y_convolve_8tap_8x2_half_pel_avx2(
1793 : const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
1794 : __m256i r[2]) {
1795 : __m256i a_256[4], ss_256[4];
1796 :
1797 : s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1798 : s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1799 : a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1800 : a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1801 : a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1802 : a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1803 : ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1804 : ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1805 : ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1806 : ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1807 : xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1808 : s_256[0] = s_256[2];
1809 : s_256[1] = s_256[3];
1810 : s_256[2] = s_256[4];
1811 : s_256[3] = s_256[5];
1812 : s_256[4] = s_256[6];
1813 : s_256[5] = s_256[7];
1814 : }
1815 :
1816 : SIMD_INLINE void xy_y_convolve_8tap_16x2_avx2(
1817 : const int16_t *const src, const int32_t stride, const __m256i coeffs[4],
1818 : __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1819 175608000 : s_256[7] = _mm256_load_si256((__m256i *)(src + 7 * stride));
1820 175608000 : ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1821 175608000 : ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1822 175608000 : s_256[6] = _mm256_load_si256((__m256i *)(src + 8 * stride));
1823 175608000 : tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1824 175608000 : tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1825 175608000 : xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1826 175533000 : xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1827 175627000 : ss_256[0] = ss_256[1];
1828 175627000 : ss_256[1] = ss_256[2];
1829 175627000 : ss_256[2] = ss_256[3];
1830 175627000 : ss_256[4] = ss_256[5];
1831 175627000 : ss_256[5] = ss_256[6];
1832 175627000 : ss_256[6] = ss_256[7];
1833 175627000 : tt_256[0] = tt_256[1];
1834 175627000 : tt_256[1] = tt_256[2];
1835 175627000 : tt_256[2] = tt_256[3];
1836 175627000 : tt_256[4] = tt_256[5];
1837 175627000 : tt_256[5] = tt_256[6];
1838 175627000 : tt_256[6] = tt_256[7];
1839 175627000 : }
1840 :
1841 : static INLINE void xy_y_convolve_8tap_16x2_half_pel_avx2(
1842 : const int16_t *const src, const int32_t stride, const __m256i coeffs[4],
1843 : __m256i s_256[8], __m256i r[4]) {
1844 : __m256i a_256[4], ss_256[4];
1845 : s_256[7] = _mm256_load_si256((__m256i *)(src + 7 * stride));
1846 :
1847 : a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1848 : a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1849 : a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1850 : a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1851 : ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1852 : ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1853 : ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1854 : ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1855 :
1856 : xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1857 :
1858 : a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1859 : a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1860 : a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1861 : s_256[0] = s_256[2];
1862 : s_256[2] = s_256[4];
1863 : s_256[4] = s_256[6];
1864 : s_256[6] = _mm256_load_si256((__m256i *)(src + 8 * stride));
1865 :
1866 : a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1867 : s_256[1] = s_256[3];
1868 : s_256[3] = s_256[5];
1869 : s_256[5] = s_256[7];
1870 : ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1871 : ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1872 : ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1873 : ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1874 :
1875 : xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1876 : }
1877 :
1878 0 : static INLINE void jnt_comp_avg_round_store_2x2_sse2(
1879 : const __m128i res, const __m128i factor, const __m128i offset,
1880 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
1881 : const int32_t dst8_stride) {
1882 0 : const __m128i r = jnt_y_round_sse2(res);
1883 : __m128i d;
1884 :
1885 0 : d = load_u16_2x2_sse4_1(dst, dst_stride);
1886 0 : d = _mm_unpacklo_epi16(d, r);
1887 0 : d = _mm_madd_epi16(d, factor);
1888 0 : d = _mm_add_epi32(d, offset);
1889 0 : d = _mm_srai_epi32(d, 8);
1890 0 : d = _mm_packs_epi32(d, d);
1891 0 : pack_store_2x2_sse2(d, dst8, dst8_stride);
1892 0 : }
1893 :
1894 225212 : static INLINE void jnt_comp_avg_round_store_4x2_sse2(
1895 : const __m128i res, const __m128i factor, const __m128i offset,
1896 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
1897 : const int32_t dst8_stride) {
1898 225212 : const __m128i r = jnt_y_round_sse2(res);
1899 225212 : const __m128i dst_128 = load_u16_4x2_sse2(dst, dst_stride);
1900 : __m128i d[2];
1901 :
1902 225212 : d[0] = _mm_unpacklo_epi16(dst_128, r);
1903 225212 : d[1] = _mm_unpackhi_epi16(dst_128, r);
1904 225212 : d[0] = _mm_madd_epi16(d[0], factor);
1905 225212 : d[1] = _mm_madd_epi16(d[1], factor);
1906 225212 : d[0] = _mm_add_epi32(d[0], offset);
1907 225212 : d[1] = _mm_add_epi32(d[1], offset);
1908 225212 : d[0] = _mm_srai_epi32(d[0], 8);
1909 225212 : d[1] = _mm_srai_epi32(d[1], 8);
1910 225212 : d[0] = _mm_packs_epi32(d[0], d[1]);
1911 225212 : pack_store_4x2_sse2(d[0], dst8, dst8_stride);
1912 225212 : }
1913 :
1914 231372000 : static INLINE __m256i jnt_comp_avg_convolve_16_avx2(const __m256i res,
1915 : const __m256i dst,
1916 : const __m256i factor,
1917 : const __m256i offset) {
1918 : __m256i d[2];
1919 :
1920 231372000 : d[0] = _mm256_unpacklo_epi16(dst, res);
1921 231372000 : d[1] = _mm256_unpackhi_epi16(dst, res);
1922 231372000 : d[0] = _mm256_madd_epi16(d[0], factor);
1923 231372000 : d[1] = _mm256_madd_epi16(d[1], factor);
1924 231372000 : d[0] = _mm256_add_epi32(d[0], offset);
1925 231372000 : d[1] = _mm256_add_epi32(d[1], offset);
1926 231372000 : d[0] = _mm256_srai_epi32(d[0], 8);
1927 231372000 : d[1] = _mm256_srai_epi32(d[1], 8);
1928 462744000 : return _mm256_packs_epi32(d[0], d[1]);
1929 : }
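
// jnt_comp_avg_convolve_16_avx2 computes the distance-weighted compound
// prediction: `factor` packs the weight pair (w0, w1) so a single madd yields
// dst * w0 + res * w1 per 32-bit lane, which is then offset and shifted back
// down. Hedged scalar model (the >> 8 mirrors the srai above; the weights and
// offset come from the jnt setup code):

static INLINE int16_t jnt_comp_avg_scalar(const int16_t dst_val,
                                          const int16_t res, const int16_t w0,
                                          const int16_t w1,
                                          const int32_t offset) {
    return (int16_t)((dst_val * w0 + res * w1 + offset) >> 8);
}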
1930 :
1931 29951000 : static INLINE void jnt_comp_avg_round_store_8x2_avx2(
1932 : const __m256i res, const __m256i factor, const __m256i offset,
1933 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
1934 : const int32_t dst8_stride) {
1935 29951000 : const __m256i r = jnt_y_round_avx2(res);
1936 : __m256i d;
1937 :
1938 29952700 : d = loadu_u16_8x2_avx2(dst, dst_stride);
1939 29949800 : d = jnt_comp_avg_convolve_16_avx2(r, d, factor, offset);
1940 29953800 : pack_store_8x2_avx2(d, dst8, dst8_stride);
1941 29951000 : }
1942 :
1943 : SIMD_INLINE void jnt_comp_avg_round_store_16x2_avx2(
1944 : const __m256i res[2], const __m256i factor, const __m256i offset,
1945 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
1946 : const int32_t dst8_stride) {
1947 : __m256i r[2], d[2];
1948 :
1949 33230600 : r[0] = jnt_y_round_avx2(res[0]);
1950 33231800 : r[1] = jnt_y_round_avx2(res[1]);
1951 33232000 : d[0] = loadu_u16_8x2_avx2(dst, dst_stride);
1952 33230500 : d[1] = loadu_u16_8x2_avx2(dst + 8, dst_stride);
1953 33229700 : d[0] = jnt_comp_avg_convolve_16_avx2(r[0], d[0], factor, offset);
1954 33231500 : d[1] = jnt_comp_avg_convolve_16_avx2(r[1], d[1], factor, offset);
1955 33232700 : pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
1956 33232200 : }
1957 :
1958 : SIMD_INLINE void jnt_comp_avg_round_store_32_avx2(const __m256i res[2],
1959 : const __m256i factor,
1960 : const __m256i offset,
1961 : const ConvBufType *const dst,
1962 : uint8_t *const dst8) {
1963 : __m256i r[2], d[2];
1964 :
1965 67840500 : r[0] = jnt_y_round_avx2(res[0]);
1966 67842600 : r[1] = jnt_y_round_avx2(res[1]);
1967 67843100 : d[0] = loadu_u16_8x2_avx2(dst, 16);
1968 67840400 : d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
1969 67839100 : d[0] = jnt_comp_avg_convolve_16_avx2(r[0], d[0], factor, offset);
1970 67842500 : d[1] = jnt_comp_avg_convolve_16_avx2(r[1], d[1], factor, offset);
1971 67843500 : convolve_store_32_avx2(d[0], d[1], dst8);
1972 67842400 : }
1973 :
1974 351615 : static INLINE __m128i jnt_avg_4x2_sse2(const __m128i res, const __m128i dst) {
1975 351615 : const __m128i d = _mm_add_epi16(res, dst);
1976 351615 : return _mm_srai_epi16(d, 5);
1977 : }
1978 :
1979 0 : static INLINE void jnt_avg_round_store_2x2_sse2(
1980 : const __m128i res, const __m128i offset, const ConvBufType *const dst,
1981 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
1982 0 : const __m128i r = jnt_avg_round_sse2(res, offset);
1983 : __m128i d;
1984 :
1985 0 : d = load_u16_2x2_sse4_1(dst, dst_stride);
1986 0 : d = jnt_avg_4x2_sse2(r, d);
1987 0 : pack_store_2x2_sse2(d, dst8, dst8_stride);
1988 0 : }
1989 :
1990 351613 : static INLINE void jnt_avg_round_store_4x2_sse2(
1991 : const __m128i res, const __m128i offset, const ConvBufType *const dst,
1992 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
1993 351613 : const __m128i r = jnt_avg_round_sse2(res, offset);
1994 : __m128i d;
1995 :
1996 351614 : d = load_u16_4x2_sse2(dst, dst_stride);
1997 351615 : d = jnt_avg_4x2_sse2(r, d);
1998 351615 : pack_store_4x2_sse2(d, dst8, dst8_stride);
1999 351614 : }
2000 :
2001 911865000 : static INLINE __m256i jnt_avg_16_avx2(const __m256i res, const __m256i dst) {
2002 911865000 : const __m256i d = _mm256_add_epi16(res, dst);
2003 911865000 : return _mm256_srai_epi16(d, 5);
2004 : }
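
// jnt_avg_16_avx2 is the equal-weight compound path: the stored first
// prediction and the rounded second prediction are summed and scaled back by
// 5 bits; offset handling lives in the jnt_avg_round_* helpers. Scalar model:

static INLINE int16_t jnt_avg_scalar(const int16_t res,
                                     const int16_t dst_val) {
    return (int16_t)((res + dst_val) >> 5);
}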
2005 :
2006 33379100 : static INLINE void jnt_avg_round_store_8x2_avx2(
2007 : const __m256i res, const __m256i offset, const ConvBufType *const dst,
2008 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
2009 33379100 : const __m256i r = jnt_avg_round_avx2(res, offset);
2010 : __m256i d;
2011 :
2012 33375600 : d = loadu_u16_8x2_avx2(dst, dst_stride);
2013 33371500 : d = jnt_avg_16_avx2(r, d);
2014 33375000 : pack_store_8x2_avx2(d, dst8, dst8_stride);
2015 33377200 : }
2016 :
2017 41128900 : static INLINE void jnt_avg_round_store_16x2_avx2(
2018 : const __m256i res[2], const __m256i offset, const ConvBufType *const dst,
2019 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
2020 : __m256i r[2], d[2];
2021 :
2022 41128900 : r[0] = jnt_avg_round_avx2(res[0], offset);
2023 41127000 : r[1] = jnt_avg_round_avx2(res[1], offset);
2024 41116400 : d[0] = loadu_u16_8x2_avx2(dst, dst_stride);
2025 41113400 : d[1] = loadu_u16_8x2_avx2(dst + 8, dst_stride);
2026 41110900 : d[0] = jnt_avg_16_avx2(r[0], d[0]);
2027 41116000 : d[1] = jnt_avg_16_avx2(r[1], d[1]);
2028 41117700 : pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
2029 41127100 : }
2030 :
2031 80887900 : static INLINE void jnt_avg_round_store_32_avx2(const __m256i res[2],
2032 : const __m256i offset,
2033 : const ConvBufType *const dst,
2034 : uint8_t *const dst8) {
2035 : __m256i r[2], d[2];
2036 :
2037 80887900 : r[0] = jnt_avg_round_avx2(res[0], offset);
2038 80880600 : r[1] = jnt_avg_round_avx2(res[1], offset);
2039 80846300 : d[0] = loadu_u16_8x2_avx2(dst, 16);
2040 80843400 : d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
2041 80840600 : d[0] = jnt_avg_16_avx2(r[0], d[0]);
2042 80848500 : d[1] = jnt_avg_16_avx2(r[1], d[1]);
2043 80830500 : convolve_store_32_avx2(d[0], d[1], dst8);
2044 80835700 : }
2045 :
2046 0 : static INLINE void jnt_no_avg_round_store_2x2_sse2(const __m128i res,
2047 : const __m128i offset,
2048 : ConvBufType *const dst,
2049 : const int32_t dst_stride) {
2050 0 : const __m128i d = jnt_no_avg_round_sse2(res, offset);
2051 : store_u16_2x2_sse2(d, dst, dst_stride);
2052 0 : }
2053 :
2054 733248 : static INLINE void jnt_no_avg_round_store_4x2_sse2(const __m128i res,
2055 : const __m128i offset,
2056 : ConvBufType *const dst,
2057 : const int32_t dst_stride) {
2058 733248 : const __m128i d = jnt_no_avg_round_sse2(res, offset);
2059 : store_u16_4x2_sse2(d, dst, dst_stride);
2060 733245 : }
2061 :
2062 130712000 : static INLINE void jnt_no_avg_round_store_8x2_avx2(const __m256i res,
2063 : const __m256i offset,
2064 : ConvBufType *const dst,
2065 : const int32_t dst_stride) {
2066 130712000 : const __m256i d = jnt_no_avg_round_avx2(res, offset);
2067 130680000 : storeu_u16_8x2_avx2(d, dst, dst_stride);
2068 130700000 : }
2069 :
2070 154087000 : static INLINE void jnt_no_avg_round_store_16x2_avx2(const __m256i res[2],
2071 : const __m256i offset,
2072 : ConvBufType *const dst,
2073 : const int32_t dst_stride) {
2074 : __m256i d[2];
2075 :
2076 154087000 : d[0] = jnt_no_avg_round_avx2(res[0], offset);
2077 154061000 : d[1] = jnt_no_avg_round_avx2(res[1], offset);
2078 153944000 : jnt_no_avg_store_16x2_avx2(d[0], d[1], dst, dst_stride);
2079 153973000 : }
2080 :
2081 297976000 : static INLINE void jnt_no_avg_round_store_32_avx2(const __m256i res[2],
2082 : const __m256i offset,
2083 : ConvBufType *const dst) {
2084 : __m256i d[2];
2085 :
2086 297976000 : d[0] = jnt_no_avg_round_avx2(res[0], offset);
2087 297856000 : d[1] = jnt_no_avg_round_avx2(res[1], offset);
2088 297499000 : jnt_no_avg_store_16x2_avx2(d[0], d[1], dst, 16);
2089 297655000 : }
2090 :
2091 128357000 : static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
2092 : const __m256i *const res_unsigned,
2093 : const __m256i *const wt,
2094 : const int32_t use_jnt_comp_avg) {
2095 : __m256i res;
2096 128357000 : if (use_jnt_comp_avg) {
2097 56132800 : const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
2098 56132800 : const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
2099 :
2100 56132800 : const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
2101 112266000 : const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
2102 :
2103 56132800 : const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
2104 56132800 : const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
2105 :
2106 56132800 : res = _mm256_packs_epi32(res_lo, res_hi);
2107 : }
2108 : else {
2109 144449000 : const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
2110 72224700 : res = _mm256_srai_epi16(wt_res, 1);
2111 : }
2112 128357000 : return res;
2113 : }
2114 :
2115 128329000 : static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
2116 : const __m256i *const offset_const,
2117 : const __m256i *const round_const,
2118 : const int32_t round_shift) {
2119 128329000 : const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
2120 256658000 : const __m256i res_round = _mm256_srai_epi16(
2121 : _mm256_add_epi16(res_signed, *round_const), round_shift);
2122 128329000 : return res_round;
2123 : }
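
// convolve_rounding removes the unsigned offset added during accumulation and
// round-shifts the result back toward pixel range. Scalar model:

static INLINE int16_t convolve_rounding_scalar(const int16_t res_unsigned,
                                               const int16_t offset,
                                               const int16_t round,
                                               const int32_t round_shift) {
    return (int16_t)((res_unsigned - offset + round) >> round_shift);
}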
2124 :
2125 0 : static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
2126 : const __m256i *const res_unsigned,
2127 : const __m256i *const wt0,
2128 : const __m256i *const wt1,
2129 : const int32_t use_jnt_comp_avg) {
2130 : __m256i res;
2131 0 : if (use_jnt_comp_avg) {
2132 0 : const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
2133 0 : const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
2134 0 : const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
2135 0 : res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
2136 : }
2137 : else {
2138 0 : const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
2139 0 : res = _mm256_srai_epi32(wt_res, 1);
2140 : }
2141 0 : return res;
2142 : }
2143 167618000 : static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
2144 167618000 : __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
2145 167618000 : __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
2146 167618000 : return yy_set_m128i(mhi, mlo);
2147 : }
2148 :
2149 :
2150 0 : static INLINE __m256i highbd_convolve_rounding(
2151 : const __m256i *const res_unsigned, const __m256i *const offset_const,
2152 : const __m256i *const round_const, const int32_t round_shift) {
2153 0 : const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
2154 0 : const __m256i res_round = _mm256_srai_epi32(
2155 : _mm256_add_epi32(res_signed, *round_const), round_shift);
2156 :
2157 0 : return res_round;
2158 : }
2159 : #if OBMC_CONVOLVE
2160 70205800 : static INLINE __m256i convolve_4tap(const __m256i *const s,
2161 : const __m256i *const coeffs) {
2162 70205800 : const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
2163 140412000 : const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
2164 70205800 : return _mm256_add_epi32(res_1, res_2);
2165 : }
2166 :
2167 4116930000 : static INLINE __m256i convolve_8tap(const __m256i *const s,
2168 : const __m256i *const coeffs) {
2169 4116930000 : const __m256i res_01 = _mm256_madd_epi16(s[0], coeffs[0]);
2170 4116930000 : const __m256i res_23 = _mm256_madd_epi16(s[1], coeffs[1]);
2171 4116930000 : const __m256i res_45 = _mm256_madd_epi16(s[2], coeffs[2]);
2172 8233860000 : const __m256i res_67 = _mm256_madd_epi16(s[3], coeffs[3]);
2173 4116930000 : const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
2174 4116930000 : const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
2175 4116930000 : return _mm256_add_epi32(res_0123, res_4567);
2176 : }
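
// convolve_8tap relies on the coefficient layout produced by the
// prepare_coeffs helpers: coeffs[k] holds the tap pair (2k, 2k+1) broadcast
// across lanes and s[k] the matching interleaved sample pair, so each madd
// contributes src[2k]*f[2k] + src[2k+1]*f[2k+1] to every 32-bit lane. Hedged
// scalar equivalent of the full sum:

static INLINE int32_t convolve_8tap_scalar(const int16_t src[8],
                                           const int16_t f[8]) {
    int32_t sum = 0;
    for (int32_t k = 0; k < 8; ++k)
        sum += (int32_t)src[k] * f[k];
    return sum;
}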
2177 737897000 : static INLINE __m256i convolve_2tap(const __m256i *const s,
2178 : const __m256i *const coeffs) {
2179 1475790000 : return _mm256_madd_epi16(s[0], coeffs[0]);
2180 : }
2181 525258000 : static INLINE __m256i convolve_x_2tap(const __m256i data,
2182 : const __m256i *const coeffs, const __m256i *const filt) {
2183 525258000 : const __m256i s = _mm256_shuffle_epi8(data, filt[0]);
2184 525258000 : return convolve_2tap_avx2(&s, coeffs);
2185 : }
2186 129142000 : static INLINE __m256i convolve_x_4tap(const __m256i data,
2187 : const __m256i *const coeffs, const __m256i *const filt) {
2188 : __m256i s[2];
2189 :
2190 129142000 : s[0] = _mm256_shuffle_epi8(data, filt[0]);
2191 129142000 : s[1] = _mm256_shuffle_epi8(data, filt[1]);
2192 :
2193 129142000 : return convolve_4tap_avx2(s, coeffs);
2194 : }
2195 :
2196 2943610000 : static INLINE __m256i convolve_x_8tap_avx2(const __m256i data,
2197 : const __m256i *const coeffs, const __m256i *const filt) {
2198 : __m256i s[4];
2199 :
2200 2943610000 : s[0] = _mm256_shuffle_epi8(data, filt[0]);
2201 2943610000 : s[1] = _mm256_shuffle_epi8(data, filt[1]);
2202 2943610000 : s[2] = _mm256_shuffle_epi8(data, filt[2]);
2203 2943610000 : s[3] = _mm256_shuffle_epi8(data, filt[3]);
2204 :
2205 2943610000 : return convolve_8tap_avx2(s, coeffs);
2206 : }
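
// For intuition, a standalone demo (assumes an AVX2 build; compile it
// separately, it is not part of this header) of the byte pairing the
// convolve_x_* shuffles set up: after _mm256_shuffle_epi8 each 16-bit lane
// holds an adjacent sample pair, so one _mm256_maddubs_epi16 applies a tap
// pair to every output pixel at once.
//
//   #include <immintrin.h>
//   #include <stdint.h>
//   #include <stdio.h>
//
//   int main(void) {
//       const uint8_t src[32] = { 10, 20, 30, 40 };
//       const __m256i s = _mm256_loadu_si256((const __m256i *)src);
//       // broadcast the signed tap pair (3, 5) to every 16-bit lane
//       const __m256i taps = _mm256_set1_epi16((5 << 8) | 3);
//       const __m256i prod = _mm256_maddubs_epi16(s, taps);
//       int16_t out[16];
//       _mm256_storeu_si256((__m256i *)out, prod);
//       printf("%d\n", out[0]); // 10*3 + 20*5 = 130
//       return 0;
//   }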
2207 :
2208 : #define CONVOLVE_SR_VERTICAL_FILTER_2TAP \
2209 : __m256i s[6]; \
2210 : \
2211 : for (i = 0; i < h; i += 2) { \
2212 : const int16_t *data = &t_block[i * im_stride]; \
2213 : \
2214 : const __m256i s4 = \
2215 : _mm256_loadu_si256((__m256i *)(data + 0 * im_stride)); \
2216 : const __m256i s5 = \
2217 : _mm256_loadu_si256((__m256i *)(data + 1 * im_stride)); \
2218 : \
2219 : s[0] = _mm256_unpacklo_epi16(s4, s5); \
2220 : s[1] = _mm256_unpackhi_epi16(s4, s5); \
2221 : \
2222 : __m256i res_a = convolve_2tap(s + 0, coeffs_v); \
2223 : __m256i res_b = convolve_2tap(s + 1, coeffs_v); \
2224 : \
2225 : res_a = \
2226 : _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
2227 : res_b = \
2228 : _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
2229 : \
2230 : const __m256i res_a_round = _mm256_sra_epi32( \
2231 : _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
2232 : const __m256i res_b_round = _mm256_sra_epi32( \
2233 : _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
2234 : \
2235 : const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
2236 : const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
2237 : \
2238 : const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
2239 : const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
2240 : \
2241 : __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
2242 : __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
2243 : if (w - j > 4) { \
2244 : _mm_storel_epi64(p_0, res_0); \
2245 : _mm_storel_epi64(p_1, res_1); \
2246 : } \
2247 : else if (w == 4) { \
2248 : xx_storel_32(p_0, res_0); \
2249 : xx_storel_32(p_1, res_1); \
2250 : } \
2251 : else { \
2252 : *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); \
2253 : *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); \
2254 : } \
2255 : }
2256 : #define CONVOLVE_SR_VERTICAL_FILTER_4TAP \
2257 : __m256i s[6]; \
2258 : __m256i src_0 = _mm256_loadu_si256((__m256i *)(t_block + 0 * im_stride)); \
2259 : __m256i src_1 = _mm256_loadu_si256((__m256i *)(t_block + 1 * im_stride)); \
2260 : __m256i src_2 = _mm256_loadu_si256((__m256i *)(t_block + 2 * im_stride)); \
2261 : __m256i src_3 = _mm256_loadu_si256((__m256i *)(t_block + 3 * im_stride)); \
2262 : \
2263 : s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
2264 : s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
2265 : s[3] = _mm256_unpackhi_epi16(src_0, src_1); \
2266 : s[4] = _mm256_unpackhi_epi16(src_2, src_3); \
2267 : \
2268 : for (i = 0; i < h; i += 2) { \
2269 : const int16_t *data = &t_block[i * im_stride]; \
2270 : \
2271 : const __m256i s4 = \
2272 : _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
2273 : const __m256i s5 = \
2274 : _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
2275 : \
2276 : s[2] = _mm256_unpacklo_epi16(s4, s5); \
2277 : s[5] = _mm256_unpackhi_epi16(s4, s5); \
2278 : \
2279 : __m256i res_a = convolve_4tap(s, coeffs_v + 1); \
2280 : __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); \
2281 : \
2282 : res_a = \
2283 : _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
2284 : res_b = \
2285 : _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
2286 : \
2287 : const __m256i res_a_round = _mm256_sra_epi32( \
2288 : _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
2289 : const __m256i res_b_round = _mm256_sra_epi32( \
2290 : _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
2291 : \
2292 : const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
2293 : const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
2294 : \
2295 : const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
2296 : const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
2297 : \
2298 : __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
2299 : __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
2300 : if (w - j > 4) { \
2301 : _mm_storel_epi64(p_0, res_0); \
2302 : _mm_storel_epi64(p_1, res_1); \
2303 : } \
2304 : else if (w == 4) { \
2305 : xx_storel_32(p_0, res_0); \
2306 : xx_storel_32(p_1, res_1); \
2307 : } \
2308 : else { \
2309 : *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); \
2310 : *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); \
2311 : } \
2312 : \
2313 : s[0] = s[1]; \
2314 : s[1] = s[2]; \
2315 : s[3] = s[4]; \
2316 : s[4] = s[5]; \
2317 : }
2318 :
2319 : #define CONVOLVE_SR_VERTICAL_FILTER_8TAP \
2320 : __m256i src_0 = _mm256_loadu_si256((__m256i *)(t_block + 0 * im_stride)); \
2321 : __m256i src_1 = _mm256_loadu_si256((__m256i *)(t_block + 1 * im_stride)); \
2322 : __m256i src_2 = _mm256_loadu_si256((__m256i *)(t_block + 2 * im_stride)); \
2323 : __m256i src_3 = _mm256_loadu_si256((__m256i *)(t_block + 3 * im_stride)); \
2324 : __m256i src_4 = _mm256_loadu_si256((__m256i *)(t_block + 4 * im_stride)); \
2325 : __m256i src_5 = _mm256_loadu_si256((__m256i *)(t_block + 5 * im_stride)); \
2326 : \
2327 : __m256i s[8]; \
2328 : s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
2329 : s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
2330 : s[2] = _mm256_unpacklo_epi16(src_4, src_5); \
2331 : \
2332 : s[4] = _mm256_unpackhi_epi16(src_0, src_1); \
2333 : s[5] = _mm256_unpackhi_epi16(src_2, src_3); \
2334 : s[6] = _mm256_unpackhi_epi16(src_4, src_5); \
2335 : \
2336 : for (i = 0; i < h; i += 2) { \
2337 : const int16_t *data = &t_block[i * im_stride]; \
2338 : \
2339 : const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
2340 : const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
2341 : \
2342 : s[3] = _mm256_unpacklo_epi16(s6, s7); \
2343 : s[7] = _mm256_unpackhi_epi16(s6, s7); \
2344 : \
2345 : __m256i res_a = convolve_8tap(s, coeffs_v); \
2346 : __m256i res_b = convolve_8tap(s + 4, coeffs_v); \
2347 : \
2348 : res_a = \
2349 : _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
2350 : res_b = \
2351 : _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
2352 : \
2353 : const __m256i res_a_round = _mm256_sra_epi32( \
2354 : _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
2355 : const __m256i res_b_round = _mm256_sra_epi32( \
2356 : _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
2357 : \
2358 : const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
2359 : const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
2360 : \
2361 : const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
2362 : const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
2363 : \
2364 : __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
2365 : __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
2366 : if (w - j > 4) { \
2367 : _mm_storel_epi64(p_0, res_0); \
2368 : _mm_storel_epi64(p_1, res_1); \
2369 : } else if (w == 4) { \
2370 : xx_storel_32(p_0, res_0); \
2371 : xx_storel_32(p_1, res_1); \
2372 : } else { \
2373 : *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); \
2374 : *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); \
2375 : } \
2376 : \
2377 : s[0] = s[1]; \
2378 : s[1] = s[2]; \
2379 : s[2] = s[3]; \
2380 : \
2381 : s[4] = s[5]; \
2382 : s[5] = s[6]; \
2383 : s[6] = s[7]; \
2384 : }
2385 : #define CONVOLVE_SR_HORIZONTAL_FILTER_2TAP \
2386 : for (i = 0; i < (im_h - 2); i += 2) { \
2387 : __m256i data = _mm256_castsi128_si256( \
2388 : _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
2389 : \
2390 : data = _mm256_inserti128_si256( \
2391 : data, \
2392 : _mm_loadu_si128( \
2393 : (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
2394 : 1); \
2395 : __m256i res = convolve_x_2tap(data, coeffs_h, filt); \
2396 : \
2397 : res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \
2398 : round_shift_h); \
2399 : _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
2400 : } \
2401 : \
2402 : __m256i data_1 = _mm256_castsi128_si256( \
2403 : _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
2404 : \
2405 : __m256i res = convolve_x_2tap(data_1, coeffs_h, filt); \
2406 : res = \
2407 : _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
2408 : _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
2409 : #define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \
2410 : for (i = 0; i < (im_h - 2); i += 2) { \
2411 : __m256i data = _mm256_castsi128_si256( \
2412 : _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
2413 : \
2414 : data = _mm256_inserti128_si256( \
2415 : data, \
2416 : _mm_loadu_si128( \
2417 : (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
2418 : 1); \
2419 : __m256i res = convolve_x_4tap(data, coeffs_h + 1, filt); \
2420 : \
2421 : res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \
2422 : round_shift_h); \
2423 : _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
2424 : } \
2425 : \
2426 : __m256i data_1 = _mm256_castsi128_si256( \
2427 : _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
2428 : \
2429 : __m256i res = convolve_x_4tap(data_1, coeffs_h + 1, filt); \
2430 : res = \
2431 : _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
2432 : _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
2433 : #define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \
2434 : for (i = 0; i < (im_h - 2); i += 2) { \
2435 : __m256i data = _mm256_castsi128_si256( \
2436 : _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
2437 : data = _mm256_inserti128_si256( \
2438 : data, \
2439 : _mm_loadu_si128( \
2440 : (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
2441 : 1); \
2442 : \
2443 : __m256i res = convolve_x_8tap_avx2(data, coeffs_h, filt); \
2444 : res = \
2445 : _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
2446 : _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
2447 : } \
2448 : \
2449 : __m256i data_1 = _mm256_castsi128_si256( \
2450 : _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
2451 : \
2452 : __m256i res = convolve_x_8tap_avx2(data_1, coeffs_h, filt); \
2453 : \
2454 : res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
2455 : \
2456 : _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
2457 :
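// Hedged usage sketch: callers of the CONVOLVE_SR_* macros (in the
// corresponding convolve .c files) are expected to declare the ambient
// variables the macros capture (src_ptr, im_block, t_block, im_stride,
// coeffs_h, coeffs_v, filt, the round/shift constants, i, j, w, h, im_h,
// dst, dst_stride) and then expand one horizontal pass plus the matching
// vertical pass per column tile, roughly:
//
//   for (j = 0; j < w; j += 8) {
//       CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
//       CONVOLVE_SR_VERTICAL_FILTER_8TAP
//   }
//
// The loop shape above is illustrative only.
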
2458 : #endif
2459 : #endif
|