Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h>
13 : #include "aom_dsp_rtcd.h"
14 : #include "convolve.h"
15 : #include "convolve_avx2.h"
16 : #include "EbDefinitions.h"
17 : #include "EbMemory_SSE4_1.h"
18 :
19 0 : static INLINE __m128i jnt_2d_comp_avg_round_4_sse2(const __m128i src) {
20 0 : const __m128i round = _mm_set1_epi32(1 << (COMPOUND_ROUND1_BITS - 1));
21 0 : const __m128i dst = _mm_add_epi32(src, round);
22 0 : const __m128i d = _mm_srai_epi32(dst, COMPOUND_ROUND1_BITS);
23 0 : return _mm_packs_epi32(d, d);
24 : }
25 :
26 0 : static INLINE __m128i jnt_2d_comp_avg_round_half_pel_sse2(const __m128i src) {
27 0 : const __m128i round = _mm_set1_epi16(1);
28 0 : const __m128i dst = _mm_add_epi16(src, round);
29 0 : return _mm_srai_epi16(dst, 1);
30 : }
31 :
32 0 : static INLINE __m128i jnt_2d_comp_avg_round_4x2_sse2(const __m128i src[2]) {
33 0 : const __m128i round = _mm_set1_epi32(1 << (COMPOUND_ROUND1_BITS - 1));
34 0 : const __m128i dst0 = _mm_add_epi32(src[0], round);
35 0 : const __m128i dst1 = _mm_add_epi32(src[1], round);
36 0 : const __m128i d0 = _mm_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
37 0 : const __m128i d1 = _mm_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
38 0 : return _mm_packs_epi32(d0, d1);
39 : }
40 :
41 1276390 : static INLINE __m256i jnt_2d_comp_avg_round_8_avx2(const __m256i src) {
42 1276390 : const __m256i round = _mm256_set1_epi32(1 << (COMPOUND_ROUND1_BITS - 1));
43 1276390 : const __m256i dst = _mm256_add_epi32(src, round);
44 1276390 : const __m256i d = _mm256_srai_epi32(dst, COMPOUND_ROUND1_BITS);
45 1276390 : return _mm256_packs_epi32(d, d);
46 : }
47 :
48 464184000 : static INLINE __m256i jnt_2d_comp_avg_round_8x2_avx2(const __m256i src[2]) {
49 464184000 : const __m256i round = _mm256_set1_epi32(1 << (COMPOUND_ROUND1_BITS - 1));
50 464184000 : const __m256i dst0 = _mm256_add_epi32(src[0], round);
51 928367000 : const __m256i dst1 = _mm256_add_epi32(src[1], round);
52 464184000 : const __m256i d0 = _mm256_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
53 464184000 : const __m256i d1 = _mm256_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
54 464184000 : return _mm256_packs_epi32(d0, d1);
55 : }
56 :
57 65680700 : static INLINE __m256i jnt_2d_comp_avg_round_half_pel_avx2(const __m256i src) {
58 65680700 : const __m256i round = _mm256_set1_epi16(1);
59 65680700 : const __m256i dst = _mm256_add_epi16(src, round);
60 65680700 : return _mm256_srai_epi16(dst, 1);
61 : }
62 :
63 0 : static INLINE void jnt_2d_comp_avg_round_store_2x2_sse2(
64 : const __m128i res, const __m128i factor, const __m128i offset,
65 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
66 : const int32_t dst8_stride) {
67 0 : const __m128i r = jnt_2d_comp_avg_round_4_sse2(res);
68 : __m128i d;
69 :
70 0 : d = load_u16_2x2_sse4_1(dst, dst_stride);
71 0 : d = _mm_unpacklo_epi16(d, r);
72 0 : d = _mm_madd_epi16(d, factor);
73 0 : d = _mm_add_epi32(d, offset);
74 0 : d = _mm_srai_epi32(d, 8);
75 0 : d = _mm_packs_epi32(d, d);
76 0 : pack_store_2x2_sse2(d, dst8, dst8_stride);
77 0 : }
78 :
79 0 : static INLINE void jnt_2d_comp_avg_round_store_half_pel_2x2_sse2(
80 : const __m128i res, const __m128i factor, const __m128i offset,
81 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
82 : const int32_t dst8_stride) {
83 0 : const __m128i r = jnt_2d_comp_avg_round_half_pel_sse2(res);
84 : __m128i d;
85 :
86 0 : d = load_u16_2x2_sse4_1(dst, dst_stride);
87 0 : d = _mm_unpacklo_epi16(d, r);
88 0 : d = _mm_madd_epi16(d, factor);
89 0 : d = _mm_add_epi32(d, offset);
90 0 : d = _mm_srai_epi32(d, 8);
91 0 : d = _mm_packs_epi32(d, d);
92 0 : pack_store_2x2_sse2(d, dst8, dst8_stride);
93 0 : }
94 :
95 0 : static INLINE void jnt_2d_comp_avg_round_store_4x2_sse2(
96 : const __m128i res[2], const __m128i factor, const __m128i offset,
97 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
98 : const int32_t dst8_stride) {
99 0 : const __m128i r = jnt_2d_comp_avg_round_4x2_sse2(res);
100 0 : const __m128i d = load_u16_4x2_sse2(dst, dst_stride);
101 : __m128i dd[2];
102 :
103 0 : dd[0] = _mm_unpacklo_epi16(d, r);
104 0 : dd[1] = _mm_unpackhi_epi16(d, r);
105 0 : dd[0] = _mm_madd_epi16(dd[0], factor);
106 0 : dd[1] = _mm_madd_epi16(dd[1], factor);
107 0 : dd[0] = _mm_add_epi32(dd[0], offset);
108 0 : dd[1] = _mm_add_epi32(dd[1], offset);
109 0 : dd[0] = _mm_srai_epi32(dd[0], 8);
110 0 : dd[1] = _mm_srai_epi32(dd[1], 8);
111 0 : dd[0] = _mm_packs_epi32(dd[0], dd[1]);
112 0 : pack_store_4x2_sse2(dd[0], dst8, dst8_stride);
113 0 : }
114 :
115 0 : static INLINE void jnt_2d_comp_avg_round_store_half_pel_4x2_sse2(
116 : const __m128i res, const __m128i factor, const __m128i offset,
117 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
118 : const int32_t dst8_stride) {
119 0 : const __m128i r = jnt_2d_comp_avg_round_half_pel_sse2(res);
120 0 : const __m128i d = load_u16_4x2_sse2(dst, dst_stride);
121 : __m128i dd[2];
122 :
123 0 : dd[0] = _mm_unpacklo_epi16(d, r);
124 0 : dd[1] = _mm_unpackhi_epi16(d, r);
125 0 : dd[0] = _mm_madd_epi16(dd[0], factor);
126 0 : dd[1] = _mm_madd_epi16(dd[1], factor);
127 0 : dd[0] = _mm_add_epi32(dd[0], offset);
128 0 : dd[1] = _mm_add_epi32(dd[1], offset);
129 0 : dd[0] = _mm_srai_epi32(dd[0], 8);
130 0 : dd[1] = _mm_srai_epi32(dd[1], 8);
131 0 : dd[0] = _mm_packs_epi32(dd[0], dd[1]);
132 0 : pack_store_4x2_sse2(dd[0], dst8, dst8_stride);
133 0 : }
134 :
135 1276390 : static INLINE __m256i jnt_2d_comp_avg_round_pack_8_avx2(const __m256i res,
136 : const __m256i factor,
137 : const __m256i offset,
138 : const __m256i dst) {
139 1276390 : const __m256i r = jnt_2d_comp_avg_round_8_avx2(res);
140 : __m256i d[2];
141 :
142 1276380 : d[0] = _mm256_unpacklo_epi16(dst, r);
143 1276380 : d[0] = _mm256_madd_epi16(d[0], factor);
144 1276380 : d[0] = _mm256_add_epi32(d[0], offset);
145 1276380 : d[0] = _mm256_srai_epi32(d[0], 8);
146 2552770 : return _mm256_packs_epi32(d[0], d[0]);
147 : }
148 :
149 464298000 : static INLINE __m256i jnt_2d_comp_avg_round_pack_16_avx2(const __m256i res[2],
150 : const __m256i factor,
151 : const __m256i offset,
152 : const __m256i dst) {
153 464298000 : const __m256i r = jnt_2d_comp_avg_round_8x2_avx2(res);
154 : __m256i d[2];
155 :
156 464075000 : d[0] = _mm256_unpacklo_epi16(dst, r);
157 464075000 : d[1] = _mm256_unpackhi_epi16(dst, r);
158 464075000 : d[0] = _mm256_madd_epi16(d[0], factor);
159 464075000 : d[1] = _mm256_madd_epi16(d[1], factor);
160 464075000 : d[0] = _mm256_add_epi32(d[0], offset);
161 464075000 : d[1] = _mm256_add_epi32(d[1], offset);
162 464075000 : d[0] = _mm256_srai_epi32(d[0], 8);
163 464075000 : d[1] = _mm256_srai_epi32(d[1], 8);
164 928150000 : return _mm256_packs_epi32(d[0], d[1]);
165 : }
166 :
167 65691100 : static INLINE __m256i jnt_2d_comp_avg_round_pack_half_pel_avx2(
168 : const __m256i res, const __m256i factor, const __m256i offset,
169 : const __m256i dst) {
170 65691100 : const __m256i r = jnt_2d_comp_avg_round_half_pel_avx2(res);
171 : __m256i d[2];
172 :
173 65704700 : d[0] = _mm256_unpacklo_epi16(dst, r);
174 65704700 : d[1] = _mm256_unpackhi_epi16(dst, r);
175 65704700 : d[0] = _mm256_madd_epi16(d[0], factor);
176 65704700 : d[1] = _mm256_madd_epi16(d[1], factor);
177 65704700 : d[0] = _mm256_add_epi32(d[0], offset);
178 65704700 : d[1] = _mm256_add_epi32(d[1], offset);
179 65704700 : d[0] = _mm256_srai_epi32(d[0], 8);
180 65704700 : d[1] = _mm256_srai_epi32(d[1], 8);
181 131409000 : return _mm256_packs_epi32(d[0], d[1]);
182 : }
183 :
184 1276390 : static INLINE void jnt_2d_comp_avg_round_store_4x2_avx2(
185 : const __m256i res, const __m256i factor, const __m256i offset,
186 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
187 : const int32_t dst8_stride) {
188 : __m128i d_128[2];
189 : __m256i d;
190 :
191 1276390 : d_128[0] = _mm_loadl_epi64((__m128i *)(dst));
192 1276390 : d_128[1] = _mm_loadl_epi64((__m128i *)(dst + dst_stride));
193 1276390 : d = _mm256_setr_m128i(d_128[0], d_128[1]);
194 1276390 : d = loadu_u16_8x2_avx2(dst, dst_stride);
195 1276390 : d = jnt_2d_comp_avg_round_pack_8_avx2(res, factor, offset, d);
196 1276390 : d = _mm256_packus_epi16(d, d);
197 1276390 : const __m128i d0 = _mm256_castsi256_si128(d);
198 1276390 : const __m128i d1 = _mm256_extracti128_si256(d, 1);
199 1276390 : _mm_storel_epi64((__m128i *)dst8, d0);
200 1276390 : _mm_storel_epi64((__m128i *)(dst8 + dst8_stride), d1);
201 1276390 : }
202 :
203 65555300 : static INLINE void jnt_2d_comp_avg_round_store_8x2_avx2(
204 : const __m256i res[2], const __m256i factor, const __m256i offset,
205 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
206 : const int32_t dst8_stride) {
207 65555300 : const __m256i d = loadu_u16_8x2_avx2(dst, dst_stride);
208 : const __m256i dd =
209 65580700 : jnt_2d_comp_avg_round_pack_16_avx2(res, factor, offset, d);
210 65660300 : pack_store_8x2_avx2(dd, dst8, dst8_stride);
211 65626700 : }
212 :
213 8005510 : static INLINE void jnt_2d_comp_avg_round_store_half_pel_8x2_avx2(
214 : const __m256i res, const __m256i factor, const __m256i offset,
215 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
216 : const int32_t dst8_stride) {
217 8005510 : const __m256i d = loadu_u16_8x2_avx2(dst, dst_stride);
218 : const __m256i dd =
219 8005540 : jnt_2d_comp_avg_round_pack_half_pel_avx2(res, factor, offset, d);
220 8006830 : pack_store_8x2_avx2(dd, dst8, dst8_stride);
221 8006380 : }
222 :
223 59794100 : static INLINE void jnt_2d_comp_avg_round_store_16x2_avx2(
224 : const __m256i res[4], const __m256i factor, const __m256i offset,
225 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
226 : const int32_t dst8_stride) {
227 : __m256i d[2];
228 :
229 59794100 : d[0] = _mm256_loadu_si256((__m256i *)dst);
230 59794100 : d[1] = _mm256_loadu_si256((__m256i *)(dst + dst_stride));
231 59794100 : d[0] = jnt_2d_comp_avg_round_pack_16_avx2(res + 0, factor, offset, d[0]);
232 59884900 : d[1] = jnt_2d_comp_avg_round_pack_16_avx2(res + 2, factor, offset, d[1]);
233 59869000 : xy_y_pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
234 59862700 : }
235 :
236 8432120 : static INLINE void jnt_2d_comp_avg_round_store_half_pel_16x2_avx2(
237 : const __m256i res[2], const __m256i factor, const __m256i offset,
238 : const ConvBufType *const dst, const int32_t dst_stride, uint8_t *const dst8,
239 : const int32_t dst8_stride) {
240 : __m256i d[2];
241 :
242 8432120 : d[0] = _mm256_loadu_si256((__m256i *)dst);
243 8432120 : d[1] = _mm256_loadu_si256((__m256i *)(dst + dst_stride));
244 8432980 : d[0] =
245 8432120 : jnt_2d_comp_avg_round_pack_half_pel_avx2(res[0], factor, offset, d[0]);
246 8433000 : d[1] =
247 8432980 : jnt_2d_comp_avg_round_pack_half_pel_avx2(res[1], factor, offset, d[1]);
248 8433000 : xy_y_pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
249 8432240 : }
250 :
251 : SIMD_INLINE void jnt_2d_comp_avg_round_store_32_avx2(
252 : const __m256i r0[2], const __m256i r1[2], const __m256i factor,
253 : const __m256i offset, const ConvBufType *const dst, uint8_t *const dst8) {
254 : __m256i d[2];
255 :
256 28275000 : d[0] = loadu_u16_8x2_avx2(dst, 16);
257 141809000 : d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
258 141824000 : d[0] = jnt_2d_comp_avg_round_pack_16_avx2(r0, factor, offset, d[0]);
259 141868000 : d[1] = jnt_2d_comp_avg_round_pack_16_avx2(r1, factor, offset, d[1]);
260 141861000 : convolve_store_32_avx2(d[0], d[1], dst8);
261 141854000 : }
262 :
263 : SIMD_INLINE void jnt_2d_comp_avg_round_store_half_pel_32_avx2(
264 : const __m256i res[2], const __m256i factor, const __m256i offset,
265 : const ConvBufType *const dst, uint8_t *const dst8) {
266 : __m256i d[2];
267 :
268 7802800 : d[0] = loadu_u16_8x2_avx2(dst, 16);
269 20456100 : d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
270 20457200 : d[0] =
271 20455800 : jnt_2d_comp_avg_round_pack_half_pel_avx2(res[0], factor, offset, d[0]);
272 20457200 : d[1] =
273 20457200 : jnt_2d_comp_avg_round_pack_half_pel_avx2(res[1], factor, offset, d[1]);
274 20457200 : convolve_store_32_avx2(d[0], d[1], dst8);
275 20456700 : }
276 :
277 0 : static INLINE __m128i jnt_2d_round_4_sse2(const __m128i src,
278 : const __m128i offset) {
279 0 : const __m128i dst = _mm_add_epi32(src, offset);
280 0 : const __m128i d = _mm_srai_epi32(dst, COMPOUND_ROUND1_BITS);
281 0 : return _mm_packs_epi32(d, d);
282 : }
283 :
284 0 : static INLINE __m128i jnt_2d_round_half_pel_sse2(const __m128i src,
285 : const __m128i offset) {
286 0 : const __m128i dst = _mm_add_epi16(src, offset);
287 0 : return _mm_srai_epi16(dst, 1);
288 : }
289 :
290 0 : static INLINE __m128i jnt_2d_round_4x2_sse2(const __m128i src[2],
291 : const __m128i offset) {
292 0 : const __m128i dst0 = _mm_add_epi32(src[0], offset);
293 0 : const __m128i dst1 = _mm_add_epi32(src[1], offset);
294 0 : const __m128i d0 = _mm_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
295 0 : const __m128i d1 = _mm_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
296 0 : return _mm_packs_epi32(d0, d1);
297 : }
298 :
299 6497180 : static INLINE __m256i jnt_2d_round_4x2_avx2(const __m256i src,
300 : const __m256i offset) {
301 6497180 : const __m256i dst = _mm256_add_epi32(src, offset);
302 6497180 : const __m256i d = _mm256_srai_epi32(dst, COMPOUND_ROUND1_BITS);
303 6497180 : return _mm256_packs_epi32(d, d);
304 : }
305 :
306 2420930000 : static INLINE __m256i jnt_2d_round_16_avx2(const __m256i src[2],
307 : const __m256i offset) {
308 2420930000 : const __m256i dst0 = _mm256_add_epi32(src[0], offset);
309 4841870000 : const __m256i dst1 = _mm256_add_epi32(src[1], offset);
310 2420930000 : const __m256i d0 = _mm256_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
311 2420930000 : const __m256i d1 = _mm256_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
312 2420930000 : return _mm256_packs_epi32(d0, d1);
313 : }
314 :
315 373219000 : static INLINE __m256i jnt_2d_round_half_pel_avx2(const __m256i src,
316 : const __m256i offset) {
317 373219000 : const __m256i dst0 = _mm256_add_epi16(src, offset);
318 373219000 : return _mm256_srai_epi16(dst0, 1);
319 : }
320 :
321 0 : static INLINE void jnt_2d_avg_round_store_2x2_sse2(
322 : const __m128i res, const __m128i offset, const ConvBufType *const dst,
323 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
324 0 : const __m128i r = jnt_2d_round_4_sse2(res, offset);
325 : __m128i d;
326 :
327 0 : d = load_u16_2x2_sse4_1(dst, dst_stride);
328 0 : d = jnt_avg_4x2_sse2(r, d);
329 0 : pack_store_2x2_sse2(d, dst8, dst8_stride);
330 0 : }
331 :
332 0 : static INLINE void jnt_2d_avg_round_store_half_pel_2x2_sse2(
333 : const __m128i res, const __m128i offset, const ConvBufType *const dst,
334 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
335 0 : const __m128i r = jnt_2d_round_half_pel_sse2(res, offset);
336 : __m128i d;
337 :
338 0 : d = load_u16_2x2_sse4_1(dst, dst_stride);
339 0 : d = jnt_avg_4x2_sse2(r, d);
340 0 : pack_store_2x2_sse2(d, dst8, dst8_stride);
341 0 : }
342 :
343 0 : static INLINE void jnt_2d_avg_round_store_4x2_sse2(
344 : const __m128i res[2], const __m128i offset, const ConvBufType *const dst,
345 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
346 0 : const __m128i r = jnt_2d_round_4x2_sse2(res, offset);
347 : __m128i d;
348 :
349 0 : d = load_u16_4x2_sse2(dst, dst_stride);
350 0 : d = jnt_avg_4x2_sse2(r, d);
351 0 : pack_store_4x2_sse2(d, dst8, dst8_stride);
352 0 : }
353 :
354 0 : static INLINE void jnt_2d_avg_round_store_half_pel_4x2_sse2(
355 : const __m128i res, const __m128i offset, const ConvBufType *const dst,
356 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
357 0 : const __m128i r = jnt_2d_round_half_pel_sse2(res, offset);
358 : __m128i d;
359 :
360 0 : d = load_u16_4x2_sse2(dst, dst_stride);
361 0 : d = jnt_avg_4x2_sse2(r, d);
362 0 : pack_store_4x2_sse2(d, dst8, dst8_stride);
363 0 : }
364 :
365 2199240 : static INLINE void jnt_2d_avg_round_store_4x2_avx2(
366 : const __m256i res, const __m256i offset, const ConvBufType *const dst,
367 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
368 2199240 : const __m256i r = jnt_2d_round_4x2_avx2(res, offset);
369 : __m128i d_128[2];
370 : __m256i d;
371 :
372 2199250 : d_128[0] = _mm_loadl_epi64((__m128i *)(dst));
373 2199250 : d_128[1] = _mm_loadl_epi64((__m128i *)(dst + dst_stride));
374 2199250 : d = _mm256_setr_m128i(d_128[0], d_128[1]);
375 2199250 : d = jnt_avg_16_avx2(r, d);
376 2199260 : d = _mm256_packus_epi16(d, d);
377 2199260 : const __m128i d0 = _mm256_castsi256_si128(d);
378 2199260 : const __m128i d1 = _mm256_extracti128_si256(d, 1);
379 2199260 : _mm_storel_epi64((__m128i *)dst8, d0);
380 2199260 : _mm_storel_epi64((__m128i *)(dst8 + dst8_stride), d1);
381 2199260 : }
382 :
383 75330400 : static INLINE void jnt_2d_avg_round_store_8x2_avx2(
384 : const __m256i res[2], const __m256i offset, const ConvBufType *const dst,
385 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
386 75330400 : const __m256i r = jnt_2d_round_16_avx2(res, offset);
387 : __m256i d;
388 :
389 75355400 : d = loadu_u16_8x2_avx2(dst, dst_stride);
390 75357100 : d = jnt_avg_16_avx2(r, d);
391 75330800 : pack_store_8x2_avx2(d, dst8, dst8_stride);
392 75330600 : }
393 :
394 8006060 : static INLINE void jnt_2d_avg_round_store_half_pel_8x2_avx2(
395 : const __m256i res, const __m256i offset, const ConvBufType *const dst,
396 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
397 8006060 : const __m256i r = jnt_2d_round_half_pel_avx2(res, offset);
398 : __m256i d;
399 :
400 8006040 : d = loadu_u16_8x2_avx2(dst, dst_stride);
401 8005850 : d = jnt_avg_16_avx2(r, d);
402 8005830 : pack_store_8x2_avx2(d, dst8, dst8_stride);
403 8005940 : }
404 :
405 72250300 : static INLINE void jnt_2d_avg_round_store_16x2_avx2(
406 : const __m256i res[4], const __m256i offset, const ConvBufType *const dst,
407 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
408 : __m256i r[2], d[2];
409 :
410 72250300 : r[0] = jnt_2d_round_16_avx2(res + 0, offset);
411 72302900 : r[1] = jnt_2d_round_16_avx2(res + 2, offset);
412 72276600 : d[0] = _mm256_loadu_si256((__m256i *)dst);
413 72276600 : d[1] = _mm256_loadu_si256((__m256i *)(dst + dst_stride));
414 72276600 : d[0] = jnt_avg_16_avx2(r[0], d[0]);
415 72272600 : d[1] = jnt_avg_16_avx2(r[1], d[1]);
416 72260800 : xy_y_pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
417 72312200 : }
418 :
419 8432720 : static INLINE void jnt_2d_avg_round_store_half_pel_16x2_avx2(
420 : const __m256i res[2], const __m256i offset, const ConvBufType *const dst,
421 : const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
422 : __m256i r[2], d[2];
423 :
424 8432720 : r[0] = jnt_2d_round_half_pel_avx2(res[0], offset);
425 8432770 : r[1] = jnt_2d_round_half_pel_avx2(res[1], offset);
426 8432610 : d[0] = _mm256_loadu_si256((__m256i *)dst);
427 8432610 : d[1] = _mm256_loadu_si256((__m256i *)(dst + dst_stride));
428 8432610 : d[0] = jnt_avg_16_avx2(r[0], d[0]);
429 8432620 : d[1] = jnt_avg_16_avx2(r[1], d[1]);
430 8432440 : xy_y_pack_store_16x2_avx2(d[0], d[1], dst8, dst8_stride);
431 8432760 : }
432 :
433 : SIMD_INLINE void jnt_2d_avg_round_store_32_avx2(const __m256i r0[2],
434 : const __m256i r1[2],
435 : const __m256i offset,
436 : const ConvBufType *const dst,
437 : uint8_t *const dst8) {
438 : __m256i r[2], d[2];
439 :
440 45772000 : r[0] = jnt_2d_round_16_avx2(r0, offset);
441 176810000 : r[1] = jnt_2d_round_16_avx2(r1, offset);
442 176835000 : d[0] = loadu_u16_8x2_avx2(dst, 16);
443 176856000 : d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
444 176848000 : d[0] = jnt_avg_16_avx2(r[0], d[0]);
445 176837000 : d[1] = jnt_avg_16_avx2(r[1], d[1]);
446 176823000 : convolve_store_32_avx2(d[0], d[1], dst8);
447 176818000 : }
448 :
449 20445700 : static INLINE void jnt_2d_avg_round_store_half_pel_32_avx2(
450 : const __m256i res[2], const __m256i offset, const ConvBufType *const dst,
451 : uint8_t *const dst8) {
452 : __m256i r[2], d[2];
453 :
454 20445700 : r[0] = jnt_2d_round_half_pel_avx2(res[0], offset);
455 20445700 : r[1] = jnt_2d_round_half_pel_avx2(res[1], offset);
456 20443600 : d[0] = loadu_u16_8x2_avx2(dst, 16);
457 20445100 : d[1] = loadu_u16_8x2_avx2(dst + 8, 16);
458 20445100 : d[0] = jnt_avg_16_avx2(r[0], d[0]);
459 20444700 : d[1] = jnt_avg_16_avx2(r[1], d[1]);
460 20444100 : convolve_store_32_avx2(d[0], d[1], dst8);
461 20444200 : }
462 :
463 0 : static INLINE void jnt_2d_no_avg_round_store_2x2_sse2(
464 : const __m128i res, const __m128i offset, ConvBufType *const dst,
465 : const int32_t dst_stride) {
466 0 : const __m128i d = jnt_2d_round_4_sse2(res, offset);
467 : store_u16_2x2_sse2(d, dst, dst_stride);
468 0 : }
469 :
470 0 : static INLINE void jnt_2d_no_avg_round_store_half_pel_2x2_sse2(
471 : const __m128i res, const __m128i offset, ConvBufType *const dst,
472 : const int32_t dst_stride) {
473 0 : const __m128i d = jnt_2d_round_half_pel_sse2(res, offset);
474 : store_u16_2x2_sse2(d, dst, dst_stride);
475 0 : }
476 :
477 0 : static INLINE void jnt_2d_no_avg_round_store_4x2_sse2(
478 : const __m128i res[2], const __m128i offset, ConvBufType *const dst,
479 : const int32_t dst_stride) {
480 0 : const __m128i d = jnt_2d_round_4x2_sse2(res, offset);
481 : store_u16_4x2_sse2(d, dst, dst_stride);
482 0 : }
483 :
484 4298480 : static INLINE void jnt_2d_no_avg_round_store_4x2_avx2(
485 : const __m256i res, const __m256i offset, ConvBufType *const dst,
486 : const int32_t dst_stride) {
487 4298480 : const __m256i d = jnt_2d_round_4x2_avx2(res, offset);
488 4298450 : const __m128i d0 = _mm256_castsi256_si128(d);
489 4298450 : const __m128i d1 = _mm256_extracti128_si256(d, 1);
490 4298450 : _mm_storel_epi64((__m128i *)dst, d0);
491 4298450 : _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
492 4298450 : }
493 :
494 0 : static INLINE void jnt_2d_no_avg_round_store_half_pel_4x2_sse2(
495 : const __m128i res, const __m128i offset, ConvBufType *const dst,
496 : const int32_t dst_stride) {
497 0 : const __m128i d = jnt_2d_round_half_pel_sse2(res, offset);
498 : store_u16_4x2_sse2(d, dst, dst_stride);
499 0 : }
500 :
501 278780000 : static INLINE void jnt_2d_no_avg_round_store_8x2_avx2(
502 : const __m256i res[2], const __m256i offset, ConvBufType *const dst,
503 : const int32_t dst_stride) {
504 278780000 : const __m256i d = jnt_2d_round_16_avx2(res, offset);
505 279259000 : storeu_u16_8x2_avx2(d, dst, dst_stride);
506 279488000 : }
507 :
508 40594300 : static INLINE void jnt_2d_no_avg_round_store_half_pel_8x2_avx2(
509 : const __m256i res, const __m256i offset, ConvBufType *const dst,
510 : const int32_t dst_stride) {
511 40594300 : const __m256i d = jnt_2d_round_half_pel_avx2(res, offset);
512 40592500 : storeu_u16_8x2_avx2(d, dst, dst_stride);
513 40597600 : }
514 :
515 266936000 : static INLINE void jnt_2d_no_avg_round_store_16x2_avx2(
516 : const __m256i res[4], const __m256i offset, ConvBufType *const dst,
517 : const int32_t dst_stride) {
518 266936000 : const __m256i d0 = jnt_2d_round_16_avx2(res + 0, offset);
519 267823000 : const __m256i d1 = jnt_2d_round_16_avx2(res + 2, offset);
520 : _mm256_storeu_si256((__m256i *)dst, d0);
521 268116000 : _mm256_storeu_si256((__m256i *)(dst + dst_stride), d1);
522 268116000 : }
523 :
524 43083800 : static INLINE void jnt_2d_no_avg_round_store_half_pel_16x2_avx2(
525 : const __m256i res[2], const __m256i offset, ConvBufType *const dst,
526 : const int32_t dst_stride) {
527 43083800 : const __m256i d0 = jnt_2d_round_half_pel_avx2(res[0], offset);
528 43081900 : const __m256i d1 = jnt_2d_round_half_pel_avx2(res[1], offset);
529 : _mm256_storeu_si256((__m256i *)dst, d0);
530 43080500 : _mm256_storeu_si256((__m256i *)(dst + dst_stride), d1);
531 43080500 : }
532 :
533 581160000 : static INLINE void jnt_2d_no_avg_round_store_32_avx2(const __m256i r0[2],
534 : const __m256i r1[2],
535 : const __m256i offset,
536 : ConvBufType *const dst) {
537 581160000 : const __m256i d0 = jnt_2d_round_16_avx2(r0, offset);
538 585857000 : const __m256i d1 = jnt_2d_round_16_avx2(r1, offset);
539 585480000 : jnt_no_avg_store_16x2_avx2(d0, d1, dst, 16);
540 585649000 : }
541 :
542 91258500 : static INLINE void jnt_2d_no_avg_round_store_half_pel_32_avx2(
543 : const __m256i res[2], const __m256i offset, ConvBufType *const dst) {
544 91258500 : const __m256i d0 = jnt_2d_round_half_pel_avx2(res[0], offset);
545 91261800 : const __m256i d1 = jnt_2d_round_half_pel_avx2(res[1], offset);
546 91227100 : jnt_no_avg_store_16x2_avx2(d0, d1, dst, 16);
547 91245500 : }
548 :
549 92521900 : static void jnt_convolve_2d_hor_2tap_avx2(
550 : const uint8_t *src, const int32_t src_stride, const int32_t w,
551 : const int32_t h, const InterpFilterParams *filter_params_x,
552 : const int32_t subpel_x_q4, int16_t *const im_block) {
553 92521900 : const uint8_t *src_ptr = src;
554 92521900 : int32_t y = h;
555 92521900 : int16_t *im = im_block;
556 : __m128i coeffs_128[4];
557 : __m256i coeffs_256[4];
558 :
559 92521900 : if (w <= 8) {
560 40526200 : prepare_half_coeffs_2tap_ssse3(
561 : filter_params_x, subpel_x_q4, coeffs_128);
562 :
563 40532600 : if (w == 2) {
564 : do {
565 : const __m128i r =
566 0 : x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
567 0 : xy_x_round_store_2x2_sse2(r, im);
568 0 : src_ptr += 2 * src_stride;
569 0 : im += 2 * 2;
570 0 : y -= 2;
571 0 : } while (y);
572 : }
573 40535200 : else if (w == 4) {
574 : do {
575 : const __m128i r =
576 0 : x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
577 0 : xy_x_round_store_4x2_sse2(r, im);
578 0 : src_ptr += 2 * src_stride;
579 0 : im += 2 * 4;
580 0 : y -= 2;
581 0 : } while (y);
582 : }
583 : else {
584 40539300 : assert(w == 8);
585 :
586 : do {
587 : __m128i r[2];
588 :
589 331476000 : x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, r);
590 330888000 : xy_x_round_store_8x2_sse2(r, im);
591 330635000 : src_ptr += 2 * src_stride;
592 330635000 : im += 2 * 8;
593 330635000 : y -= 2;
594 330635000 : } while (y);
595 : }
596 : }
597 : else {
598 51995700 : prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
599 :
600 52341500 : if (w == 16) {
601 : do {
602 : __m256i r[2];
603 :
604 325434000 : x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
605 323488000 : xy_x_round_store_32_avx2(r, im);
606 325072000 : src_ptr += 2 * src_stride;
607 325072000 : im += 2 * 16;
608 325072000 : y -= 2;
609 325072000 : } while (y);
610 : }
611 21555300 : else if (w == 32) {
612 : do {
613 388049000 : xy_x_2tap_32_avx2(src_ptr, coeffs_256, im);
614 387730000 : src_ptr += src_stride;
615 387730000 : im += 32;
616 387730000 : } while (--y);
617 : }
618 4744960 : else if (w == 64) {
619 : do {
620 154700000 : xy_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, im + 0 * 32);
621 154650000 : xy_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, im + 1 * 32);
622 154645000 : src_ptr += src_stride;
623 154645000 : im += 64;
624 154645000 : } while (--y);
625 : }
626 : else {
627 0 : assert(w == 128);
628 :
629 : do {
630 0 : xy_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, im + 0 * 32);
631 0 : xy_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, im + 1 * 32);
632 0 : xy_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, im + 2 * 32);
633 0 : xy_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, im + 3 * 32);
634 0 : src_ptr += src_stride;
635 0 : im += 128;
636 0 : } while (--y);
637 : }
638 : }
639 91304300 : }
640 :
641 1842500 : static void jnt_convolve_2d_hor_4tap_avx2(
642 : const uint8_t *src, const int32_t src_stride, const int32_t w,
643 : const int32_t h, const InterpFilterParams *filter_params_x,
644 : const int32_t subpel_x_q4, int16_t *const im_block) {
645 1842500 : const uint8_t *src_ptr = src - 1;
646 1842500 : int32_t y = h;
647 1842500 : int16_t *im = im_block;
648 : __m128i coeffs_128[4];
649 :
650 1842500 : prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
651 :
652 1842530 : if (w == 2) {
653 : do {
654 : const __m128i r =
655 0 : x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
656 0 : xy_x_round_store_2x2_sse2(r, im);
657 0 : src_ptr += 2 * src_stride;
658 0 : im += 2 * 2;
659 0 : y -= 2;
660 0 : } while (y);
661 : }
662 : else {
663 1842530 : assert(w == 4);
664 :
665 : do {
666 : const __m128i r =
667 12752200 : x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
668 12751600 : xy_x_round_store_4x2_sse2(r, im);
669 12750700 : src_ptr += 2 * src_stride;
670 12750700 : im += 2 * 4;
671 12750700 : y -= 2;
672 12750700 : } while (y);
673 : }
674 1841070 : }
675 :
676 38411000 : static void jnt_convolve_2d_hor_6tap_avx2(
677 : const uint8_t *src, const int32_t src_stride, const int32_t w,
678 : const int32_t h, const InterpFilterParams *filter_params_x,
679 : const int32_t subpel_x_q4, int16_t *const im_block) {
680 38411000 : const uint8_t *src_ptr = src - 2;
681 38411000 : int32_t y = h;
682 38411000 : int16_t *im = im_block;
683 : __m256i coeffs_256[4], filt_256[4];
684 :
685 38411000 : filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
686 38411000 : filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
687 38411000 : filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
688 :
689 38411000 : prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
690 :
691 38440200 : if (w == 8) {
692 : do {
693 182864000 : const __m256i res = x_convolve_6tap_8x2_avx2(
694 : src_ptr, src_stride, coeffs_256, filt_256);
695 182558000 : xy_x_round_store_8x2_avx2(res, im);
696 182732000 : src_ptr += 2 * src_stride;
697 182732000 : im += 2 * 8;
698 182732000 : y -= 2;
699 182732000 : } while (y);
700 : }
701 20671400 : else if (w == 16) {
702 : do {
703 : __m256i r[2];
704 :
705 150065000 : x_convolve_6tap_16x2_avx2(
706 : src_ptr, src_stride, coeffs_256, filt_256, r);
707 149552000 : xy_x_round_store_32_avx2(r, im);
708 149999000 : src_ptr += 2 * src_stride;
709 149999000 : im += 2 * 16;
710 149999000 : y -= 2;
711 149999000 : } while (y);
712 : }
713 8763400 : else if (w == 32) {
714 : do {
715 172998000 : xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
716 172938000 : src_ptr += src_stride;
717 172938000 : im += 32;
718 172938000 : } while (--y);
719 : }
720 1915660 : else if (w == 64) {
721 : do {
722 64319500 : xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
723 64328000 : xy_x_6tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
724 64314700 : src_ptr += src_stride;
725 64314700 : im += 64;
726 64314700 : } while (--y);
727 : }
728 : else {
729 0 : assert(w == 128);
730 :
731 : do {
732 0 : xy_x_6tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
733 0 : xy_x_6tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
734 0 : xy_x_6tap_32_avx2(src_ptr + 64, 16, coeffs_256, filt_256, im + 64);
735 0 : xy_x_6tap_32_avx2(src_ptr + 96, 16, coeffs_256, filt_256, im + 96);
736 0 : src_ptr += src_stride;
737 0 : im += 128;
738 0 : } while (--y);
739 : }
740 38175800 : }
741 :
742 16457900 : static void jnt_convolve_2d_hor_8tap_avx2(
743 : const uint8_t *src, const int32_t src_stride, const int32_t w,
744 : const int32_t h, const InterpFilterParams *filter_params_x,
745 : const int32_t subpel_x_q4, int16_t *const im_block) {
746 16457900 : const uint8_t *src_ptr = src - 3;
747 16457900 : int32_t y = h;
748 16457900 : int16_t *im = im_block;
749 : __m256i coeffs_256[4], filt_256[4];
750 :
751 16457900 : filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
752 16457900 : filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
753 16457900 : filt_256[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
754 16457900 : filt_256[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
755 :
756 16457900 : prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
757 :
758 16464100 : if (w == 8) {
759 : do {
760 84064200 : const __m256i res = x_convolve_8tap_8x2_avx2(
761 : src_ptr, src_stride, coeffs_256, filt_256);
762 84007400 : xy_x_round_store_8x2_avx2(res, im);
763 84039500 : src_ptr += 2 * src_stride;
764 84039500 : im += 2 * 8;
765 84039500 : y -= 2;
766 84039500 : } while (y);
767 : }
768 9201710 : else if (w == 16) {
769 : do {
770 : __m256i r[2];
771 :
772 : x_convolve_8tap_16x2_avx2(
773 : src_ptr, src_stride, coeffs_256, filt_256, r);
774 71086600 : xy_x_round_store_32_avx2(r, im);
775 71155700 : src_ptr += 2 * src_stride;
776 71155700 : im += 2 * 16;
777 71155700 : y -= 2;
778 71155700 : } while (y);
779 : }
780 4162370 : else if (w == 32) {
781 : do {
782 89758000 : xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
783 89743900 : src_ptr += src_stride;
784 89743900 : im += 32;
785 89743900 : } while (--y);
786 : }
787 972550 : else if (w == 64) {
788 : do {
789 35313800 : xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
790 35311700 : xy_x_8tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
791 35311700 : src_ptr += src_stride;
792 35311700 : im += 64;
793 35311700 : } while (--y);
794 : }
795 : else {
796 0 : assert(w == 128);
797 :
798 : do {
799 0 : xy_x_8tap_32_avx2(src_ptr, 16, coeffs_256, filt_256, im);
800 0 : xy_x_8tap_32_avx2(src_ptr + 32, 16, coeffs_256, filt_256, im + 32);
801 0 : xy_x_8tap_32_avx2(src_ptr + 64, 16, coeffs_256, filt_256, im + 64);
802 0 : xy_x_8tap_32_avx2(src_ptr + 96, 16, coeffs_256, filt_256, im + 96);
803 0 : src_ptr += src_stride;
804 0 : im += 128;
805 0 : } while (--y);
806 : }
807 16424800 : }
808 :
809 74381700 : static void jnt_convolve_2d_ver_2tap_avx2(
810 : const int16_t *const im_block, const int32_t w, const int32_t h,
811 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
812 : const ConvolveParams *const conv_params, uint8_t *dst8,
813 : const int32_t dst8_stride) {
814 74381700 : const int32_t dst_stride = conv_params->dst_stride;
815 74381700 : const int32_t bd = 8;
816 74381700 : const int32_t round_0 = 3;
817 74381700 : const int16_t *im = im_block;
818 74381700 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
819 74381700 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
820 74381700 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
821 74381700 : const int32_t round_offset = 1 << (offset_bits - round_1);
822 74381700 : const int32_t factor =
823 74381700 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
824 74381700 : const int32_t offset_comp_avg =
825 74381700 : (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
826 74381700 : (round_offset << DIST_PRECISION_BITS) -
827 74381700 : (round_offset << (DIST_PRECISION_BITS - 1)) +
828 74381700 : (1 << (round_bits + DIST_PRECISION_BITS - 1));
829 74381700 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
830 74381700 : const __m128i factor_128 = _mm_set1_epi32(factor);
831 74381700 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
832 74381700 : const __m256i factor_256 = _mm256_set1_epi32(factor);
833 74381700 : const int32_t offset_avg = (1 << (round_1 - 1)) +
834 74381700 : (1 << (round_bits + round_1)) -
835 74381700 : (1 << offset_bits) - (1 << (offset_bits - 1));
836 74381700 : const int32_t offset_no_avg =
837 74381700 : (1 << (round_1 - 1)) + (1 << offset_bits) + (1 << (offset_bits - 1));
838 74381700 : const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
839 74381700 : const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
840 74381700 : const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
841 74381700 : const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
842 74381700 : ConvBufType *dst = conv_params->dst;
843 74381700 : int32_t y = h;
844 : __m128i coeffs_128[4];
845 : __m256i coeffs_256[4];
846 :
847 74381700 : if (w <= 4) {
848 0 : prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
849 :
850 0 : if (w == 2) {
851 : __m128i s_32[2];
852 :
853 0 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
854 :
855 0 : if (conv_params->do_average) {
856 0 : if (conv_params->use_jnt_comp_avg) {
857 : do {
858 : const __m128i res =
859 0 : xy_y_convolve_2tap_2x2_sse2(im, s_32, coeffs_128);
860 0 : jnt_2d_comp_avg_round_store_2x2_sse2(
861 : res,
862 : factor_128,
863 : offset_comp_avg_128,
864 : dst,
865 : dst_stride,
866 : dst8,
867 : dst8_stride);
868 0 : im += 2 * 2;
869 0 : dst += 2 * dst_stride;
870 0 : dst8 += 2 * dst8_stride;
871 0 : y -= 2;
872 0 : } while (y);
873 : }
874 : else {
875 : do {
876 : const __m128i res =
877 0 : xy_y_convolve_2tap_2x2_sse2(im, s_32, coeffs_128);
878 0 : jnt_2d_avg_round_store_2x2_sse2(res,
879 : offset_avg_128,
880 : dst,
881 : dst_stride,
882 : dst8,
883 : dst8_stride);
884 0 : im += 2 * 2;
885 0 : dst += 2 * dst_stride;
886 0 : dst8 += 2 * dst8_stride;
887 0 : y -= 2;
888 0 : } while (y);
889 : }
890 : }
891 : else {
892 : do {
893 : const __m128i res =
894 0 : xy_y_convolve_2tap_2x2_sse2(im, s_32, coeffs_128);
895 0 : jnt_2d_no_avg_round_store_2x2_sse2(
896 : res, offset_no_avg_128, dst, dst_stride);
897 0 : im += 2 * 2;
898 0 : dst += 2 * dst_stride;
899 0 : y -= 2;
900 0 : } while (y);
901 : }
902 : }
903 : else {
904 : __m128i s_64[2], r[2];
905 :
906 0 : assert(w == 4);
907 :
908 0 : s_64[0] = _mm_loadl_epi64((__m128i *)im);
909 :
910 0 : if (conv_params->do_average) {
911 0 : if (conv_params->use_jnt_comp_avg) {
912 : do {
913 0 : xy_y_convolve_2tap_4x2_sse2(im, s_64, coeffs_128, r);
914 0 : jnt_2d_comp_avg_round_store_4x2_sse2(
915 : r,
916 : factor_128,
917 : offset_comp_avg_128,
918 : dst,
919 : dst_stride,
920 : dst8,
921 : dst8_stride);
922 0 : im += 2 * 4;
923 0 : dst += 2 * dst_stride;
924 0 : dst8 += 2 * dst8_stride;
925 0 : y -= 2;
926 0 : } while (y);
927 : }
928 : else {
929 : do {
930 0 : xy_y_convolve_2tap_4x2_sse2(im, s_64, coeffs_128, r);
931 0 : jnt_2d_avg_round_store_4x2_sse2(r,
932 : offset_avg_128,
933 : dst,
934 : dst_stride,
935 : dst8,
936 : dst8_stride);
937 0 : im += 2 * 4;
938 0 : dst += 2 * dst_stride;
939 0 : dst8 += 2 * dst8_stride;
940 0 : y -= 2;
941 0 : } while (y);
942 : }
943 : }
944 : else {
945 : do {
946 0 : xy_y_convolve_2tap_4x2_sse2(im, s_64, coeffs_128, r);
947 0 : jnt_2d_no_avg_round_store_4x2_sse2(
948 : r, offset_no_avg_128, dst, dst_stride);
949 0 : im += 2 * 4;
950 0 : dst += 2 * dst_stride;
951 0 : y -= 2;
952 0 : } while (y);
953 : }
954 : }
955 : }
956 : else {
957 74381700 : prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
958 :
959 74443600 : if (w == 8) {
960 : __m128i s_128[2];
961 : __m256i r[2];
962 :
963 32584400 : s_128[0] = _mm_load_si128((__m128i *)im);
964 :
965 32584400 : if (conv_params->do_average) {
966 8919240 : if (conv_params->use_jnt_comp_avg) {
967 : do {
968 33826600 : xy_y_convolve_2tap_8x2_avx2(im, s_128, coeffs_256, r);
969 33799700 : jnt_2d_comp_avg_round_store_8x2_avx2(
970 : r,
971 : factor_256,
972 : offset_comp_avg_256,
973 : dst,
974 : dst_stride,
975 : dst8,
976 : dst8_stride);
977 33820200 : im += 2 * 8;
978 33820200 : dst += 2 * dst_stride;
979 33820200 : dst8 += 2 * dst8_stride;
980 33820200 : y -= 2;
981 33820200 : } while (y);
982 : }
983 : else {
984 : do {
985 33815000 : xy_y_convolve_2tap_8x2_avx2(im, s_128, coeffs_256, r);
986 33803500 : jnt_2d_avg_round_store_8x2_avx2(r,
987 : offset_avg_256,
988 : dst,
989 : dst_stride,
990 : dst8,
991 : dst8_stride);
992 33804600 : im += 2 * 8;
993 33804600 : dst += 2 * dst_stride;
994 33804600 : dst8 += 2 * dst8_stride;
995 33804600 : y -= 2;
996 33804600 : } while (y);
997 : }
998 : }
999 : else {
1000 : do {
1001 168593000 : xy_y_convolve_2tap_8x2_avx2(im, s_128, coeffs_256, r);
1002 168291000 : jnt_2d_no_avg_round_store_8x2_avx2(
1003 : r, offset_no_avg_256, dst, dst_stride);
1004 168598000 : im += 2 * 8;
1005 168598000 : dst += 2 * dst_stride;
1006 168598000 : y -= 2;
1007 168598000 : } while (y);
1008 : }
1009 : }
1010 41859200 : else if (w == 16) {
1011 : __m256i s_256[2], r[4];
1012 :
1013 24531400 : s_256[0] = _mm256_load_si256((__m256i *)im);
1014 :
1015 24531400 : if (conv_params->do_average) {
1016 6635360 : if (conv_params->use_jnt_comp_avg) {
1017 : do {
1018 33634400 : xy_y_convolve_2tap_16x2_avx2(im, s_256, coeffs_256, r);
1019 33584700 : jnt_2d_comp_avg_round_store_16x2_avx2(
1020 : r,
1021 : factor_256,
1022 : offset_comp_avg_256,
1023 : dst,
1024 : dst_stride,
1025 : dst8,
1026 : dst8_stride);
1027 33627500 : im += 2 * 16;
1028 33627500 : dst += 2 * dst_stride;
1029 33627500 : dst8 += 2 * dst8_stride;
1030 33627500 : y -= 2;
1031 33627500 : } while (y);
1032 : }
1033 : else {
1034 : do {
1035 33635800 : xy_y_convolve_2tap_16x2_avx2(im, s_256, coeffs_256, r);
1036 33594600 : jnt_2d_avg_round_store_16x2_avx2(r,
1037 : offset_avg_256,
1038 : dst,
1039 : dst_stride,
1040 : dst8,
1041 : dst8_stride);
1042 33629900 : im += 2 * 16;
1043 33629900 : dst += 2 * dst_stride;
1044 33629900 : dst8 += 2 * dst8_stride;
1045 33629900 : y -= 2;
1046 33629900 : } while (y);
1047 : }
1048 : }
1049 : else {
1050 : do {
1051 168345000 : xy_y_convolve_2tap_16x2_avx2(im, s_256, coeffs_256, r);
1052 167555000 : jnt_2d_no_avg_round_store_16x2_avx2(
1053 : r, offset_no_avg_256, dst, dst_stride);
1054 168358000 : im += 2 * 16;
1055 168358000 : dst += 2 * dst_stride;
1056 168358000 : y -= 2;
1057 168358000 : } while (y);
1058 : }
1059 : }
1060 17327800 : else if (w == 32) {
1061 : __m256i s_256[2][2];
1062 :
1063 13570000 : s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
1064 13570000 : s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
1065 :
1066 13570000 : if (conv_params->do_average) {
1067 4051380 : if (conv_params->use_jnt_comp_avg) {
1068 : do {
1069 : __m256i r[4];
1070 :
1071 22251500 : xy_y_convolve_2tap_32_avx2(
1072 : im + 1 * 32, s_256[0], s_256[1], coeffs_256, r);
1073 22228300 : jnt_2d_comp_avg_round_store_32_avx2(r + 0,
1074 : r + 2,
1075 : factor_256,
1076 : offset_comp_avg_256,
1077 : dst,
1078 : dst8);
1079 :
1080 22249500 : xy_y_convolve_2tap_32_avx2(
1081 : im + 2 * 32, s_256[1], s_256[0], coeffs_256, r);
1082 22228900 : jnt_2d_comp_avg_round_store_32_avx2(r + 0,
1083 : r + 2,
1084 : factor_256,
1085 : offset_comp_avg_256,
1086 22228900 : dst + dst_stride,
1087 : dst8 + dst8_stride);
1088 :
1089 22248300 : im += 2 * 32;
1090 22248300 : dst += 2 * dst_stride;
1091 22248300 : dst8 += 2 * dst8_stride;
1092 22248300 : y -= 2;
1093 22248300 : } while (y);
1094 : }
1095 : else {
1096 : do {
1097 : __m256i r[4];
1098 :
1099 22249300 : xy_y_convolve_2tap_32_avx2(
1100 : im + 1 * 32, s_256[0], s_256[1], coeffs_256, r);
1101 22233300 : jnt_2d_avg_round_store_32_avx2(
1102 : r + 0, r + 2, offset_avg_256, dst, dst8);
1103 :
1104 22246300 : xy_y_convolve_2tap_32_avx2(
1105 : im + 2 * 32, s_256[1], s_256[0], coeffs_256, r);
1106 22233100 : jnt_2d_avg_round_store_32_avx2(r + 0,
1107 : r + 2,
1108 : offset_avg_256,
1109 22233100 : dst + dst_stride,
1110 : dst8 + dst8_stride);
1111 :
1112 22244100 : im += 2 * 32;
1113 22244100 : dst += 2 * dst_stride;
1114 22244100 : dst8 += 2 * dst8_stride;
1115 22244100 : y -= 2;
1116 22244100 : } while (y);
1117 : }
1118 : }
1119 : else {
1120 : do {
1121 : __m256i r[4];
1122 :
1123 105580000 : xy_y_convolve_2tap_32_avx2(
1124 : im + 1 * 32, s_256[0], s_256[1], coeffs_256, r);
1125 105315000 : jnt_2d_no_avg_round_store_32_avx2(
1126 : r + 0, r + 2, offset_no_avg_256, dst);
1127 :
1128 105586000 : xy_y_convolve_2tap_32_avx2(
1129 : im + 2 * 32, s_256[1], s_256[0], coeffs_256, r);
1130 105340000 : jnt_2d_no_avg_round_store_32_avx2(
1131 105340000 : r + 0, r + 2, offset_no_avg_256, dst + dst_stride);
1132 :
1133 105586000 : im += 2 * 32;
1134 105586000 : dst += 2 * dst_stride;
1135 105586000 : y -= 2;
1136 105586000 : } while (y);
1137 : }
1138 : }
1139 3757820 : else if (w == 64) {
1140 : __m256i s_256[2][4];
1141 :
1142 3835020 : s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
1143 3835020 : s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
1144 3835020 : s_256[0][2] = _mm256_load_si256((__m256i *)(im + 2 * 16));
1145 3835020 : s_256[0][3] = _mm256_load_si256((__m256i *)(im + 3 * 16));
1146 :
1147 3835020 : if (conv_params->do_average) {
1148 1288640 : if (conv_params->use_jnt_comp_avg) {
1149 : do {
1150 : __m256i r[4];
1151 :
1152 10199900 : xy_y_convolve_2tap_32_avx2(im + 2 * 32,
1153 : s_256[0] + 0,
1154 : s_256[1] + 0,
1155 : coeffs_256,
1156 : r);
1157 10192600 : jnt_2d_comp_avg_round_store_32_avx2(r + 0,
1158 : r + 2,
1159 : factor_256,
1160 : offset_comp_avg_256,
1161 : dst,
1162 : dst8);
1163 :
1164 10199400 : xy_y_convolve_2tap_32_avx2(im + 3 * 32,
1165 : s_256[0] + 2,
1166 : s_256[1] + 2,
1167 : coeffs_256,
1168 : r);
1169 10192100 : jnt_2d_comp_avg_round_store_32_avx2(r + 0,
1170 : r + 2,
1171 : factor_256,
1172 : offset_comp_avg_256,
1173 10192100 : dst + 32,
1174 : dst8 + 32);
1175 10199100 : im += 2 * 64;
1176 :
1177 10199100 : xy_y_convolve_2tap_32_avx2(im + 0 * 32,
1178 : s_256[1] + 0,
1179 : s_256[0] + 0,
1180 : coeffs_256,
1181 : r);
1182 10191400 : jnt_2d_comp_avg_round_store_32_avx2(r + 0,
1183 : r + 2,
1184 : factor_256,
1185 : offset_comp_avg_256,
1186 10191400 : dst + dst8_stride,
1187 : dst8 + dst8_stride);
1188 :
1189 10199000 : xy_y_convolve_2tap_32_avx2(im + 1 * 32,
1190 : s_256[1] + 2,
1191 : s_256[0] + 2,
1192 : coeffs_256,
1193 : r);
1194 10191600 : jnt_2d_comp_avg_round_store_32_avx2(
1195 : r + 0,
1196 : r + 2,
1197 : factor_256,
1198 : offset_comp_avg_256,
1199 10191600 : dst + dst8_stride + 32,
1200 10191600 : dst8 + dst8_stride + 32);
1201 :
1202 10199300 : dst += 2 * dst_stride;
1203 10199300 : dst8 += 2 * dst8_stride;
1204 10199300 : y -= 2;
1205 10199300 : } while (y);
1206 : }
1207 : else {
1208 : do {
1209 : __m256i r[4];
1210 :
1211 10198400 : xy_y_convolve_2tap_32_avx2(im + 2 * 32,
1212 : s_256[0] + 0,
1213 : s_256[1] + 0,
1214 : coeffs_256,
1215 : r);
1216 10192700 : jnt_2d_avg_round_store_32_avx2(
1217 : r + 0, r + 2, offset_avg_256, dst, dst8);
1218 :
1219 10197600 : xy_y_convolve_2tap_32_avx2(im + 3 * 32,
1220 : s_256[0] + 2,
1221 : s_256[1] + 2,
1222 : coeffs_256,
1223 : r);
1224 10192800 : jnt_2d_avg_round_store_32_avx2(
1225 10192800 : r + 0, r + 2, offset_avg_256, dst + 32, dst8 + 32);
1226 10197600 : im += 2 * 64;
1227 :
1228 10197600 : xy_y_convolve_2tap_32_avx2(im + 0 * 32,
1229 : s_256[1] + 0,
1230 : s_256[0] + 0,
1231 : coeffs_256,
1232 : r);
1233 10193300 : jnt_2d_avg_round_store_32_avx2(r + 0,
1234 : r + 2,
1235 : offset_avg_256,
1236 10193300 : dst + dst_stride,
1237 : dst8 + dst8_stride);
1238 :
1239 10197600 : xy_y_convolve_2tap_32_avx2(im + 1 * 32,
1240 : s_256[1] + 2,
1241 : s_256[0] + 2,
1242 : coeffs_256,
1243 : r);
1244 10192900 : jnt_2d_avg_round_store_32_avx2(r + 0,
1245 : r + 2,
1246 : offset_avg_256,
1247 10192900 : dst + dst_stride + 32,
1248 10192900 : dst8 + dst8_stride + 32);
1249 :
1250 10197100 : dst += 2 * dst_stride;
1251 10197100 : dst8 += 2 * dst8_stride;
1252 10197100 : y -= 2;
1253 10197100 : } while (y);
1254 : }
1255 : }
1256 : else {
1257 : do {
1258 : __m256i r[4];
1259 :
1260 40276100 : xy_y_convolve_2tap_32_avx2(
1261 : im + 2 * 32, s_256[0] + 0, s_256[1] + 0, coeffs_256, r);
1262 40215200 : jnt_2d_no_avg_round_store_32_avx2(
1263 : r + 0, r + 2, offset_no_avg_256, dst);
1264 :
1265 40280100 : xy_y_convolve_2tap_32_avx2(
1266 : im + 3 * 32, s_256[0] + 2, s_256[1] + 2, coeffs_256, r);
1267 40219800 : jnt_2d_no_avg_round_store_32_avx2(
1268 : r + 0, r + 2, offset_no_avg_256, dst + 32);
1269 40278900 : im += 2 * 64;
1270 :
1271 40278900 : xy_y_convolve_2tap_32_avx2(
1272 : im + 0 * 32, s_256[1] + 0, s_256[0] + 0, coeffs_256, r);
1273 40217800 : jnt_2d_no_avg_round_store_32_avx2(
1274 40217800 : r + 0, r + 2, offset_no_avg_256, dst + dst_stride);
1275 :
1276 40277600 : xy_y_convolve_2tap_32_avx2(
1277 : im + 1 * 32, s_256[1] + 2, s_256[0] + 2, coeffs_256, r);
1278 40217000 : jnt_2d_no_avg_round_store_32_avx2(
1279 40217000 : r + 0, r + 2, offset_no_avg_256, dst + dst_stride + 32);
1280 :
1281 40278000 : dst += 2 * dst_stride;
1282 40278000 : y -= 2;
1283 40278000 : } while (y);
1284 : }
1285 : }
1286 : else {
1287 : __m256i s_256[2][8];
1288 :
1289 0 : assert(w == 128);
1290 :
1291 : load_16bit_8rows_avx2(im, 16, s_256[0]);
1292 :
1293 0 : if (conv_params->do_average) {
1294 0 : if (conv_params->use_jnt_comp_avg) {
1295 : do {
1296 : __m256i r[4];
1297 :
1298 0 : xy_y_convolve_2tap_32_avx2(im + 4 * 32,
1299 : s_256[0] + 0,
1300 : s_256[1] + 0,
1301 : coeffs_256,
1302 : r);
1303 0 : jnt_2d_comp_avg_round_store_32_avx2(r + 0,
1304 : r + 2,
1305 : factor_256,
1306 : offset_comp_avg_256,
1307 : dst,
1308 : dst8);
1309 :
1310 0 : xy_y_convolve_2tap_32_avx2(im + 5 * 32,
1311 : s_256[0] + 2,
1312 : s_256[1] + 2,
1313 : coeffs_256,
1314 : r);
1315 0 : jnt_2d_comp_avg_round_store_32_avx2(r + 0,
1316 : r + 2,
1317 : factor_256,
1318 : offset_comp_avg_256,
1319 0 : dst + 1 * 32,
1320 : dst8 + 1 * 32);
1321 :
1322 0 : xy_y_convolve_2tap_32_avx2(im + 6 * 32,
1323 : s_256[0] + 4,
1324 : s_256[1] + 4,
1325 : coeffs_256,
1326 : r);
1327 0 : jnt_2d_comp_avg_round_store_32_avx2(r + 0,
1328 : r + 2,
1329 : factor_256,
1330 : offset_comp_avg_256,
1331 0 : dst + 2 * 32,
1332 : dst8 + 2 * 32);
1333 :
1334 0 : xy_y_convolve_2tap_32_avx2(im + 7 * 32,
1335 : s_256[0] + 6,
1336 : s_256[1] + 6,
1337 : coeffs_256,
1338 : r);
1339 0 : jnt_2d_comp_avg_round_store_32_avx2(r + 0,
1340 : r + 2,
1341 : factor_256,
1342 : offset_comp_avg_256,
1343 0 : dst + 3 * 32,
1344 : dst8 + 3 * 32);
1345 0 : im += 2 * 128;
1346 :
1347 0 : xy_y_convolve_2tap_32_avx2(im + 0 * 32,
1348 : s_256[1] + 0,
1349 : s_256[0] + 0,
1350 : coeffs_256,
1351 : r);
1352 0 : jnt_2d_comp_avg_round_store_32_avx2(
1353 : r + 0,
1354 : r + 2,
1355 : factor_256,
1356 : offset_comp_avg_256,
1357 0 : dst + dst8_stride + 0 * 32,
1358 : dst8 + dst8_stride + 0 * 32);
1359 :
1360 0 : xy_y_convolve_2tap_32_avx2(im + 1 * 32,
1361 : s_256[1] + 2,
1362 : s_256[0] + 2,
1363 : coeffs_256,
1364 : r);
1365 0 : jnt_2d_comp_avg_round_store_32_avx2(
1366 : r + 0,
1367 : r + 2,
1368 : factor_256,
1369 : offset_comp_avg_256,
1370 0 : dst + dst8_stride + 1 * 32,
1371 0 : dst8 + dst8_stride + 1 * 32);
1372 :
1373 0 : xy_y_convolve_2tap_32_avx2(im + 2 * 32,
1374 : s_256[1] + 4,
1375 : s_256[0] + 4,
1376 : coeffs_256,
1377 : r);
1378 0 : jnt_2d_comp_avg_round_store_32_avx2(
1379 : r + 0,
1380 : r + 2,
1381 : factor_256,
1382 : offset_comp_avg_256,
1383 0 : dst + dst8_stride + 2 * 32,
1384 0 : dst8 + dst8_stride + 2 * 32);
1385 :
1386 0 : xy_y_convolve_2tap_32_avx2(im + 3 * 32,
1387 : s_256[1] + 6,
1388 : s_256[0] + 6,
1389 : coeffs_256,
1390 : r);
1391 0 : jnt_2d_comp_avg_round_store_32_avx2(
1392 : r + 0,
1393 : r + 2,
1394 : factor_256,
1395 : offset_comp_avg_256,
1396 0 : dst + dst8_stride + 3 * 32,
1397 0 : dst8 + dst8_stride + 3 * 32);
1398 :
1399 0 : dst += 2 * dst_stride;
1400 0 : dst8 += 2 * dst8_stride;
1401 0 : y -= 2;
1402 0 : } while (y);
1403 : }
1404 : else {
1405 : do {
1406 : __m256i r[4];
1407 :
1408 0 : xy_y_convolve_2tap_32_avx2(im + 4 * 32,
1409 : s_256[0] + 0,
1410 : s_256[1] + 0,
1411 : coeffs_256,
1412 : r);
1413 0 : jnt_2d_avg_round_store_32_avx2(r + 0,
1414 : r + 2,
1415 : offset_avg_256,
1416 : dst + 0 * 32,
1417 : dst8 + 0 * 32);
1418 :
1419 0 : xy_y_convolve_2tap_32_avx2(im + 5 * 32,
1420 : s_256[0] + 2,
1421 : s_256[1] + 2,
1422 : coeffs_256,
1423 : r);
1424 0 : jnt_2d_avg_round_store_32_avx2(r + 0,
1425 : r + 2,
1426 : offset_avg_256,
1427 0 : dst + 1 * 32,
1428 : dst8 + 1 * 32);
1429 :
1430 0 : xy_y_convolve_2tap_32_avx2(im + 6 * 32,
1431 : s_256[0] + 4,
1432 : s_256[1] + 4,
1433 : coeffs_256,
1434 : r);
1435 0 : jnt_2d_avg_round_store_32_avx2(r + 0,
1436 : r + 2,
1437 : offset_avg_256,
1438 0 : dst + 2 * 32,
1439 : dst8 + 2 * 32);
1440 :
1441 0 : xy_y_convolve_2tap_32_avx2(im + 7 * 32,
1442 : s_256[0] + 6,
1443 : s_256[1] + 6,
1444 : coeffs_256,
1445 : r);
1446 0 : jnt_2d_avg_round_store_32_avx2(r + 0,
1447 : r + 2,
1448 : offset_avg_256,
1449 0 : dst + 3 * 32,
1450 : dst8 + 3 * 32);
1451 0 : im += 2 * 128;
1452 :
1453 0 : xy_y_convolve_2tap_32_avx2(im + 0 * 32,
1454 : s_256[1] + 0,
1455 : s_256[0] + 0,
1456 : coeffs_256,
1457 : r);
1458 0 : jnt_2d_avg_round_store_32_avx2(
1459 : r + 0,
1460 : r + 2,
1461 : offset_avg_256,
1462 0 : dst + dst_stride + 0 * 32,
1463 : dst8 + dst8_stride + 0 * 32);
1464 :
1465 0 : xy_y_convolve_2tap_32_avx2(im + 1 * 32,
1466 : s_256[1] + 2,
1467 : s_256[0] + 2,
1468 : coeffs_256,
1469 : r);
1470 0 : jnt_2d_avg_round_store_32_avx2(
1471 : r + 0,
1472 : r + 2,
1473 : offset_avg_256,
1474 0 : dst + dst_stride + 1 * 32,
1475 0 : dst8 + dst8_stride + 1 * 32);
1476 :
1477 0 : xy_y_convolve_2tap_32_avx2(im + 2 * 32,
1478 : s_256[1] + 4,
1479 : s_256[0] + 4,
1480 : coeffs_256,
1481 : r);
1482 0 : jnt_2d_avg_round_store_32_avx2(
1483 : r + 0,
1484 : r + 2,
1485 : offset_avg_256,
1486 0 : dst + dst_stride + 2 * 32,
1487 0 : dst8 + dst8_stride + 2 * 32);
1488 :
1489 0 : xy_y_convolve_2tap_32_avx2(im + 3 * 32,
1490 : s_256[1] + 6,
1491 : s_256[0] + 6,
1492 : coeffs_256,
1493 : r);
1494 0 : jnt_2d_avg_round_store_32_avx2(
1495 : r + 0,
1496 : r + 2,
1497 : offset_avg_256,
1498 0 : dst + dst_stride + 3 * 32,
1499 0 : dst8 + dst8_stride + 3 * 32);
1500 :
1501 0 : dst += 2 * dst_stride;
1502 0 : dst8 += 2 * dst8_stride;
1503 0 : y -= 2;
1504 0 : } while (y);
1505 : }
1506 : }
1507 : else {
1508 : do {
1509 : __m256i r[4];
1510 :
1511 0 : xy_y_convolve_2tap_32_avx2(
1512 : im + 4 * 32, s_256[0] + 0, s_256[1] + 0, coeffs_256, r);
1513 0 : jnt_2d_no_avg_round_store_32_avx2(
1514 : r + 0, r + 2, offset_no_avg_256, dst + 0 * 32);
1515 :
1516 0 : xy_y_convolve_2tap_32_avx2(
1517 : im + 5 * 32, s_256[0] + 2, s_256[1] + 2, coeffs_256, r);
1518 0 : jnt_2d_no_avg_round_store_32_avx2(
1519 : r + 0, r + 2, offset_no_avg_256, dst + 1 * 32);
1520 :
1521 0 : xy_y_convolve_2tap_32_avx2(
1522 : im + 6 * 32, s_256[0] + 4, s_256[1] + 4, coeffs_256, r);
1523 0 : jnt_2d_no_avg_round_store_32_avx2(
1524 : r + 0, r + 2, offset_no_avg_256, dst + 2 * 32);
1525 :
1526 0 : xy_y_convolve_2tap_32_avx2(
1527 : im + 7 * 32, s_256[0] + 6, s_256[1] + 6, coeffs_256, r);
1528 0 : jnt_2d_no_avg_round_store_32_avx2(
1529 : r + 0, r + 2, offset_no_avg_256, dst + 3 * 32);
1530 0 : im += 2 * 128;
1531 :
1532 0 : xy_y_convolve_2tap_32_avx2(
1533 : im + 0 * 32, s_256[1] + 0, s_256[0] + 0, coeffs_256, r);
1534 0 : jnt_2d_no_avg_round_store_32_avx2(
1535 : r + 0,
1536 : r + 2,
1537 : offset_no_avg_256,
1538 0 : dst + dst_stride + 0 * 32);
1539 :
1540 0 : xy_y_convolve_2tap_32_avx2(
1541 : im + 1 * 32, s_256[1] + 2, s_256[0] + 2, coeffs_256, r);
1542 0 : jnt_2d_no_avg_round_store_32_avx2(
1543 : r + 0,
1544 : r + 2,
1545 : offset_no_avg_256,
1546 0 : dst + dst_stride + 1 * 32);
1547 :
1548 0 : xy_y_convolve_2tap_32_avx2(
1549 : im + 2 * 32, s_256[1] + 4, s_256[0] + 4, coeffs_256, r);
1550 0 : jnt_2d_no_avg_round_store_32_avx2(
1551 : r + 0,
1552 : r + 2,
1553 : offset_no_avg_256,
1554 0 : dst + dst_stride + 2 * 32);
1555 :
1556 0 : xy_y_convolve_2tap_32_avx2(
1557 : im + 3 * 32, s_256[1] + 6, s_256[0] + 6, coeffs_256, r);
1558 0 : jnt_2d_no_avg_round_store_32_avx2(
1559 : r + 0,
1560 : r + 2,
1561 : offset_no_avg_256,
1562 0 : dst + dst_stride + 3 * 32);
1563 :
1564 0 : dst += 2 * dst_stride;
1565 0 : y -= 2;
1566 0 : } while (y);
1567 : }
1568 : }
1569 : }
1570 74507400 : }
1571 :
1572 18493400 : static void jnt_convolve_2d_ver_2tap_half_avx2(
1573 : const int16_t *const im_block, const int32_t w, const int32_t h,
1574 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
1575 : const ConvolveParams *const conv_params, uint8_t *dst8,
1576 : const int32_t dst8_stride) {
1577 18493400 : const int32_t dst_stride = conv_params->dst_stride;
1578 18493400 : const int32_t bd = 8;
1579 18493400 : const int32_t round_0 = 3;
1580 18493400 : const int16_t *im = im_block;
1581 18493400 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
1582 18493400 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
1583 18493400 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
1584 18493400 : const int32_t round_offset = 1 << (offset_bits - round_1);
1585 18493400 : const int32_t factor =
1586 18493400 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
1587 18493400 : const int32_t offset_comp_avg =
1588 18493400 : (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
1589 18493400 : (round_offset << DIST_PRECISION_BITS) -
1590 18493400 : (round_offset << (DIST_PRECISION_BITS - 1)) +
1591 18493400 : (1 << (round_bits + DIST_PRECISION_BITS - 1));
1592 18493400 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
1593 18493400 : const __m128i factor_128 = _mm_set1_epi32(factor);
1594 18493400 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1595 18493400 : const __m256i factor_256 = _mm256_set1_epi32(factor);
1596 18493400 : const int32_t offset_avg =
1597 18493400 : (1 << (round_1 - COMPOUND_ROUND1_BITS)) +
1598 18493400 : (1 << (round_bits + round_1 - COMPOUND_ROUND1_BITS + 1)) -
1599 18493400 : (1 << (offset_bits - COMPOUND_ROUND1_BITS + 1)) -
1600 18493400 : (1 << (offset_bits - COMPOUND_ROUND1_BITS));
1601 18493400 : const int32_t offset_no_avg =
1602 18493400 : (1 << (round_1 - COMPOUND_ROUND1_BITS)) +
1603 18493400 : (1 << (offset_bits - COMPOUND_ROUND1_BITS + 1)) +
1604 18493400 : (1 << (offset_bits - COMPOUND_ROUND1_BITS));
1605 18493400 : const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
1606 18493400 : const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
1607 18493400 : const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
1608 18493400 : const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
1609 18493400 : ConvBufType *dst = conv_params->dst;
1610 18493400 : int32_t y = h;
1611 :
1612 : (void)filter_params_y;
1613 : (void)subpel_y_q4;
1614 :
1615 18493400 : if (w == 2) {
1616 : __m128i s_32[2];
1617 :
1618 0 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
1619 :
1620 0 : if (conv_params->do_average) {
1621 0 : if (conv_params->use_jnt_comp_avg) {
1622 : do {
1623 : const __m128i res =
1624 0 : xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
1625 0 : jnt_2d_comp_avg_round_store_half_pel_2x2_sse2(
1626 : res,
1627 : factor_128,
1628 : offset_comp_avg_128,
1629 : dst,
1630 : dst_stride,
1631 : dst8,
1632 : dst8_stride);
1633 0 : im += 2 * 2;
1634 0 : dst += 2 * dst_stride;
1635 0 : dst8 += 2 * dst8_stride;
1636 0 : y -= 2;
1637 0 : } while (y);
1638 : }
1639 : else {
1640 : do {
1641 : const __m128i res =
1642 0 : xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
1643 0 : jnt_2d_avg_round_store_half_pel_2x2_sse2(res,
1644 : offset_avg_128,
1645 : dst,
1646 : dst_stride,
1647 : dst8,
1648 : dst8_stride);
1649 0 : im += 2 * 2;
1650 0 : dst += 2 * dst_stride;
1651 0 : dst8 += 2 * dst8_stride;
1652 0 : y -= 2;
1653 0 : } while (y);
1654 : }
1655 : }
1656 : else {
1657 : do {
1658 : const __m128i res =
1659 0 : xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
1660 0 : jnt_2d_no_avg_round_store_half_pel_2x2_sse2(
1661 : res, offset_no_avg_128, dst, dst_stride);
1662 0 : im += 2 * 2;
1663 0 : dst += 2 * dst_stride;
1664 0 : y -= 2;
1665 0 : } while (y);
1666 : }
1667 : }
1668 18493400 : else if (w == 4) {
1669 : __m128i s_64[2];
1670 :
1671 0 : s_64[0] = _mm_loadl_epi64((__m128i *)im);
1672 :
1673 0 : if (conv_params->do_average) {
1674 0 : if (conv_params->use_jnt_comp_avg) {
1675 : do {
1676 : const __m128i res =
1677 0 : xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
1678 0 : jnt_2d_comp_avg_round_store_half_pel_4x2_sse2(
1679 : res,
1680 : factor_128,
1681 : offset_comp_avg_128,
1682 : dst,
1683 : dst_stride,
1684 : dst8,
1685 : dst8_stride);
1686 0 : im += 2 * 4;
1687 0 : dst += 2 * dst_stride;
1688 0 : dst8 += 2 * dst8_stride;
1689 0 : y -= 2;
1690 0 : } while (y);
1691 : }
1692 : else {
1693 : do {
1694 : const __m128i res =
1695 0 : xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
1696 0 : jnt_2d_avg_round_store_half_pel_4x2_sse2(res,
1697 : offset_avg_128,
1698 : dst,
1699 : dst_stride,
1700 : dst8,
1701 : dst8_stride);
1702 0 : im += 2 * 4;
1703 0 : dst += 2 * dst_stride;
1704 0 : dst8 += 2 * dst8_stride;
1705 0 : y -= 2;
1706 0 : } while (y);
1707 : }
1708 : }
1709 : else {
1710 : do {
1711 : const __m128i res =
1712 0 : xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
1713 0 : jnt_2d_no_avg_round_store_half_pel_4x2_sse2(
1714 : res, offset_no_avg_128, dst, dst_stride);
1715 0 : im += 2 * 4;
1716 0 : dst += 2 * dst_stride;
1717 0 : y -= 2;
1718 0 : } while (y);
1719 : }
1720 : }
1721 18493400 : else if (w == 8) {
1722 : __m128i s_128[2];
1723 :
1724 8011830 : s_128[0] = _mm_load_si128((__m128i *)im);
1725 :
1726 8011830 : if (conv_params->do_average) {
1727 2171040 : if (conv_params->use_jnt_comp_avg) {
1728 : do {
1729 : const __m256i res =
1730 8006750 : xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
1731 8006130 : jnt_2d_comp_avg_round_store_half_pel_8x2_avx2(
1732 : res,
1733 : factor_256,
1734 : offset_comp_avg_256,
1735 : dst,
1736 : dst_stride,
1737 : dst8,
1738 : dst8_stride);
1739 8006410 : im += 2 * 8;
1740 8006410 : dst += 2 * dst_stride;
1741 8006410 : dst8 += 2 * dst8_stride;
1742 8006410 : y -= 2;
1743 8006410 : } while (y);
1744 : }
1745 : else {
1746 : do {
1747 : const __m256i res =
1748 8006490 : xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
1749 8006050 : jnt_2d_avg_round_store_half_pel_8x2_avx2(res,
1750 : offset_avg_256,
1751 : dst,
1752 : dst_stride,
1753 : dst8,
1754 : dst8_stride);
1755 8005970 : im += 2 * 8;
1756 8005970 : dst += 2 * dst_stride;
1757 8005970 : dst8 += 2 * dst8_stride;
1758 8005970 : y -= 2;
1759 8005970 : } while (y);
1760 : }
1761 : }
1762 : else {
1763 : do {
1764 : const __m256i res =
1765 40598300 : xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
1766 40595300 : jnt_2d_no_avg_round_store_half_pel_8x2_avx2(
1767 : res, offset_no_avg_256, dst, dst_stride);
1768 40598100 : im += 2 * 8;
1769 40598100 : dst += 2 * dst_stride;
1770 40598100 : y -= 2;
1771 40598100 : } while (y);
1772 : }
1773 : }
1774 10481600 : else if (w == 16) {
1775 : __m256i s_256[2], r[2];
1776 :
1777 6328260 : s_256[0] = _mm256_load_si256((__m256i *)im);
1778 :
1779 6328260 : if (conv_params->do_average) {
1780 1693650 : if (conv_params->use_jnt_comp_avg) {
1781 : do {
1782 8432670 : xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
1783 8431980 : jnt_2d_comp_avg_round_store_half_pel_16x2_avx2(
1784 : r,
1785 : factor_256,
1786 : offset_comp_avg_256,
1787 : dst,
1788 : dst_stride,
1789 : dst8,
1790 : dst8_stride);
1791 8432270 : im += 2 * 16;
1792 8432270 : dst += 2 * dst_stride;
1793 8432270 : dst8 += 2 * dst8_stride;
1794 8432270 : y -= 2;
1795 8432270 : } while (y);
1796 : }
1797 : else {
1798 : do {
1799 8433100 : xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
1800 8432670 : jnt_2d_avg_round_store_half_pel_16x2_avx2(
1801 : r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1802 8432800 : im += 2 * 16;
1803 8432800 : dst += 2 * dst_stride;
1804 8432800 : dst8 += 2 * dst8_stride;
1805 8432800 : y -= 2;
1806 8432800 : } while (y);
1807 : }
1808 : }
1809 : else {
1810 : do {
1811 43079600 : xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
1812 43086400 : jnt_2d_no_avg_round_store_half_pel_16x2_avx2(
1813 : r, offset_no_avg_256, dst, dst_stride);
1814 43079900 : im += 2 * 16;
1815 43079900 : dst += 2 * dst_stride;
1816 43079900 : y -= 2;
1817 43079900 : } while (y);
1818 : }
1819 : }
1820 4153340 : else if (w == 32) {
1821 : __m256i s_256[2][2];
1822 :
1823 3252100 : s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
1824 3252100 : s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
1825 :
1826 3252100 : if (conv_params->do_average) {
1827 959619 : if (conv_params->use_jnt_comp_avg) {
1828 : do {
1829 : __m256i r[2];
1830 :
1831 5377610 : xy_y_convolve_2tap_half_pel_32_avx2(
1832 : im + 1 * 32, s_256[0], s_256[1], r);
1833 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
1834 : r, factor_256, offset_comp_avg_256, dst, dst8);
1835 :
1836 5377580 : xy_y_convolve_2tap_half_pel_32_avx2(
1837 : im + 2 * 32, s_256[1], s_256[0], r);
1838 5377440 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
1839 : r,
1840 : factor_256,
1841 : offset_comp_avg_256,
1842 5377440 : dst + dst_stride,
1843 : dst8 + dst8_stride);
1844 :
1845 5377450 : im += 2 * 32;
1846 5377450 : dst += 2 * dst_stride;
1847 5377450 : dst8 += 2 * dst8_stride;
1848 5377450 : y -= 2;
1849 5377450 : } while (y);
1850 : }
1851 : else {
1852 : do {
1853 : __m256i r[2];
1854 :
1855 5377620 : xy_y_convolve_2tap_half_pel_32_avx2(
1856 : im + 1 * 32, s_256[0], s_256[1], r);
1857 5377580 : jnt_2d_avg_round_store_half_pel_32_avx2(
1858 : r, offset_avg_256, dst, dst8);
1859 :
1860 5377480 : xy_y_convolve_2tap_half_pel_32_avx2(
1861 : im + 2 * 32, s_256[1], s_256[0], r);
1862 5377520 : jnt_2d_avg_round_store_half_pel_32_avx2(r,
1863 : offset_avg_256,
1864 5377520 : dst + dst_stride,
1865 : dst8 + dst8_stride);
1866 :
1867 5377460 : im += 2 * 32;
1868 5377460 : dst += 2 * dst_stride;
1869 5377460 : dst8 += 2 * dst8_stride;
1870 5377460 : y -= 2;
1871 5377460 : } while (y);
1872 : }
1873 : }
1874 : else {
1875 : do {
1876 : __m256i r[2];
1877 :
1878 26125500 : xy_y_convolve_2tap_half_pel_32_avx2(
1879 : im + 1 * 32, s_256[0], s_256[1], r);
1880 26130400 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
1881 : r, offset_no_avg_256, dst);
1882 :
1883 26127200 : xy_y_convolve_2tap_half_pel_32_avx2(
1884 : im + 2 * 32, s_256[1], s_256[0], r);
1885 26130200 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
1886 26130200 : r, offset_no_avg_256, dst + dst_stride);
1887 :
1888 26125700 : im += 2 * 32;
1889 26125700 : dst += 2 * dst_stride;
1890 26125700 : y -= 2;
1891 26125700 : } while (y);
1892 : }
1893 : }
1894 901232 : else if (w == 64) {
1895 : __m256i s_256[2][4];
1896 :
1897 910630 : s_256[0][0] = _mm256_load_si256((__m256i *)(im + 0 * 16));
1898 910630 : s_256[0][1] = _mm256_load_si256((__m256i *)(im + 1 * 16));
1899 910630 : s_256[0][2] = _mm256_load_si256((__m256i *)(im + 2 * 16));
1900 910630 : s_256[0][3] = _mm256_load_si256((__m256i *)(im + 3 * 16));
1901 :
1902 910630 : if (conv_params->do_average) {
1903 301416 : if (conv_params->use_jnt_comp_avg) {
1904 : do {
1905 : __m256i r[2];
1906 :
1907 2425470 : xy_y_convolve_2tap_half_pel_32_avx2(
1908 : im + 2 * 32, s_256[0] + 0, s_256[1] + 0, r);
1909 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
1910 : r, factor_256, offset_comp_avg_256, dst, dst8);
1911 :
1912 2425400 : xy_y_convolve_2tap_half_pel_32_avx2(
1913 : im + 3 * 32, s_256[0] + 2, s_256[1] + 2, r);
1914 2425350 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
1915 : r,
1916 : factor_256,
1917 : offset_comp_avg_256,
1918 2425350 : dst + 32,
1919 : dst8 + 32);
1920 2425440 : im += 2 * 64;
1921 :
1922 2425440 : xy_y_convolve_2tap_half_pel_32_avx2(
1923 : im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
1924 2425400 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
1925 : r,
1926 : factor_256,
1927 : offset_comp_avg_256,
1928 2425400 : dst + dst_stride,
1929 : dst8 + dst8_stride);
1930 :
1931 2425390 : xy_y_convolve_2tap_half_pel_32_avx2(
1932 : im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
1933 2425370 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
1934 : r,
1935 : factor_256,
1936 : offset_comp_avg_256,
1937 2425370 : dst + dst_stride + 32,
1938 2425370 : dst8 + dst8_stride + 32);
1939 :
1940 2425430 : dst += 2 * dst_stride;
1941 2425430 : dst8 += 2 * dst8_stride;
1942 2425430 : y -= 2;
1943 2425430 : } while (y);
1944 : }
1945 : else {
1946 : do {
1947 : __m256i r[2];
1948 :
1949 2425380 : xy_y_convolve_2tap_half_pel_32_avx2(
1950 : im + 2 * 32, s_256[0] + 0, s_256[1] + 0, r);
1951 2425370 : jnt_2d_avg_round_store_half_pel_32_avx2(
1952 : r, offset_avg_256, dst, dst8);
1953 :
1954 2425350 : xy_y_convolve_2tap_half_pel_32_avx2(
1955 : im + 3 * 32, s_256[0] + 2, s_256[1] + 2, r);
1956 2425370 : jnt_2d_avg_round_store_half_pel_32_avx2(
1957 2425370 : r, offset_avg_256, dst + 32, dst8 + 32);
1958 2425340 : im += 2 * 64;
1959 :
1960 2425340 : xy_y_convolve_2tap_half_pel_32_avx2(
1961 : im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
1962 2425340 : jnt_2d_avg_round_store_half_pel_32_avx2(r,
1963 : offset_avg_256,
1964 2425340 : dst + dst_stride,
1965 : dst8 + dst8_stride);
1966 :
1967 2425340 : xy_y_convolve_2tap_half_pel_32_avx2(
1968 : im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
1969 2425370 : jnt_2d_avg_round_store_half_pel_32_avx2(
1970 : r,
1971 : offset_avg_256,
1972 2425370 : dst + dst_stride + 32,
1973 2425370 : dst8 + dst8_stride + 32);
1974 :
1975 2425340 : dst += 2 * dst_stride;
1976 2425340 : dst8 += 2 * dst8_stride;
1977 2425340 : y -= 2;
1978 2425340 : } while (y);
1979 : }
1980 : }
1981 : else {
1982 : do {
1983 : __m256i r[2];
1984 :
1985 9798030 : xy_y_convolve_2tap_half_pel_32_avx2(
1986 : im + 2 * 32, s_256[0] + 0, s_256[1] + 0, r);
1987 9799170 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
1988 : r, offset_no_avg_256, dst);
1989 :
1990 9798260 : xy_y_convolve_2tap_half_pel_32_avx2(
1991 : im + 3 * 32, s_256[0] + 2, s_256[1] + 2, r);
1992 9799090 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
1993 : r, offset_no_avg_256, dst + 32);
1994 9798200 : im += 2 * 64;
1995 :
1996 9798200 : xy_y_convolve_2tap_half_pel_32_avx2(
1997 : im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
1998 9799230 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
1999 9799230 : r, offset_no_avg_256, dst + dst_stride);
2000 :
2001 9798250 : xy_y_convolve_2tap_half_pel_32_avx2(
2002 : im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
2003 9799080 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
2004 9799080 : r, offset_no_avg_256, dst + dst_stride + 32);
2005 :
2006 9798120 : dst += 2 * dst_stride;
2007 9798120 : y -= 2;
2008 9798120 : } while (y);
2009 : }
2010 : }
2011 : else {
2012 : __m256i s_256[2][8];
2013 :
2014 0 : assert(w == 128);
2015 :
2016 : load_16bit_8rows_avx2(im, 16, s_256[0]);
2017 :
2018 0 : if (conv_params->do_average) {
2019 0 : if (conv_params->use_jnt_comp_avg) {
2020 : do {
2021 : __m256i r[2];
2022 :
2023 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2024 : im + 4 * 32, s_256[0] + 0, s_256[1] + 0, r);
2025 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
2026 : r,
2027 : factor_256,
2028 : offset_comp_avg_256,
2029 : dst + 0 * 32,
2030 : dst8 + 0 * 32);
2031 :
2032 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2033 : im + 5 * 32, s_256[0] + 2, s_256[1] + 2, r);
2034 0 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
2035 : r,
2036 : factor_256,
2037 : offset_comp_avg_256,
2038 0 : dst + 1 * 32,
2039 : dst8 + 1 * 32);
2040 :
2041 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2042 : im + 6 * 32, s_256[0] + 4, s_256[1] + 4, r);
2043 0 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
2044 : r,
2045 : factor_256,
2046 : offset_comp_avg_256,
2047 0 : dst + 2 * 32,
2048 : dst8 + 2 * 32);
2049 :
2050 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2051 : im + 7 * 32, s_256[0] + 6, s_256[1] + 6, r);
2052 0 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
2053 : r,
2054 : factor_256,
2055 : offset_comp_avg_256,
2056 0 : dst + 3 * 32,
2057 : dst8 + 3 * 32);
2058 0 : im += 2 * 128;
2059 :
2060 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2061 : im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
2062 0 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
2063 : r,
2064 : factor_256,
2065 : offset_comp_avg_256,
2066 0 : dst + dst_stride + 0 * 32,
2067 : dst8 + dst8_stride + 0 * 32);
2068 :
2069 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2070 : im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
2071 0 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
2072 : r,
2073 : factor_256,
2074 : offset_comp_avg_256,
2075 0 : dst + dst_stride + 1 * 32,
2076 0 : dst8 + dst8_stride + 1 * 32);
2077 :
2078 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2079 : im + 2 * 32, s_256[1] + 4, s_256[0] + 4, r);
2080 0 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
2081 : r,
2082 : factor_256,
2083 : offset_comp_avg_256,
2084 0 : dst + dst_stride + 2 * 32,
2085 0 : dst8 + dst8_stride + 2 * 32);
2086 :
2087 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2088 : im + 3 * 32, s_256[1] + 6, s_256[0] + 6, r);
2089 0 : jnt_2d_comp_avg_round_store_half_pel_32_avx2(
2090 : r,
2091 : factor_256,
2092 : offset_comp_avg_256,
2093 0 : dst + dst_stride + 3 * 32,
2094 0 : dst8 + dst8_stride + 3 * 32);
2095 :
2096 0 : dst += 2 * dst_stride;
2097 0 : dst8 += 2 * dst8_stride;
2098 0 : y -= 2;
2099 0 : } while (y);
2100 : }
2101 : else {
2102 : do {
2103 : __m256i r[2];
2104 :
2105 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2106 : im + 4 * 32, s_256[0] + 0, s_256[1] + 0, r);
2107 0 : jnt_2d_avg_round_store_half_pel_32_avx2(
2108 : r, offset_avg_256, dst + 0 * 32, dst8 + 0 * 32);
2109 :
2110 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2111 : im + 5 * 32, s_256[0] + 2, s_256[1] + 2, r);
2112 0 : jnt_2d_avg_round_store_half_pel_32_avx2(
2113 0 : r, offset_avg_256, dst + 1 * 32, dst8 + 1 * 32);
2114 :
2115 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2116 : im + 6 * 32, s_256[0] + 4, s_256[1] + 4, r);
2117 0 : jnt_2d_avg_round_store_half_pel_32_avx2(
2118 0 : r, offset_avg_256, dst + 2 * 32, dst8 + 2 * 32);
2119 :
2120 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2121 : im + 7 * 32, s_256[0] + 6, s_256[1] + 6, r);
2122 0 : jnt_2d_avg_round_store_half_pel_32_avx2(
2123 0 : r, offset_avg_256, dst + 3 * 32, dst8 + 3 * 32);
2124 0 : im += 2 * 128;
2125 :
2126 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2127 : im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
2128 0 : jnt_2d_avg_round_store_half_pel_32_avx2(
2129 : r,
2130 : offset_avg_256,
2131 0 : dst + dst_stride + 0 * 32,
2132 : dst8 + dst8_stride + 0 * 32);
2133 :
2134 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2135 : im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
2136 0 : jnt_2d_avg_round_store_half_pel_32_avx2(
2137 : r,
2138 : offset_avg_256,
2139 0 : dst + dst_stride + 1 * 32,
2140 0 : dst8 + dst8_stride + 1 * 32);
2141 :
2142 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2143 : im + 2 * 32, s_256[1] + 4, s_256[0] + 4, r);
2144 0 : jnt_2d_avg_round_store_half_pel_32_avx2(
2145 : r,
2146 : offset_avg_256,
2147 0 : dst + dst_stride + 2 * 32,
2148 0 : dst8 + dst8_stride + 2 * 32);
2149 :
2150 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2151 : im + 3 * 32, s_256[1] + 6, s_256[0] + 6, r);
2152 0 : jnt_2d_avg_round_store_half_pel_32_avx2(
2153 : r,
2154 : offset_avg_256,
2155 0 : dst + dst_stride + 3 * 32,
2156 0 : dst8 + dst8_stride + 3 * 32);
2157 :
2158 0 : dst += 2 * dst_stride;
2159 0 : dst8 += 2 * dst8_stride;
2160 0 : y -= 2;
2161 0 : } while (y);
2162 : }
2163 : }
2164 : else {
2165 : do {
2166 : __m256i r[2];
2167 :
2168 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2169 : im + 4 * 32, s_256[0] + 0, s_256[1] + 0, r);
2170 0 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
2171 : r, offset_no_avg_256, dst + 0 * 32);
2172 :
2173 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2174 : im + 5 * 32, s_256[0] + 2, s_256[1] + 2, r);
2175 0 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
2176 : r, offset_no_avg_256, dst + 1 * 32);
2177 :
2178 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2179 : im + 6 * 32, s_256[0] + 4, s_256[1] + 4, r);
2180 0 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
2181 : r, offset_no_avg_256, dst + 2 * 32);
2182 :
2183 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2184 : im + 7 * 32, s_256[0] + 6, s_256[1] + 6, r);
2185 0 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
2186 : r, offset_no_avg_256, dst + 3 * 32);
2187 0 : im += 2 * 128;
2188 :
2189 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2190 : im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
2191 0 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
2192 0 : r, offset_no_avg_256, dst + dst_stride + 0 * 32);
2193 :
2194 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2195 : im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
2196 0 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
2197 0 : r, offset_no_avg_256, dst + dst_stride + 1 * 32);
2198 :
2199 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2200 : im + 2 * 32, s_256[1] + 4, s_256[0] + 4, r);
2201 0 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
2202 0 : r, offset_no_avg_256, dst + dst_stride + 2 * 32);
2203 :
2204 0 : xy_y_convolve_2tap_half_pel_32_avx2(
2205 : im + 3 * 32, s_256[1] + 6, s_256[0] + 6, r);
2206 0 : jnt_2d_no_avg_round_store_half_pel_32_avx2(
2207 0 : r, offset_no_avg_256, dst + dst_stride + 3 * 32);
2208 :
2209 0 : dst += 2 * dst_stride;
2210 0 : y -= 2;
2211 0 : } while (y);
2212 : }
2213 : }
2214 18501300 : }
2215 :
/*
 * Column (y) pass of the 8-bit joint/compound 2D convolution for the 4-tap
 * filter case -- judging by the function name and the xy_y_convolve_4tap_*
 * helpers it calls; the helpers themselves are defined elsewhere.
 *
 * im_block         16-bit intermediate samples. Rows are read with a stride
 *                  equal to w (im advances by 2 * w elements per two rows).
 * w                Block width. 2, 4 and 8 have dedicated branches; the last
 *                  branch asserts w == 16.
 * h                Row count. Every loop consumes two rows per pass and stops
 *                  only when the counter reaches exactly 0, so h must be a
 *                  positive even number.
 * filter_params_y,
 * subpel_y_q4      Forwarded unchanged to prepare_coeffs_4tap_sse2/avx2().
 * conv_params      Supplies dst, dst_stride, do_average, use_jnt_comp_avg,
 *                  fwd_offset and bck_offset.
 * dst8,
 * dst8_stride      Handed to the store helpers only in the two averaging
 *                  modes; the non-averaging mode passes conv_params->dst only.
 *
 * For each width one of three store helpers is selected:
 *   do_average && use_jnt_comp_avg -> jnt_2d_comp_avg_round_store_*
 *   do_average only                -> jnt_2d_avg_round_store_*
 *   otherwise                      -> jnt_2d_no_avg_round_store_*
 */
2216 1653540 : static void jnt_convolve_2d_ver_4tap_avx2(
2217 : const int16_t *const im_block, const int32_t w, const int32_t h,
2218 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
2219 : const ConvolveParams *const conv_params, uint8_t *dst8,
2220 : const int32_t dst8_stride) {
2221 1653540 : const int32_t dst_stride = conv_params->dst_stride;
// bd and round_0 are hard-coded here rather than taken from conv_params.
2222 1653540 : const int32_t bd = 8;
2223 1653540 : const int32_t round_0 = 3;
2224 1653540 : const int16_t *im = im_block;
2225 1653540 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
2226 1653540 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
2227 1653540 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
2228 1653540 : const int32_t round_offset = 1 << (offset_bits - round_1);
// Each 32-bit lane carries fwd_offset in its low half and bck_offset in its
// high half (assumes fwd_offset fits in 16 bits -- TODO confirm).
2229 1653540 : const int32_t factor =
2230 1653540 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
// Constant added by the comp_avg store helpers; it depends on bck_offset, so
// it has to be rebuilt per call.
2231 1653540 : const int32_t offset_comp_avg =
2232 1653540 : (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
2233 1653540 : (round_offset << DIST_PRECISION_BITS) -
2234 1653540 : (round_offset << (DIST_PRECISION_BITS - 1)) +
2235 1653540 : (1 << (round_bits + DIST_PRECISION_BITS - 1));
// The same scalars are broadcast at both vector widths: the 128-bit copies
// serve the w == 2 path, the 256-bit copies serve every other width.
2236 1653540 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
2237 1653540 : const __m128i factor_128 = _mm_set1_epi32(factor);
2238 1653540 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
2239 1653540 : const __m256i factor_256 = _mm256_set1_epi32(factor);
// Constants for the plain-average and the no-average store helpers.
2240 1653540 : const int32_t offset_avg = (1 << (round_1 - 1)) +
2241 1653540 : (1 << (round_bits + round_1)) -
2242 1653540 : (1 << offset_bits) - (1 << (offset_bits - 1));
2243 1653540 : const int32_t offset_no_avg =
2244 1653540 : (1 << (round_1 - 1)) + (1 << offset_bits) + (1 << (offset_bits - 1));
2245 1653540 : const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
2246 1653540 : const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
2247 1653540 : const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
2248 1653540 : const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
// y counts the rows still to be produced; see the h requirement above.
2249 1653540 : int32_t y = h;
2250 1653540 : ConvBufType *dst = conv_params->dst;
2251 : __m128i coeffs_128[4];
2252 : __m256i coeffs_256[4];
2253 :
// Width 2: the only path that uses 128-bit vectors and coeffs_128.
2254 1653540 : if (w == 2) {
2255 : __m128i s_32[4], ss_128[2];
2256 :
2257 0 : prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
2258 :
// Preload rows 0..2, two int16_t samples (32 bits) per row.
// NOTE(review): these loads read int16_t data through an int32_t pointer,
// which depends on the compiler tolerating the aliasing and the alignment
// -- confirm this is acceptable for the supported toolchains.
2259 0 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
2260 0 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
2261 0 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
2262 :
2263 0 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2264 0 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2265 :
// Only ss_128[0] is primed here; s_32 and ss_128 are handed to the convolve
// helper on every pass (presumably it loads the following rows and fills
// ss_128[1] -- verify against xy_y_convolve_4tap_2x2_sse2).
2266 0 : ss_128[0] = _mm_unpacklo_epi16(src01, src12);
2267 :
2268 0 : if (conv_params->do_average) {
2269 0 : if (conv_params->use_jnt_comp_avg) {
2270 : do {
2271 0 : const __m128i res = xy_y_convolve_4tap_2x2_sse2(
2272 : im, s_32, ss_128, coeffs_128);
2273 0 : jnt_2d_comp_avg_round_store_2x2_sse2(res,
2274 : factor_128,
2275 : offset_comp_avg_128,
2276 : dst,
2277 : dst_stride,
2278 : dst8,
2279 : dst8_stride);
// Step two rows forward in the source and in both destinations.
2280 0 : im += 2 * 2;
2281 0 : dst += 2 * dst_stride;
2282 0 : dst8 += 2 * dst8_stride;
2283 0 : y -= 2;
2284 0 : } while (y);
2285 : }
2286 : else {
2287 : do {
2288 0 : const __m128i res = xy_y_convolve_4tap_2x2_sse2(
2289 : im, s_32, ss_128, coeffs_128);
2290 0 : jnt_2d_avg_round_store_2x2_sse2(res,
2291 : offset_avg_128,
2292 : dst,
2293 : dst_stride,
2294 : dst8,
2295 : dst8_stride);
2296 0 : im += 2 * 2;
2297 0 : dst += 2 * dst_stride;
2298 0 : dst8 += 2 * dst8_stride;
2299 0 : y -= 2;
2300 0 : } while (y);
2301 : }
2302 : }
2303 : else {
// No averaging: dst8 is neither passed to the store helper nor advanced.
2304 : do {
2305 : const __m128i res =
2306 0 : xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
2307 0 : jnt_2d_no_avg_round_store_2x2_sse2(
2308 : res, offset_no_avg_128, dst, dst_stride);
2309 0 : im += 2 * 2;
2310 0 : dst += 2 * dst_stride;
2311 0 : y -= 2;
2312 0 : } while (y);
2313 : }
2314 : }
2315 : else {
// Widths 4, 8 and 16 share the 256-bit coefficient set.
2316 1653540 : prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2317 :
2318 1653560 : if (w == 4) {
2319 : __m128i s_64[4];
2320 : __m256i s_256[2], ss_256[2];
2321 :
// Preload rows 0..2, four int16_t samples (64 bits) per row.
2322 669237 : s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
2323 669237 : s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
2324 669237 : s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
2325 :
2326 : // Load lines a and b. Line a to lower 128, line b to upper
2327 : // 128
2328 669237 : s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
2329 669237 : s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
2330 :
// Interleave the row pairs (0,1) and (1,2) sample by sample; ss_256[1] is
// left for the convolve helper (presumably filled there -- verify).
2331 669237 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
2332 :
2333 669237 : if (conv_params->do_average) {
2334 305068 : if (conv_params->use_jnt_comp_avg) {
2335 : do {
2336 192140 : const __m256i res = xy_y_convolve_4tap_4x2_avx2(
2337 : im, s_64, ss_256, coeffs_256);
2338 192140 : jnt_2d_comp_avg_round_store_4x2_avx2(
2339 : res,
2340 : factor_256,
2341 : offset_comp_avg_256,
2342 : dst,
2343 : dst_stride,
2344 : dst8,
2345 : dst8_stride);
2346 192139 : im += 2 * 4;
2347 192139 : dst += 2 * dst_stride;
2348 192139 : dst8 += 2 * dst8_stride;
2349 192139 : y -= 2;
2350 192139 : } while (y);
2351 : }
2352 : else {
2353 : do {
2354 417999 : const __m256i res = xy_y_convolve_4tap_4x2_avx2(
2355 : im, s_64, ss_256, coeffs_256);
2356 417997 : jnt_2d_avg_round_store_4x2_avx2(res,
2357 : offset_avg_256,
2358 : dst,
2359 : dst_stride,
2360 : dst8,
2361 : dst8_stride);
2362 418000 : im += 2 * 4;
2363 418000 : dst += 2 * dst_stride;
2364 418000 : dst8 += 2 * dst8_stride;
2365 418000 : y -= 2;
2366 418000 : } while (y);
2367 : }
2368 : }
2369 : else {
2370 : do {
2371 728344 : const __m256i res = xy_y_convolve_4tap_4x2_avx2(
2372 : im, s_64, ss_256, coeffs_256);
2373 728342 : jnt_2d_no_avg_round_store_4x2_avx2(
2374 : res, offset_no_avg_256, dst, dst_stride);
2375 728343 : im += 2 * 4;
2376 728343 : dst += 2 * dst_stride;
2377 728343 : y -= 2;
2378 728343 : } while (y);
2379 : }
2380 : }
2381 984328 : else if (w == 8) {
// Only s_256[0] and s_256[1] are used in this branch although four entries
// are declared.
2382 : __m256i s_256[4], r[2];
2383 :
// One 256-bit load spans two 8-sample rows, so s_256[0] holds rows 0-1 and
// s_256[1] holds rows 1-2.
2384 549716 : s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
2385 549716 : s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
2386 :
2387 : __m256i ss_256[4];
2388 :
// Entries 0 and 2 take the low/high interleave of the preloaded rows; entries
// 1 and 3 are not written here (presumably by the helper -- verify).
2389 549716 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
2390 549716 : ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
2391 :
2392 549716 : if (conv_params->do_average) {
2393 245740 : if (conv_params->use_jnt_comp_avg) {
2394 : do {
2395 173488 : xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
2396 173486 : jnt_2d_comp_avg_round_store_8x2_avx2(
2397 : r,
2398 : factor_256,
2399 : offset_comp_avg_256,
2400 : dst,
2401 : dst_stride,
2402 : dst8,
2403 : dst8_stride);
2404 173487 : im += 2 * 8;
2405 173487 : dst += 2 * dst_stride;
2406 173487 : dst8 += 2 * dst8_stride;
2407 173487 : y -= 2;
2408 173487 : } while (y);
2409 : }
2410 : else {
2411 : do {
2412 317991 : xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
2413 317989 : jnt_2d_avg_round_store_8x2_avx2(r,
2414 : offset_avg_256,
2415 : dst,
2416 : dst_stride,
2417 : dst8,
2418 : dst8_stride);
2419 317988 : im += 2 * 8;
2420 317988 : dst += 2 * dst_stride;
2421 317988 : dst8 += 2 * dst8_stride;
2422 317988 : y -= 2;
2423 317988 : } while (y);
2424 : }
2425 : }
2426 : else {
2427 : do {
2428 607955 : xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
2429 607957 : jnt_2d_no_avg_round_store_8x2_avx2(
2430 : r, offset_no_avg_256, dst, dst_stride);
2431 607962 : im += 2 * 8;
2432 607962 : dst += 2 * dst_stride;
2433 607962 : y -= 2;
2434 607962 : } while (y);
2435 : }
2436 : }
2437 : else {
2438 : __m256i s_256[5];
2439 :
// Widths above 16 have no 4-tap branch in this function.
2440 434612 : assert(w == 16);
2441 :
// Preload rows 0..2, one full 16-sample row per 256-bit vector.
2442 434612 : s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
2443 434612 : s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
2444 434612 : s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
2445 :
2446 : __m256i ss_256[4], tt_256[4], r[4];
2447 :
// ss_256 interleaves rows 0 and 1, tt_256 interleaves rows 1 and 2; in both
// arrays only entries 0 (low halves) and 2 (high halves) are primed here.
2448 434612 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
2449 434612 : ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
2450 :
2451 434612 : tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
2452 434612 : tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
2453 :
2454 434612 : if (conv_params->do_average) {
2455 191764 : if (conv_params->use_jnt_comp_avg) {
2456 : do {
2457 151996 : xy_y_convolve_4tap_16x2_avx2(
2458 : im, s_256, ss_256, tt_256, coeffs_256, r);
2459 151996 : jnt_2d_comp_avg_round_store_16x2_avx2(
2460 : r,
2461 : factor_256,
2462 : offset_comp_avg_256,
2463 : dst,
2464 : dst_stride,
2465 : dst8,
2466 : dst8_stride);
2467 151996 : im += 2 * 16;
2468 151996 : dst += 2 * dst_stride;
2469 151996 : dst8 += 2 * dst8_stride;
2470 151996 : y -= 2;
2471 151996 : } while (y);
2472 : }
2473 : else {
2474 : do {
2475 231534 : xy_y_convolve_4tap_16x2_avx2(
2476 : im, s_256, ss_256, tt_256, coeffs_256, r);
2477 231531 : jnt_2d_avg_round_store_16x2_avx2(r,
2478 : offset_avg_256,
2479 : dst,
2480 : dst_stride,
2481 : dst8,
2482 : dst8_stride);
2483 231536 : im += 2 * 16;
2484 231536 : dst += 2 * dst_stride;
2485 231536 : dst8 += 2 * dst8_stride;
2486 231536 : y -= 2;
2487 231536 : } while (y);
2488 : }
2489 : }
2490 : else {
2491 : do {
2492 485725 : xy_y_convolve_4tap_16x2_avx2(
2493 : im, s_256, ss_256, tt_256, coeffs_256, r);
2494 485742 : jnt_2d_no_avg_round_store_16x2_avx2(
2495 : r, offset_no_avg_256, dst, dst_stride);
2496 485746 : im += 2 * 16;
2497 485746 : dst += 2 * dst_stride;
2498 485746 : y -= 2;
2499 485746 : } while (y);
2500 : }
2501 : }
2502 : }
2503 1653590 : }
2504 :
2505 38603000 : static void jnt_convolve_2d_ver_6tap_avx2(
2506 : const int16_t *const im_block, const int32_t w, const int32_t h,
2507 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
2508 : const ConvolveParams *const conv_params, uint8_t *dst8,
2509 : const int32_t dst8_stride) {
2510 38603000 : const int32_t dst_stride = conv_params->dst_stride;
2511 38603000 : const int32_t bd = 8;
2512 38603000 : const int32_t round_0 = 3;
2513 38603000 : const int16_t *im = im_block;
2514 38603000 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
2515 38603000 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
2516 38603000 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
2517 38603000 : const int32_t round_offset = 1 << (offset_bits - round_1);
2518 38603000 : const int32_t factor =
2519 38603000 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
2520 38603000 : const int32_t offset_comp_avg =
2521 38603000 : (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
2522 38603000 : (round_offset << DIST_PRECISION_BITS) -
2523 38603000 : (round_offset << (DIST_PRECISION_BITS - 1)) +
2524 38603000 : (1 << (round_bits + DIST_PRECISION_BITS - 1));
2525 38603000 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
2526 38603000 : const __m128i factor_128 = _mm_set1_epi32(factor);
2527 38603000 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
2528 38603000 : const __m256i factor_256 = _mm256_set1_epi32(factor);
2529 38603000 : const int32_t offset_avg = (1 << (round_1 - 1)) +
2530 38603000 : (1 << (round_bits + round_1)) -
2531 38603000 : (1 << offset_bits) - (1 << (offset_bits - 1));
2532 38603000 : const int32_t offset_no_avg =
2533 38603000 : (1 << (round_1 - 1)) + (1 << offset_bits) + (1 << (offset_bits - 1));
2534 38603000 : const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
2535 38603000 : const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
2536 38603000 : const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
2537 38603000 : const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
2538 38603000 : int32_t y = h;
2539 38603000 : ConvBufType *dst = conv_params->dst;
2540 : __m128i coeffs_128[4];
2541 : __m256i coeffs_256[4];
2542 :
2543 38603000 : if (w == 2) {
2544 : __m128i s_32[6], ss_128[3];
2545 :
2546 0 : prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2547 :
2548 0 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
2549 0 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
2550 0 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
2551 0 : s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
2552 0 : s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
2553 :
2554 0 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2555 0 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2556 0 : const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2557 0 : const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2558 :
2559 0 : ss_128[0] = _mm_unpacklo_epi16(src01, src12);
2560 0 : ss_128[1] = _mm_unpacklo_epi16(src23, src34);
2561 :
2562 0 : y = h;
2563 :
2564 0 : if (conv_params->do_average) {
2565 0 : if (conv_params->use_jnt_comp_avg) {
2566 : do {
2567 0 : const __m128i res = xy_y_convolve_6tap_2x2_sse2(
2568 : im, s_32, ss_128, coeffs_128);
2569 0 : jnt_2d_comp_avg_round_store_2x2_sse2(res,
2570 : factor_128,
2571 : offset_comp_avg_128,
2572 : dst,
2573 : dst_stride,
2574 : dst8,
2575 : dst8_stride);
2576 0 : im += 2 * 2;
2577 0 : dst += 2 * dst_stride;
2578 0 : dst8 += 2 * dst8_stride;
2579 0 : y -= 2;
2580 0 : } while (y);
2581 : }
2582 : else {
2583 : do {
2584 0 : const __m128i res = xy_y_convolve_6tap_2x2_sse2(
2585 : im, s_32, ss_128, coeffs_128);
2586 0 : jnt_2d_avg_round_store_2x2_sse2(res,
2587 : offset_avg_128,
2588 : dst,
2589 : dst_stride,
2590 : dst8,
2591 : dst8_stride);
2592 0 : im += 2 * 2;
2593 0 : dst += 2 * dst_stride;
2594 0 : dst8 += 2 * dst8_stride;
2595 0 : y -= 2;
2596 0 : } while (y);
2597 : }
2598 : }
2599 : else {
2600 : do {
2601 : const __m128i res =
2602 0 : xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
2603 0 : jnt_2d_no_avg_round_store_2x2_sse2(
2604 : res, offset_no_avg_128, dst, dst_stride);
2605 0 : im += 2 * 2;
2606 0 : dst += 2 * dst_stride;
2607 0 : y -= 2;
2608 0 : } while (y);
2609 : }
2610 : }
2611 : else {
2612 38603000 : prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2613 :
2614 38610700 : if (w == 4) {
2615 : __m128i s_64[6];
2616 : __m256i s_256[6], ss_256[3];
2617 :
2618 1051860 : s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
2619 1051860 : s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
2620 1051860 : s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
2621 1051860 : s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
2622 1051860 : s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
2623 :
2624 : // Load lines a and b. Line a to lower 128, line b to upper
2625 : // 128
2626 1051860 : s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
2627 1051860 : s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
2628 1051860 : s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
2629 1051860 : s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
2630 :
2631 1051860 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
2632 1051860 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
2633 :
2634 1051860 : y = h;
2635 :
2636 1051860 : if (conv_params->do_average) {
2637 474975 : if (conv_params->use_jnt_comp_avg) {
2638 : do {
2639 939236 : const __m256i res = xy_y_convolve_6tap_4x2_avx2(
2640 : im, s_64, ss_256, coeffs_256);
2641 939229 : jnt_2d_comp_avg_round_store_4x2_avx2(
2642 : res,
2643 : factor_256,
2644 : offset_comp_avg_256,
2645 : dst,
2646 : dst_stride,
2647 : dst8,
2648 : dst8_stride);
2649 939226 : im += 2 * 4;
2650 939226 : dst += 2 * dst_stride;
2651 939226 : dst8 += 2 * dst8_stride;
2652 939226 : y -= 2;
2653 939226 : } while (y);
2654 : }
2655 : else {
2656 : do {
2657 1642760 : const __m256i res = xy_y_convolve_6tap_4x2_avx2(
2658 : im, s_64, ss_256, coeffs_256);
2659 1642740 : jnt_2d_avg_round_store_4x2_avx2(res,
2660 : offset_avg_256,
2661 : dst,
2662 : dst_stride,
2663 : dst8,
2664 : dst8_stride);
2665 1642740 : im += 2 * 4;
2666 1642740 : dst += 2 * dst_stride;
2667 1642740 : dst8 += 2 * dst8_stride;
2668 1642740 : y -= 2;
2669 1642740 : } while (y);
2670 : }
2671 : }
2672 : else {
2673 : do {
2674 3145700 : const __m256i res = xy_y_convolve_6tap_4x2_avx2(
2675 : im, s_64, ss_256, coeffs_256);
2676 3145750 : jnt_2d_no_avg_round_store_4x2_avx2(
2677 : res, offset_no_avg_256, dst, dst_stride);
2678 3145730 : im += 2 * 4;
2679 3145730 : dst += 2 * dst_stride;
2680 3145730 : y -= 2;
2681 3145730 : } while (y);
2682 : }
2683 : }
2684 37558800 : else if (w == 8) {
2685 : __m256i s_256[6], r[2];
2686 :
2687 17287800 : s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
2688 17287800 : s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
2689 17287800 : s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
2690 17287800 : s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
2691 17287800 : y = h;
2692 :
2693 : __m256i ss_256[6];
2694 :
2695 17287800 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
2696 17287800 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
2697 :
2698 17287800 : ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
2699 17287800 : ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
2700 :
2701 17287800 : if (conv_params->do_average) {
2702 6784090 : if (conv_params->use_jnt_comp_avg) {
2703 : do {
2704 22128500 : xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
2705 22120800 : jnt_2d_comp_avg_round_store_8x2_avx2(
2706 : r,
2707 : factor_256,
2708 : offset_comp_avg_256,
2709 : dst,
2710 : dst_stride,
2711 : dst8,
2712 : dst8_stride);
2713 22125900 : im += 2 * 8;
2714 22125900 : dst += 2 * dst_stride;
2715 22125900 : dst8 += 2 * dst8_stride;
2716 22125900 : y -= 2;
2717 22125900 : } while (y);
2718 : }
2719 : else {
2720 : do {
2721 28722800 : xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
2722 28718600 : jnt_2d_avg_round_store_8x2_avx2(r,
2723 : offset_avg_256,
2724 : dst,
2725 : dst_stride,
2726 : dst8,
2727 : dst8_stride);
2728 28714800 : im += 2 * 8;
2729 28714800 : dst += 2 * dst_stride;
2730 28714800 : dst8 += 2 * dst8_stride;
2731 28714800 : y -= 2;
2732 28714800 : } while (y);
2733 : }
2734 : }
2735 : else {
2736 : do {
2737 78557200 : xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
2738 78541800 : jnt_2d_no_avg_round_store_8x2_avx2(
2739 : r, offset_no_avg_256, dst, dst_stride);
2740 78563400 : im += 2 * 8;
2741 78563400 : dst += 2 * dst_stride;
2742 78563400 : y -= 2;
2743 78563400 : } while (y);
2744 : }
2745 : }
2746 20271000 : else if (w == 16) {
2747 : __m256i s_256[6];
2748 :
2749 11530700 : s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
2750 11530700 : s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
2751 11530700 : s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
2752 11530700 : s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
2753 11530700 : s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
2754 11530700 : y = h;
2755 :
2756 : __m256i ss_256[6], tt_256[6], r[4];
2757 :
2758 11530700 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
2759 11530700 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
2760 11530700 : ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
2761 11530700 : ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
2762 :
2763 11530700 : tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
2764 11530700 : tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
2765 11530700 : tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
2766 11530700 : tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
2767 :
2768 11530700 : if (conv_params->do_average) {
2769 4472190 : if (conv_params->use_jnt_comp_avg) {
2770 : do {
2771 17961100 : xy_y_convolve_6tap_16x2_avx2(
2772 : im, 16, s_256, ss_256, tt_256, coeffs_256, r);
2773 17955300 : jnt_2d_comp_avg_round_store_16x2_avx2(
2774 : r,
2775 : factor_256,
2776 : offset_comp_avg_256,
2777 : dst,
2778 : dst_stride,
2779 : dst8,
2780 : dst8_stride);
2781 17958900 : im += 2 * 16;
2782 17958900 : dst += 2 * dst_stride;
2783 17958900 : dst8 += 2 * dst8_stride;
2784 17958900 : y -= 2;
2785 17958900 : } while (y);
2786 : }
2787 : else {
2788 : do {
2789 26439300 : xy_y_convolve_6tap_16x2_avx2(
2790 : im, 16, s_256, ss_256, tt_256, coeffs_256, r);
2791 26428000 : jnt_2d_avg_round_store_16x2_avx2(r,
2792 : offset_avg_256,
2793 : dst,
2794 : dst_stride,
2795 : dst8,
2796 : dst8_stride);
2797 26435000 : im += 2 * 16;
2798 26435000 : dst += 2 * dst_stride;
2799 26435000 : dst8 += 2 * dst8_stride;
2800 26435000 : y -= 2;
2801 26435000 : } while (y);
2802 : }
2803 : }
2804 : else {
2805 : do {
2806 69692500 : xy_y_convolve_6tap_16x2_avx2(
2807 : im, 16, s_256, ss_256, tt_256, coeffs_256, r);
2808 69654200 : jnt_2d_no_avg_round_store_16x2_avx2(
2809 : r, offset_no_avg_256, dst, dst_stride);
2810 69698200 : im += 2 * 16;
2811 69698200 : dst += 2 * dst_stride;
2812 69698200 : y -= 2;
2813 69698200 : } while (y);
2814 : }
2815 : }
2816 : else {
2817 8740340 : int32_t x = 0;
2818 :
2819 8740340 : assert(!(w % 32));
2820 :
2821 : __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];
2822 :
2823 : do {
2824 10677600 : const int16_t *s = im + x;
2825 10677600 : ConvBufType *d = dst + x;
2826 10677600 : uint8_t *d8 = dst8 + x;
2827 :
2828 : loadu_unpack_16bit_5rows_avx2(
2829 : s, w, s_256[0], ss_256[0], tt_256[0]);
2830 10677600 : loadu_unpack_16bit_5rows_avx2(
2831 : s + 16, w, s_256[1], ss_256[1], tt_256[1]);
2832 :
2833 10677600 : y = h;
2834 :
2835 10677600 : if (conv_params->do_average) {
2836 4202570 : if (conv_params->use_jnt_comp_avg) {
2837 : do {
2838 18810600 : xy_y_convolve_6tap_16x2_avx2(s,
2839 : w,
2840 : s_256[0],
2841 : ss_256[0],
2842 : tt_256[0],
2843 : coeffs_256,
2844 : r0);
2845 18801200 : xy_y_convolve_6tap_16x2_avx2(s + 16,
2846 : w,
2847 : s_256[1],
2848 : ss_256[1],
2849 : tt_256[1],
2850 : coeffs_256,
2851 : r1);
2852 : jnt_2d_comp_avg_round_store_32_avx2(
2853 : r0 + 0,
2854 : r1 + 0,
2855 : factor_256,
2856 : offset_comp_avg_256,
2857 : d,
2858 : d8);
2859 18809100 : jnt_2d_comp_avg_round_store_32_avx2(
2860 : r0 + 2,
2861 : r1 + 2,
2862 : factor_256,
2863 : offset_comp_avg_256,
2864 18809100 : d + dst_stride,
2865 : d8 + dst8_stride);
2866 18808200 : s += 2 * w;
2867 18808200 : d += 2 * dst_stride;
2868 18808200 : d8 += 2 * dst8_stride;
2869 18808200 : y -= 2;
2870 18808200 : } while (y);
2871 : }
2872 : else {
2873 : do {
2874 30279300 : xy_y_convolve_6tap_16x2_avx2(s,
2875 : w,
2876 : s_256[0],
2877 : ss_256[0],
2878 : tt_256[0],
2879 : coeffs_256,
2880 : r0);
2881 30262100 : xy_y_convolve_6tap_16x2_avx2(s + 16,
2882 : w,
2883 : s_256[1],
2884 : ss_256[1],
2885 : tt_256[1],
2886 : coeffs_256,
2887 : r1);
2888 : jnt_2d_avg_round_store_32_avx2(
2889 : r0 + 0, r1 + 0, offset_avg_256, d, d8);
2890 30268300 : jnt_2d_avg_round_store_32_avx2(r0 + 2,
2891 : r1 + 2,
2892 : offset_avg_256,
2893 30268300 : d + dst_stride,
2894 : d8 + dst8_stride);
2895 30268600 : s += 2 * w;
2896 30268600 : d += 2 * dst_stride;
2897 30268600 : d8 += 2 * dst8_stride;
2898 30268600 : y -= 2;
2899 30268600 : } while (y);
2900 : }
2901 : }
2902 : else {
2903 : do {
2904 75155600 : xy_y_convolve_6tap_16x2_avx2(s,
2905 : w,
2906 : s_256[0],
2907 : ss_256[0],
2908 : tt_256[0],
2909 : coeffs_256,
2910 : r0);
2911 75131500 : xy_y_convolve_6tap_16x2_avx2(s + 16,
2912 : w,
2913 : s_256[1],
2914 : ss_256[1],
2915 : tt_256[1],
2916 : coeffs_256,
2917 : r1);
2918 75156800 : jnt_2d_no_avg_round_store_32_avx2(
2919 : r0 + 0, r1 + 0, offset_no_avg_256, d);
2920 75172500 : jnt_2d_no_avg_round_store_32_avx2(
2921 75172500 : r0 + 2, r1 + 2, offset_no_avg_256, d + dst_stride);
2922 75190500 : s += 2 * w;
2923 75190500 : d += 2 * dst_stride;
2924 75190500 : y -= 2;
2925 75190500 : } while (y);
2926 : }
2927 :
2928 10699300 : x += 32;
2929 10699300 : } while (x < w);
2930 : }
2931 : }
2932 38627300 : }
2933 :
2934 16479000 : static void jnt_convolve_2d_ver_8tap_avx2(
2935 : const int16_t *const im_block, const int32_t w, const int32_t h,
2936 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
2937 : const ConvolveParams *const conv_params, uint8_t *dst8,
2938 : const int32_t dst8_stride) {
2939 16479000 : const int32_t dst_stride = conv_params->dst_stride;
2940 16479000 : const int32_t bd = 8;
2941 16479000 : const int32_t round_0 = 3;
2942 16479000 : const int16_t *im = im_block;
2943 16479000 : const int32_t round_1 = COMPOUND_ROUND1_BITS;
2944 16479000 : const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
2945 16479000 : const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
2946 16479000 : const int32_t round_offset = 1 << (offset_bits - round_1);
2947 16479000 : const int32_t factor =
2948 16479000 : conv_params->fwd_offset | (conv_params->bck_offset << 16);
2949 16479000 : const int32_t offset_comp_avg =
2950 16479000 : (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
2951 16479000 : (round_offset << DIST_PRECISION_BITS) -
2952 16479000 : (round_offset << (DIST_PRECISION_BITS - 1)) +
2953 16479000 : (1 << (round_bits + DIST_PRECISION_BITS - 1));
2954 16479000 : const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
2955 16479000 : const __m128i factor_128 = _mm_set1_epi32(factor);
2956 16479000 : const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
2957 16479000 : const __m256i factor_256 = _mm256_set1_epi32(factor);
2958 16479000 : const int32_t offset_avg = (1 << (round_1 - 1)) +
2959 16479000 : (1 << (round_bits + round_1)) -
2960 16479000 : (1 << offset_bits) - (1 << (offset_bits - 1));
2961 16479000 : const int32_t offset_no_avg =
2962 16479000 : (1 << (round_1 - 1)) + (1 << offset_bits) + (1 << (offset_bits - 1));
2963 16479000 : const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
2964 16479000 : const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
2965 16479000 : const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
2966 16479000 : const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
2967 16479000 : int32_t y = h;
2968 16479000 : ConvBufType *dst = conv_params->dst;
2969 : __m128i coeffs_128[4];
2970 : __m256i coeffs_256[4];
2971 :
2972 16479000 : if (w == 2) {
2973 : __m128i s_32[8], ss_128[4];
2974 :
2975 0 : prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
2976 :
2977 0 : s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
2978 0 : s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
2979 0 : s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
2980 0 : s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
2981 0 : s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
2982 0 : s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
2983 0 : s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));
2984 :
2985 0 : const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2986 0 : const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2987 0 : const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2988 0 : const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2989 0 : const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2990 0 : const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2991 :
2992 0 : ss_128[0] = _mm_unpacklo_epi16(src01, src12);
2993 0 : ss_128[1] = _mm_unpacklo_epi16(src23, src34);
2994 0 : ss_128[2] = _mm_unpacklo_epi16(src45, src56);
2995 :
2996 0 : y = h;
2997 :
2998 0 : if (conv_params->do_average) {
2999 0 : if (conv_params->use_jnt_comp_avg) {
3000 : do {
3001 0 : const __m128i res = xy_y_convolve_8tap_2x2_sse2(
3002 : im, s_32, ss_128, coeffs_128);
3003 0 : jnt_2d_comp_avg_round_store_2x2_sse2(res,
3004 : factor_128,
3005 : offset_comp_avg_128,
3006 : dst,
3007 : dst_stride,
3008 : dst8,
3009 : dst8_stride);
3010 0 : im += 2 * 2;
3011 0 : dst += 2 * dst_stride;
3012 0 : dst8 += 2 * dst8_stride;
3013 0 : y -= 2;
3014 0 : } while (y);
3015 : }
3016 : else {
3017 : do {
3018 0 : const __m128i res = xy_y_convolve_8tap_2x2_sse2(
3019 : im, s_32, ss_128, coeffs_128);
3020 0 : jnt_2d_avg_round_store_2x2_sse2(res,
3021 : offset_avg_128,
3022 : dst,
3023 : dst_stride,
3024 : dst8,
3025 : dst8_stride);
3026 0 : im += 2 * 2;
3027 0 : dst += 2 * dst_stride;
3028 0 : dst8 += 2 * dst8_stride;
3029 0 : y -= 2;
3030 0 : } while (y);
3031 : }
3032 : }
3033 : else {
3034 : do {
3035 : const __m128i res =
3036 0 : xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
3037 0 : jnt_2d_no_avg_round_store_2x2_sse2(
3038 : res, offset_no_avg_128, dst, dst_stride);
3039 0 : im += 2 * 2;
3040 0 : dst += 2 * dst_stride;
3041 0 : y -= 2;
3042 0 : } while (y);
3043 : }
3044 : }
3045 : else {
3046 16479000 : prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
3047 :
3048 16479800 : if (w == 4) {
3049 : __m128i s_64[8];
3050 : __m256i s_256[8], ss_256[4];
3051 :
3052 121482 : s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
3053 121482 : s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
3054 121482 : s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
3055 121482 : s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
3056 121482 : s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
3057 121482 : s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
3058 121482 : s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));
3059 :
3060 : // Load lines a and b. Line a to lower 128, line b to upper
3061 : // 128
3062 121482 : s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
3063 121482 : s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
3064 121482 : s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
3065 121482 : s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
3066 121482 : s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
3067 121482 : s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);
3068 :
3069 121482 : ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
3070 121482 : ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
3071 121482 : ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
3072 :
3073 121482 : y = h;
3074 :
3075 121482 : if (conv_params->do_average) {
3076 48774 : if (conv_params->use_jnt_comp_avg) {
3077 : do {
3078 145040 : const __m256i res = xy_y_convolve_8tap_4x2_avx2(
3079 : im, s_64, ss_256, coeffs_256);
3080 145040 : jnt_2d_comp_avg_round_store_4x2_avx2(
3081 : res,
3082 : factor_256,
3083 : offset_comp_avg_256,
3084 : dst,
3085 : dst_stride,
3086 : dst8,
3087 : dst8_stride);
3088 145040 : im += 2 * 4;
3089 145040 : dst += 2 * dst_stride;
3090 145040 : dst8 += 2 * dst8_stride;
3091 145040 : y -= 2;
3092 145040 : } while (y);
3093 : }
3094 : else {
3095 : do {
3096 138568 : const __m256i res = xy_y_convolve_8tap_4x2_avx2(
3097 : im, s_64, ss_256, coeffs_256);
3098 138568 : jnt_2d_avg_round_store_4x2_avx2(res,
3099 : offset_avg_256,
3100 : dst,
3101 : dst_stride,
3102 : dst8,
3103 : dst8_stride);
3104 138568 : im += 2 * 4;
3105 138568 : dst += 2 * dst_stride;
3106 138568 : dst8 += 2 * dst8_stride;
3107 138568 : y -= 2;
3108 138568 : } while (y);
3109 : }
3110 : }
3111 : else {
3112 : do {
3113 424568 : const __m256i res = xy_y_convolve_8tap_4x2_avx2(
3114 : im, s_64, ss_256, coeffs_256);
3115 424567 : jnt_2d_no_avg_round_store_4x2_avx2(
3116 : res, offset_no_avg_256, dst, dst_stride);
3117 424568 : im += 2 * 4;
3118 424568 : dst += 2 * dst_stride;
3119 424568 : y -= 2;
3120 424568 : } while (y);
3121 : }
3122 : }
3123 16358300 : else if (w == 8) {
3124 : __m256i s_256[8], r[2];
3125 :
3126 7213610 : s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
3127 7213610 : s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
3128 7213610 : s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
3129 7213610 : s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
3130 7213610 : s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
3131 7213610 : s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
3132 7213610 : y = h;
3133 :
3134 : __m256i ss_256[8];
3135 :
3136 7213610 : convolve_8tap_unapck_avx2(s_256, ss_256);
3137 :
3138 7213720 : if (conv_params->do_average) {
3139 2899340 : if (conv_params->use_jnt_comp_avg) {
3140 : do {
3141 9567550 : xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
3142 9566860 : jnt_2d_comp_avg_round_store_8x2_avx2(
3143 : r,
3144 : factor_256,
3145 : offset_comp_avg_256,
3146 : dst,
3147 : dst_stride,
3148 : dst8,
3149 : dst8_stride);
3150 9567050 : im += 2 * 8;
3151 9567050 : dst += 2 * dst_stride;
3152 9567050 : dst8 += 2 * dst8_stride;
3153 9567050 : y -= 2;
3154 9567050 : } while (y);
3155 : }
3156 : else {
3157 : do {
3158 12614700 : xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
3159 12614800 : jnt_2d_avg_round_store_8x2_avx2(r,
3160 : offset_avg_256,
3161 : dst,
3162 : dst_stride,
3163 : dst8,
3164 : dst8_stride);
3165 12613400 : im += 2 * 8;
3166 12613400 : dst += 2 * dst_stride;
3167 12613400 : dst8 += 2 * dst8_stride;
3168 12613400 : y -= 2;
3169 12613400 : } while (y);
3170 : }
3171 : }
3172 : else {
3173 : do {
3174 32917700 : xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
3175 32920900 : jnt_2d_no_avg_round_store_8x2_avx2(
3176 : r, offset_no_avg_256, dst, dst_stride);
3177 32919100 : im += 2 * 8;
3178 32919100 : dst += 2 * dst_stride;
3179 32919100 : y -= 2;
3180 32919100 : } while (y);
3181 : }
3182 : }
3183 9144730 : else if (w == 16) {
3184 : __m256i s_256[8], r[4];
3185 :
3186 4986720 : load_16bit_7rows_avx2(im, 16, s_256);
3187 4986800 : y = h;
3188 :
3189 : __m256i ss_256[8], tt_256[8];
3190 :
3191 4986800 : convolve_8tap_unapck_avx2(s_256, ss_256);
3192 4986780 : convolve_8tap_unapck_avx2(s_256 + 1, tt_256);
3193 :
3194 4990510 : if (conv_params->do_average) {
3195 1974960 : if (conv_params->use_jnt_comp_avg) {
3196 : do {
3197 : xy_y_convolve_8tap_16x2_avx2(
3198 : im, 16, coeffs_256, s_256, ss_256, tt_256, r);
3199 8170770 : jnt_2d_comp_avg_round_store_16x2_avx2(
3200 : r,
3201 : factor_256,
3202 : offset_comp_avg_256,
3203 : dst,
3204 : dst_stride,
3205 : dst8,
3206 : dst8_stride);
3207 8170660 : im += 2 * 16;
3208 8170660 : dst += 2 * dst_stride;
3209 8170660 : dst8 += 2 * dst8_stride;
3210 8170660 : y -= 2;
3211 8170660 : } while (y);
3212 : }
3213 : else {
3214 : do {
3215 : xy_y_convolve_8tap_16x2_avx2(
3216 : im, 16, coeffs_256, s_256, ss_256, tt_256, r);
3217 12107200 : jnt_2d_avg_round_store_16x2_avx2(r,
3218 : offset_avg_256,
3219 : dst,
3220 : dst_stride,
3221 : dst8,
3222 : dst8_stride);
3223 12105900 : im += 2 * 16;
3224 12105900 : dst += 2 * dst_stride;
3225 12105900 : dst8 += 2 * dst8_stride;
3226 12105900 : y -= 2;
3227 12105900 : } while (y);
3228 : }
3229 : }
3230 : else {
3231 : do {
3232 : xy_y_convolve_8tap_16x2_avx2(
3233 : im, 16, coeffs_256, s_256, ss_256, tt_256, r);
3234 30792800 : jnt_2d_no_avg_round_store_16x2_avx2(
3235 : r, offset_no_avg_256, dst, dst_stride);
3236 30789300 : im += 2 * 16;
3237 30789300 : dst += 2 * dst_stride;
3238 30789300 : y -= 2;
3239 30789300 : } while (y);
3240 : }
3241 : }
3242 : else {
3243 4158010 : int32_t x = 0;
3244 : __m256i s_256[2][8], r0[4], r1[4];
3245 :
3246 4158010 : assert(!(w % 32));
3247 :
3248 : __m256i ss_256[2][8], tt_256[2][8];
3249 :
3250 : do {
3251 5120930 : const int16_t *s = im + x;
3252 5120930 : ConvBufType *d = dst + x;
3253 5120930 : uint8_t *d8 = dst8 + x;
3254 :
3255 5120930 : load_16bit_7rows_avx2(s, w, s_256[0]);
3256 5134530 : convolve_8tap_unapck_avx2(s_256[0], ss_256[0]);
3257 5134520 : convolve_8tap_unapck_avx2(s_256[0] + 1, tt_256[0]);
3258 :
3259 5134430 : load_16bit_7rows_avx2(s + 16, w, s_256[1]);
3260 5134570 : convolve_8tap_unapck_avx2(s_256[1], ss_256[1]);
3261 5134550 : convolve_8tap_unapck_avx2(s_256[1] + 1, tt_256[1]);
3262 :
3263 5134530 : y = h;
3264 :
3265 5134530 : if (conv_params->do_average) {
3266 2058960 : if (conv_params->use_jnt_comp_avg) {
3267 : do {
3268 : xy_y_convolve_8tap_16x2_avx2(s,
3269 : w,
3270 : coeffs_256,
3271 : s_256[0],
3272 : ss_256[0],
3273 : tt_256[0],
3274 : r0);
3275 9470950 : xy_y_convolve_8tap_16x2_avx2(s + 16,
3276 : w,
3277 : coeffs_256,
3278 : s_256[1],
3279 : ss_256[1],
3280 : tt_256[1],
3281 : r1);
3282 : jnt_2d_comp_avg_round_store_32_avx2(
3283 : r0 + 0,
3284 : r1 + 0,
3285 : factor_256,
3286 : offset_comp_avg_256,
3287 : d,
3288 : d8);
3289 9470940 : jnt_2d_comp_avg_round_store_32_avx2(
3290 : r0 + 2,
3291 : r1 + 2,
3292 : factor_256,
3293 : offset_comp_avg_256,
3294 9470940 : d + dst_stride,
3295 : d8 + dst8_stride);
3296 9470640 : s += 2 * w;
3297 9470640 : d += 2 * dst_stride;
3298 9470640 : d8 += 2 * dst8_stride;
3299 9470640 : y -= 2;
3300 9470640 : } while (y);
3301 : }
3302 : else {
3303 : do {
3304 : xy_y_convolve_8tap_16x2_avx2(s,
3305 : w,
3306 : coeffs_256,
3307 : s_256[0],
3308 : ss_256[0],
3309 : tt_256[0],
3310 : r0);
3311 15503900 : xy_y_convolve_8tap_16x2_avx2(s + 16,
3312 : w,
3313 : coeffs_256,
3314 : s_256[1],
3315 : ss_256[1],
3316 : tt_256[1],
3317 : r1);
3318 : jnt_2d_avg_round_store_32_avx2(
3319 : r0 + 0, r1 + 0, offset_avg_256, d, d8);
3320 15500300 : jnt_2d_avg_round_store_32_avx2(r0 + 2,
3321 : r1 + 2,
3322 : offset_avg_256,
3323 15500300 : d + dst_stride,
3324 : d8 + dst8_stride);
3325 15500200 : s += 2 * w;
3326 15500200 : d += 2 * dst_stride;
3327 15500200 : d8 += 2 * dst8_stride;
3328 15500200 : y -= 2;
3329 15500200 : } while (y);
3330 : }
3331 : }
3332 : else {
3333 : do {
3334 : xy_y_convolve_8tap_16x2_avx2(s,
3335 : w,
3336 : coeffs_256,
3337 : s_256[0],
3338 : ss_256[0],
3339 : tt_256[0],
3340 : r0);
3341 37297500 : xy_y_convolve_8tap_16x2_avx2(s + 16,
3342 : w,
3343 : coeffs_256,
3344 : s_256[1],
3345 : ss_256[1],
3346 : tt_256[1],
3347 : r1);
3348 37308100 : jnt_2d_no_avg_round_store_32_avx2(
3349 : r0 + 0, r1 + 0, offset_no_avg_256, d);
3350 37285300 : jnt_2d_no_avg_round_store_32_avx2(
3351 37285300 : r0 + 2, r1 + 2, offset_no_avg_256, d + dst_stride);
3352 37285300 : s += 2 * w;
3353 37285300 : d += 2 * dst_stride;
3354 37285300 : y -= 2;
3355 37285300 : } while (y);
3356 : }
3357 :
3358 5125030 : x += 32;
3359 5125030 : } while (x < w);
3360 : }
3361 : }
3362 16483600 : }
3363 :
3364 : typedef void(*jnt_convolve_2d_hor_tap_func)(
3365 : const uint8_t *src, const int32_t src_stride, const int32_t w,
3366 : const int32_t h, const InterpFilterParams *filter_params_x,
3367 : const int32_t subpel_x_q4, int16_t *const im_block);
3368 :
3369 : typedef void(*jnt_convolve_2d_ver_tap_func)(
3370 : const int16_t *const im_block, const int32_t w, const int32_t h,
3371 : const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
3372 : const ConvolveParams *const conv_params, uint8_t *dst8,
3373 : const int32_t dst8_stride);
3374 :
3375 148838000 : void eb_av1_jnt_convolve_2d_avx2(const uint8_t *src, int32_t src_stride,
3376 : uint8_t *dst8, int32_t dst8_stride, int32_t w,
3377 : int32_t h, InterpFilterParams *filter_params_x,
3378 : InterpFilterParams *filter_params_y,
3379 : const int32_t subpel_x_q4,
3380 : const int32_t subpel_y_q4,
3381 : ConvolveParams *conv_params) {
3382 : static const jnt_convolve_2d_hor_tap_func
3383 : jnt_convolve_2d_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
3384 : NULL,
3385 : NULL,
3386 : jnt_convolve_2d_hor_2tap_avx2,
3387 : NULL,
3388 : jnt_convolve_2d_hor_4tap_avx2,
3389 : NULL,
3390 : jnt_convolve_2d_hor_6tap_avx2,
3391 : NULL,
3392 : jnt_convolve_2d_hor_8tap_avx2 };
3393 : static const jnt_convolve_2d_ver_tap_func
3394 : jnt_convolve_2d_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
3395 : NULL,
3396 : jnt_convolve_2d_ver_2tap_half_avx2,
3397 : jnt_convolve_2d_ver_2tap_avx2,
3398 : jnt_convolve_2d_ver_4tap_avx2,
3399 : jnt_convolve_2d_ver_4tap_avx2,
3400 : jnt_convolve_2d_ver_6tap_avx2,
3401 : jnt_convolve_2d_ver_6tap_avx2,
3402 : jnt_convolve_2d_ver_8tap_avx2,
3403 : jnt_convolve_2d_ver_8tap_avx2 };
3404 148838000 : const int32_t tap_x = get_convolve_tap(filter_params_x->filter_ptr);
3405 148787000 : const int32_t tap_y = get_convolve_tap(filter_params_y->filter_ptr);
3406 148766000 : const uint8_t *src_ptr =
3407 148766000 : src + ((MAX_FILTER_TAP - tap_y) / 2 - 3) * src_stride;
3408 : // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
3409 : // permutation.
3410 : DECLARE_ALIGNED(
3411 : 32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
3412 :
3413 148766000 : assert(conv_params->round_0 == 3);
3414 148766000 : assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);
3415 :
3416 : // horizontal filter
3417 :
3418 : // Have to calculate 1 more row for small widths, since 2 lines are
3419 : // calculated in each loop for them.
3420 148766000 : const int32_t hh = h + tap_y - (w >= 32);
3421 :
3422 148766000 : jnt_convolve_2d_hor_tap_func_table[tap_x](
3423 : src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);
3424 :
3425 : // vertical filter
3426 149206000 : jnt_convolve_2d_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
3427 : im_block,
3428 : w,
3429 : h,
3430 : filter_params_y,
3431 : subpel_y_q4,
3432 : conv_params,
3433 : dst8,
3434 : dst8_stride);
3435 149196000 : }
|