Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h> // AVX2
13 : #include "synonyms.h"
14 : #include "synonyms_avx2.h"
15 : #include "aom_dsp_rtcd.h"
16 : #include "EbPictureOperators_Inline_AVX2.h"
17 : #include "EbRestoration.h"
18 : #include "EbRestorationPick.h"
19 : #include "EbUtility.h"
20 : #include "pickrst_avx2.h"
21 : #include "transpose_sse2.h"
22 : #include "transpose_avx2.h"
23 :
24 300 : static INLINE uint8_t find_average_avx2(const uint8_t *src, int32_t h_start,
25 : int32_t h_end, int32_t v_start,
26 : int32_t v_end, int32_t stride) {
27 300 : const int32_t width = h_end - h_start;
28 300 : const int32_t height = v_end - v_start;
29 300 : const uint8_t *srcT = src + v_start * stride + h_start;
30 300 : const int32_t leftover = width & 31;
31 300 : int32_t i = height;
32 300 : __m256i ss = _mm256_setzero_si256();
33 :
34 300 : if (!leftover) {
35 : do {
36 86400 : int32_t j = 0;
37 : do {
38 1296000 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
39 1296000 : const __m256i sad = _mm256_sad_epu8(s, _mm256_setzero_si256());
40 648000 : ss = _mm256_add_epi32(ss, sad);
41 648000 : j += 32;
42 648000 : } while (j < width);
43 :
44 86400 : srcT += stride;
45 86400 : } while (--i);
46 : } else {
47 0 : const int32_t w32 = width - leftover;
48 : __m128i maskL, maskH;
49 :
50 0 : if (leftover >= 16) {
51 0 : maskL = _mm_set1_epi8(-1);
52 0 : maskH = _mm_load_si128((__m128i *)(mask_8bit[leftover - 16]));
53 : } else {
54 0 : maskL = _mm_load_si128((__m128i *)(mask_8bit[leftover]));
55 0 : maskH = _mm_setzero_si128();
56 : }
57 : const __m256i mask =
58 0 : _mm256_inserti128_si256(_mm256_castsi128_si256(maskL), maskH, 1);
59 :
60 : do {
61 0 : int32_t j = 0;
62 : do {
63 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
64 0 : const __m256i sad = _mm256_sad_epu8(s, _mm256_setzero_si256());
65 0 : ss = _mm256_add_epi32(ss, sad);
66 0 : j += 32;
67 0 : } while (j < w32);
68 :
69 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
70 0 : const __m256i sT = _mm256_and_si256(s, mask);
71 0 : const __m256i sad = _mm256_sad_epu8(sT, _mm256_setzero_si256());
72 0 : ss = _mm256_add_epi32(ss, sad);
73 0 : srcT += stride;
74 0 : } while (--i);
75 : }
76 :
77 300 : const uint32_t sum = Hadd32_AVX2_INTRIN(ss);
78 300 : const uint32_t avg = sum / (width * height);
79 300 : return (uint8_t)avg;
80 : }
81 :
82 0 : static INLINE void add_u16_to_u32_avx2(const __m256i src, __m256i *const sum) {
83 0 : const __m256i s0 = _mm256_unpacklo_epi16(src, _mm256_setzero_si256());
84 0 : const __m256i s1 = _mm256_unpackhi_epi16(src, _mm256_setzero_si256());
85 0 : *sum = _mm256_add_epi32(*sum, s0);
86 0 : *sum = _mm256_add_epi32(*sum, s1);
87 0 : }
88 :
89 0 : static INLINE void add_32_to_64_avx2(const __m256i src, __m256i *const sum) {
90 0 : const __m256i s0 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src, 0));
91 0 : const __m256i s1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src, 1));
92 0 : *sum = _mm256_add_epi64(*sum, s0);
93 0 : *sum = _mm256_add_epi64(*sum, s1);
94 0 : }
95 :
96 0 : static INLINE uint16_t find_average_highbd_avx2(const uint16_t *src,
97 : int32_t h_start, int32_t h_end,
98 : int32_t v_start, int32_t v_end,
99 : int32_t stride,
100 : AomBitDepth bit_depth) {
101 0 : const int32_t width = h_end - h_start;
102 0 : const int32_t height = v_end - v_start;
103 0 : const uint16_t *srcT = src + v_start * stride + h_start;
104 0 : const int32_t leftover = width & 15;
105 0 : int32_t i = height;
106 0 : __m256i sss = _mm256_setzero_si256();
107 :
108 0 : if (bit_depth <= 10 || width <= 256) {
109 0 : if (!leftover) {
110 : do {
111 0 : __m256i ss = _mm256_setzero_si256();
112 :
113 0 : int32_t j = 0;
114 : do {
115 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
116 0 : ss = _mm256_add_epi16(ss, s);
117 0 : j += 16;
118 0 : } while (j < width);
119 :
120 0 : add_u16_to_u32_avx2(ss, &sss);
121 :
122 0 : srcT += stride;
123 0 : } while (--i);
124 : } else {
125 0 : const int32_t w16 = width - leftover;
126 : const __m256i mask =
127 0 : _mm256_load_si256((__m256i *)(mask_16bit[leftover]));
128 :
129 : do {
130 0 : __m256i ss = _mm256_setzero_si256();
131 :
132 0 : int32_t j = 0;
133 : do {
134 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
135 0 : ss = _mm256_add_epi16(ss, s);
136 0 : j += 16;
137 0 : } while (j < w16);
138 :
139 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
140 0 : const __m256i sT = _mm256_and_si256(s, mask);
141 0 : ss = _mm256_add_epi16(ss, sT);
142 :
143 0 : add_u16_to_u32_avx2(ss, &sss);
144 :
145 0 : srcT += stride;
146 0 : } while (--i);
147 : }
148 : } else {
149 0 : if (!leftover) {
150 : do {
151 0 : __m256i ss = _mm256_setzero_si256();
152 :
153 0 : int32_t j = 0;
154 : do {
155 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
156 0 : ss = _mm256_add_epi16(ss, s);
157 0 : j += 16;
158 0 : } while (j < 256);
159 :
160 0 : add_u16_to_u32_avx2(ss, &sss);
161 0 : ss = _mm256_setzero_si256();
162 :
163 : do {
164 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
165 0 : ss = _mm256_add_epi16(ss, s);
166 0 : j += 16;
167 0 : } while (j < width);
168 :
169 0 : add_u16_to_u32_avx2(ss, &sss);
170 :
171 0 : srcT += stride;
172 0 : } while (--i);
173 : } else {
174 0 : const int32_t w16 = width - leftover;
175 : const __m256i mask =
176 0 : _mm256_load_si256((__m256i *)(mask_16bit[leftover]));
177 :
178 : do {
179 0 : __m256i ss = _mm256_setzero_si256();
180 :
181 0 : int32_t j = 0;
182 : do {
183 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
184 0 : ss = _mm256_add_epi16(ss, s);
185 0 : j += 16;
186 0 : } while (j < 256);
187 :
188 0 : add_u16_to_u32_avx2(ss, &sss);
189 0 : ss = _mm256_setzero_si256();
190 :
191 0 : while (j < w16) {
192 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
193 0 : ss = _mm256_add_epi16(ss, s);
194 0 : j += 16;
195 : }
196 :
197 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(srcT + j));
198 0 : const __m256i sT = _mm256_and_si256(s, mask);
199 0 : ss = _mm256_add_epi16(ss, sT);
200 :
201 0 : add_u16_to_u32_avx2(ss, &sss);
202 :
203 0 : srcT += stride;
204 0 : } while (--i);
205 : }
206 : }
207 :
208 0 : const uint32_t sum = Hadd32_AVX2_INTRIN(sss);
209 0 : const uint32_t avg = sum / (width * height);
210 0 : return (uint16_t)avg;
211 : }
212 :
213 : // Note: when n = (width % 16) is not 0, it writes (16 - n) more data than
214 : // required.
215 600 : static INLINE void sub_avg_block_avx2(const uint8_t *src,
216 : const int32_t src_stride,
217 : const uint8_t avg, const int32_t width,
218 : const int32_t height, int16_t *dst,
219 : const int32_t dst_stride) {
220 600 : const __m256i a = _mm256_set1_epi16(avg);
221 :
222 600 : int32_t i = height;
223 : do {
224 174360 : int32_t j = 0;
225 : do {
226 5407920 : const __m128i s = _mm_loadu_si128((__m128i *)(src + j));
227 2703960 : const __m256i ss = _mm256_cvtepu8_epi16(s);
228 2703960 : const __m256i d = _mm256_sub_epi16(ss, a);
229 2703960 : _mm256_store_si256((__m256i *)(dst + j), d);
230 2703960 : j += 16;
231 2703960 : } while (j < width);
232 :
233 174360 : src += src_stride;
234 174360 : dst += dst_stride;
235 174360 : } while (--i);
236 600 : }
237 :
238 : // Note: when n = (width % 16) is not 0, it writes (16 - n) more data than
239 : // required.
240 0 : static INLINE void sub_avg_block_highbd_avx2(const uint16_t *src,
241 : const int32_t src_stride,
242 : const uint16_t avg,
243 : const int32_t width,
244 : const int32_t height, int16_t *dst,
245 : const int32_t dst_stride) {
246 0 : const __m256i a = _mm256_set1_epi16(avg);
247 :
248 0 : int32_t i = height;
249 : do {
250 0 : int32_t j = 0;
251 : do {
252 0 : const __m256i s = _mm256_loadu_si256((__m256i *)(src + j));
253 0 : const __m256i d = _mm256_sub_epi16(s, a);
254 0 : _mm256_store_si256((__m256i *)(dst + j), d);
255 0 : j += 16;
256 0 : } while (j < width);
257 :
258 0 : src += src_stride;
259 0 : dst += dst_stride;
260 0 : } while (--i);
261 0 : }
262 :
263 0 : static INLINE void stats_top_win3_avx2(const __m256i src, const __m256i dgd,
264 : const int16_t *const d,
265 : const int32_t d_stride,
266 : __m256i sumM[WIENER_WIN_3TAP],
267 : __m256i sumH[WIENER_WIN_3TAP]) {
268 : __m256i dgds[WIENER_WIN_3TAP];
269 :
270 0 : dgds[0] = _mm256_loadu_si256((__m256i *)(d + 0 * d_stride));
271 0 : dgds[1] = _mm256_loadu_si256((__m256i *)(d + 1 * d_stride));
272 0 : dgds[2] = _mm256_loadu_si256((__m256i *)(d + 2 * d_stride));
273 :
274 0 : madd_avx2(src, dgds[0], &sumM[0]);
275 0 : madd_avx2(src, dgds[1], &sumM[1]);
276 0 : madd_avx2(src, dgds[2], &sumM[2]);
277 :
278 0 : madd_avx2(dgd, dgds[0], &sumH[0]);
279 0 : madd_avx2(dgd, dgds[1], &sumH[1]);
280 0 : madd_avx2(dgd, dgds[2], &sumH[2]);
281 0 : }
282 :
283 2160000 : static INLINE void stats_top_win5_avx2(const __m256i src, const __m256i dgd,
284 : const int16_t *const d,
285 : const int32_t d_stride,
286 : __m256i sumM[WIENER_WIN_CHROMA],
287 : __m256i sumH[WIENER_WIN_CHROMA]) {
288 : __m256i dgds[WIENER_WIN_CHROMA];
289 :
290 2160000 : dgds[0] = _mm256_loadu_si256((__m256i *)(d + 0 * d_stride));
291 2160000 : dgds[1] = _mm256_loadu_si256((__m256i *)(d + 1 * d_stride));
292 2160000 : dgds[2] = _mm256_loadu_si256((__m256i *)(d + 2 * d_stride));
293 2160000 : dgds[3] = _mm256_loadu_si256((__m256i *)(d + 3 * d_stride));
294 2160000 : dgds[4] = _mm256_loadu_si256((__m256i *)(d + 4 * d_stride));
295 :
296 2160000 : madd_avx2(src, dgds[0], &sumM[0]);
297 2160000 : madd_avx2(src, dgds[1], &sumM[1]);
298 2160000 : madd_avx2(src, dgds[2], &sumM[2]);
299 2160000 : madd_avx2(src, dgds[3], &sumM[3]);
300 2160000 : madd_avx2(src, dgds[4], &sumM[4]);
301 :
302 2160000 : madd_avx2(dgd, dgds[0], &sumH[0]);
303 2160000 : madd_avx2(dgd, dgds[1], &sumH[1]);
304 2160000 : madd_avx2(dgd, dgds[2], &sumH[2]);
305 2160000 : madd_avx2(dgd, dgds[3], &sumH[3]);
306 2160000 : madd_avx2(dgd, dgds[4], &sumH[4]);
307 2160000 : }
308 :
309 6048000 : static INLINE void stats_top_win7_avx2(const __m256i src, const __m256i dgd,
310 : const int16_t *const d,
311 : const int32_t d_stride,
312 : __m256i sumM[WIENER_WIN],
313 : __m256i sumH[WIENER_WIN]) {
314 : __m256i dgds[WIENER_WIN];
315 :
316 6048000 : dgds[0] = _mm256_loadu_si256((__m256i *)(d + 0 * d_stride));
317 6048000 : dgds[1] = _mm256_loadu_si256((__m256i *)(d + 1 * d_stride));
318 6048000 : dgds[2] = _mm256_loadu_si256((__m256i *)(d + 2 * d_stride));
319 6048000 : dgds[3] = _mm256_loadu_si256((__m256i *)(d + 3 * d_stride));
320 6048000 : dgds[4] = _mm256_loadu_si256((__m256i *)(d + 4 * d_stride));
321 6048000 : dgds[5] = _mm256_loadu_si256((__m256i *)(d + 5 * d_stride));
322 6048000 : dgds[6] = _mm256_loadu_si256((__m256i *)(d + 6 * d_stride));
323 :
324 6048000 : madd_avx2(src, dgds[0], &sumM[0]);
325 6048000 : madd_avx2(src, dgds[1], &sumM[1]);
326 6048000 : madd_avx2(src, dgds[2], &sumM[2]);
327 6048000 : madd_avx2(src, dgds[3], &sumM[3]);
328 6048000 : madd_avx2(src, dgds[4], &sumM[4]);
329 6048000 : madd_avx2(src, dgds[5], &sumM[5]);
330 6048000 : madd_avx2(src, dgds[6], &sumM[6]);
331 :
332 6048000 : madd_avx2(dgd, dgds[0], &sumH[0]);
333 6048000 : madd_avx2(dgd, dgds[1], &sumH[1]);
334 6048000 : madd_avx2(dgd, dgds[2], &sumH[2]);
335 6048000 : madd_avx2(dgd, dgds[3], &sumH[3]);
336 6048000 : madd_avx2(dgd, dgds[4], &sumH[4]);
337 6048000 : madd_avx2(dgd, dgds[5], &sumH[5]);
338 6048000 : madd_avx2(dgd, dgds[6], &sumH[6]);
339 6048000 : }
340 :
341 0 : static INLINE void stats_left_win3_avx2(const __m256i src, const int16_t *d,
342 : const int32_t d_stride,
343 : __m256i sum[WIENER_WIN_3TAP - 1]) {
344 : __m256i dgds[WIENER_WIN_3TAP - 1];
345 :
346 0 : dgds[0] = _mm256_load_si256((__m256i *)(d + 1 * d_stride));
347 0 : dgds[1] = _mm256_load_si256((__m256i *)(d + 2 * d_stride));
348 :
349 0 : madd_avx2(src, dgds[0], &sum[0]);
350 0 : madd_avx2(src, dgds[1], &sum[1]);
351 0 : }
352 :
353 1728000 : static INLINE void stats_left_win5_avx2(const __m256i src, const int16_t *d,
354 : const int32_t d_stride,
355 : __m256i sum[WIENER_WIN_CHROMA - 1]) {
356 : __m256i dgds[WIENER_WIN_CHROMA - 1];
357 :
358 1728000 : dgds[0] = _mm256_load_si256((__m256i *)(d + 1 * d_stride));
359 1728000 : dgds[1] = _mm256_load_si256((__m256i *)(d + 2 * d_stride));
360 1728000 : dgds[2] = _mm256_load_si256((__m256i *)(d + 3 * d_stride));
361 1728000 : dgds[3] = _mm256_load_si256((__m256i *)(d + 4 * d_stride));
362 :
363 1728000 : madd_avx2(src, dgds[0], &sum[0]);
364 1728000 : madd_avx2(src, dgds[1], &sum[1]);
365 1728000 : madd_avx2(src, dgds[2], &sum[2]);
366 1728000 : madd_avx2(src, dgds[3], &sum[3]);
367 1728000 : }
368 :
369 5184000 : static INLINE void stats_left_win7_avx2(const __m256i src, const int16_t *d,
370 : const int32_t d_stride,
371 : __m256i sum[WIENER_WIN - 1]) {
372 : __m256i dgds[WIENER_WIN - 1];
373 :
374 5184000 : dgds[0] = _mm256_load_si256((__m256i *)(d + 1 * d_stride));
375 5184000 : dgds[1] = _mm256_load_si256((__m256i *)(d + 2 * d_stride));
376 5184000 : dgds[2] = _mm256_load_si256((__m256i *)(d + 3 * d_stride));
377 5184000 : dgds[3] = _mm256_load_si256((__m256i *)(d + 4 * d_stride));
378 5184000 : dgds[4] = _mm256_load_si256((__m256i *)(d + 5 * d_stride));
379 5184000 : dgds[5] = _mm256_load_si256((__m256i *)(d + 6 * d_stride));
380 :
381 5184000 : madd_avx2(src, dgds[0], &sum[0]);
382 5184000 : madd_avx2(src, dgds[1], &sum[1]);
383 5184000 : madd_avx2(src, dgds[2], &sum[2]);
384 5184000 : madd_avx2(src, dgds[3], &sum[3]);
385 5184000 : madd_avx2(src, dgds[4], &sum[4]);
386 5184000 : madd_avx2(src, dgds[5], &sum[5]);
387 5184000 : }
388 :
389 0 : static INLINE void load_square_win3_avx2(
390 : const int16_t *const dI, const int16_t *const dJ, const int32_t d_stride,
391 : const int32_t height, __m256i dIs[WIENER_WIN_3TAP - 1],
392 : __m256i dIe[WIENER_WIN_3TAP - 1], __m256i dJs[WIENER_WIN_3TAP - 1],
393 : __m256i dJe[WIENER_WIN_3TAP - 1]) {
394 0 : dIs[0] = _mm256_loadu_si256((__m256i *)(dI + 0 * d_stride));
395 0 : dJs[0] = _mm256_loadu_si256((__m256i *)(dJ + 0 * d_stride));
396 0 : dIs[1] = _mm256_loadu_si256((__m256i *)(dI + 1 * d_stride));
397 0 : dJs[1] = _mm256_loadu_si256((__m256i *)(dJ + 1 * d_stride));
398 :
399 0 : dIe[0] = _mm256_loadu_si256((__m256i *)(dI + (0 + height) * d_stride));
400 0 : dJe[0] = _mm256_loadu_si256((__m256i *)(dJ + (0 + height) * d_stride));
401 0 : dIe[1] = _mm256_loadu_si256((__m256i *)(dI + (1 + height) * d_stride));
402 0 : dJe[1] = _mm256_loadu_si256((__m256i *)(dJ + (1 + height) * d_stride));
403 0 : }
404 :
405 24000 : static INLINE void load_square_win5_avx2(
406 : const int16_t *const dI, const int16_t *const dJ, const int32_t d_stride,
407 : const int32_t height, __m256i dIs[WIENER_WIN_CHROMA - 1],
408 : __m256i dIe[WIENER_WIN_CHROMA - 1], __m256i dJs[WIENER_WIN_CHROMA - 1],
409 : __m256i dJe[WIENER_WIN_CHROMA - 1]) {
410 24000 : dIs[0] = _mm256_loadu_si256((__m256i *)(dI + 0 * d_stride));
411 24000 : dJs[0] = _mm256_loadu_si256((__m256i *)(dJ + 0 * d_stride));
412 24000 : dIs[1] = _mm256_loadu_si256((__m256i *)(dI + 1 * d_stride));
413 24000 : dJs[1] = _mm256_loadu_si256((__m256i *)(dJ + 1 * d_stride));
414 24000 : dIs[2] = _mm256_loadu_si256((__m256i *)(dI + 2 * d_stride));
415 24000 : dJs[2] = _mm256_loadu_si256((__m256i *)(dJ + 2 * d_stride));
416 24000 : dIs[3] = _mm256_loadu_si256((__m256i *)(dI + 3 * d_stride));
417 24000 : dJs[3] = _mm256_loadu_si256((__m256i *)(dJ + 3 * d_stride));
418 :
419 24000 : dIe[0] = _mm256_loadu_si256((__m256i *)(dI + (0 + height) * d_stride));
420 24000 : dJe[0] = _mm256_loadu_si256((__m256i *)(dJ + (0 + height) * d_stride));
421 24000 : dIe[1] = _mm256_loadu_si256((__m256i *)(dI + (1 + height) * d_stride));
422 24000 : dJe[1] = _mm256_loadu_si256((__m256i *)(dJ + (1 + height) * d_stride));
423 24000 : dIe[2] = _mm256_loadu_si256((__m256i *)(dI + (2 + height) * d_stride));
424 24000 : dJe[2] = _mm256_loadu_si256((__m256i *)(dJ + (2 + height) * d_stride));
425 24000 : dIe[3] = _mm256_loadu_si256((__m256i *)(dI + (3 + height) * d_stride));
426 24000 : dJe[3] = _mm256_loadu_si256((__m256i *)(dJ + (3 + height) * d_stride));
427 24000 : }
428 :
429 50400 : static INLINE void load_square_win7_avx2(
430 : const int16_t *const dI, const int16_t *const dJ, const int32_t d_stride,
431 : const int32_t height, __m256i dIs[WIENER_WIN - 1],
432 : __m256i dIe[WIENER_WIN - 1], __m256i dJs[WIENER_WIN - 1],
433 : __m256i dJe[WIENER_WIN - 1]) {
434 50400 : dIs[0] = _mm256_loadu_si256((__m256i *)(dI + 0 * d_stride));
435 50400 : dJs[0] = _mm256_loadu_si256((__m256i *)(dJ + 0 * d_stride));
436 50400 : dIs[1] = _mm256_loadu_si256((__m256i *)(dI + 1 * d_stride));
437 50400 : dJs[1] = _mm256_loadu_si256((__m256i *)(dJ + 1 * d_stride));
438 50400 : dIs[2] = _mm256_loadu_si256((__m256i *)(dI + 2 * d_stride));
439 50400 : dJs[2] = _mm256_loadu_si256((__m256i *)(dJ + 2 * d_stride));
440 50400 : dIs[3] = _mm256_loadu_si256((__m256i *)(dI + 3 * d_stride));
441 50400 : dJs[3] = _mm256_loadu_si256((__m256i *)(dJ + 3 * d_stride));
442 50400 : dIs[4] = _mm256_loadu_si256((__m256i *)(dI + 4 * d_stride));
443 50400 : dJs[4] = _mm256_loadu_si256((__m256i *)(dJ + 4 * d_stride));
444 50400 : dIs[5] = _mm256_loadu_si256((__m256i *)(dI + 5 * d_stride));
445 50400 : dJs[5] = _mm256_loadu_si256((__m256i *)(dJ + 5 * d_stride));
446 :
447 50400 : dIe[0] = _mm256_loadu_si256((__m256i *)(dI + (0 + height) * d_stride));
448 50400 : dJe[0] = _mm256_loadu_si256((__m256i *)(dJ + (0 + height) * d_stride));
449 50400 : dIe[1] = _mm256_loadu_si256((__m256i *)(dI + (1 + height) * d_stride));
450 50400 : dJe[1] = _mm256_loadu_si256((__m256i *)(dJ + (1 + height) * d_stride));
451 50400 : dIe[2] = _mm256_loadu_si256((__m256i *)(dI + (2 + height) * d_stride));
452 50400 : dJe[2] = _mm256_loadu_si256((__m256i *)(dJ + (2 + height) * d_stride));
453 50400 : dIe[3] = _mm256_loadu_si256((__m256i *)(dI + (3 + height) * d_stride));
454 50400 : dJe[3] = _mm256_loadu_si256((__m256i *)(dJ + (3 + height) * d_stride));
455 50400 : dIe[4] = _mm256_loadu_si256((__m256i *)(dI + (4 + height) * d_stride));
456 50400 : dJe[4] = _mm256_loadu_si256((__m256i *)(dJ + (4 + height) * d_stride));
457 50400 : dIe[5] = _mm256_loadu_si256((__m256i *)(dI + (5 + height) * d_stride));
458 50400 : dJe[5] = _mm256_loadu_si256((__m256i *)(dJ + (5 + height) * d_stride));
459 50400 : }
460 :
461 0 : static INLINE void load_triangle_win3_avx2(const int16_t *const dI,
462 : const int32_t d_stride,
463 : const int32_t height,
464 : __m256i dIs[WIENER_WIN_3TAP - 1],
465 : __m256i dIe[WIENER_WIN_3TAP - 1]) {
466 0 : dIs[0] = _mm256_loadu_si256((__m256i *)(dI + 0 * d_stride));
467 0 : dIs[1] = _mm256_loadu_si256((__m256i *)(dI + 1 * d_stride));
468 :
469 0 : dIe[0] = _mm256_loadu_si256((__m256i *)(dI + (0 + height) * d_stride));
470 0 : dIe[1] = _mm256_loadu_si256((__m256i *)(dI + (1 + height) * d_stride));
471 0 : }
472 :
473 12000 : static INLINE void load_triangle_win5_avx2(const int16_t *const dI,
474 : const int32_t d_stride,
475 : const int32_t height,
476 : __m256i dIs[WIENER_WIN_CHROMA - 1],
477 : __m256i dIe[WIENER_WIN_CHROMA - 1]) {
478 12000 : dIs[0] = _mm256_loadu_si256((__m256i *)(dI + 0 * d_stride));
479 12000 : dIs[1] = _mm256_loadu_si256((__m256i *)(dI + 1 * d_stride));
480 12000 : dIs[2] = _mm256_loadu_si256((__m256i *)(dI + 2 * d_stride));
481 12000 : dIs[3] = _mm256_loadu_si256((__m256i *)(dI + 3 * d_stride));
482 :
483 12000 : dIe[0] = _mm256_loadu_si256((__m256i *)(dI + (0 + height) * d_stride));
484 12000 : dIe[1] = _mm256_loadu_si256((__m256i *)(dI + (1 + height) * d_stride));
485 12000 : dIe[2] = _mm256_loadu_si256((__m256i *)(dI + (2 + height) * d_stride));
486 12000 : dIe[3] = _mm256_loadu_si256((__m256i *)(dI + (3 + height) * d_stride));
487 12000 : }
488 :
489 16800 : static INLINE void load_triangle_win7_avx2(const int16_t *const dI,
490 : const int32_t d_stride,
491 : const int32_t height,
492 : __m256i dIs[WIENER_WIN - 1],
493 : __m256i dIe[WIENER_WIN - 1]) {
494 16800 : dIs[0] = _mm256_loadu_si256((__m256i *)(dI + 0 * d_stride));
495 16800 : dIs[1] = _mm256_loadu_si256((__m256i *)(dI + 1 * d_stride));
496 16800 : dIs[2] = _mm256_loadu_si256((__m256i *)(dI + 2 * d_stride));
497 16800 : dIs[3] = _mm256_loadu_si256((__m256i *)(dI + 3 * d_stride));
498 16800 : dIs[4] = _mm256_loadu_si256((__m256i *)(dI + 4 * d_stride));
499 16800 : dIs[5] = _mm256_loadu_si256((__m256i *)(dI + 5 * d_stride));
500 :
501 16800 : dIe[0] = _mm256_loadu_si256((__m256i *)(dI + (0 + height) * d_stride));
502 16800 : dIe[1] = _mm256_loadu_si256((__m256i *)(dI + (1 + height) * d_stride));
503 16800 : dIe[2] = _mm256_loadu_si256((__m256i *)(dI + (2 + height) * d_stride));
504 16800 : dIe[3] = _mm256_loadu_si256((__m256i *)(dI + (3 + height) * d_stride));
505 16800 : dIe[4] = _mm256_loadu_si256((__m256i *)(dI + (4 + height) * d_stride));
506 16800 : dIe[5] = _mm256_loadu_si256((__m256i *)(dI + (5 + height) * d_stride));
507 16800 : }
508 :
509 0 : static INLINE void derive_square_win3_avx2(
510 : const __m256i dIs[WIENER_WIN_3TAP - 1],
511 : const __m256i dIe[WIENER_WIN_3TAP - 1],
512 : const __m256i dJs[WIENER_WIN_3TAP - 1],
513 : const __m256i dJe[WIENER_WIN_3TAP - 1],
514 : __m256i deltas[WIENER_WIN_3TAP - 1][WIENER_WIN_3TAP - 1]) {
515 0 : msub_avx2(dIs[0], dJs[0], &deltas[0][0]);
516 0 : msub_avx2(dIs[0], dJs[1], &deltas[0][1]);
517 0 : msub_avx2(dIs[1], dJs[0], &deltas[1][0]);
518 0 : msub_avx2(dIs[1], dJs[1], &deltas[1][1]);
519 :
520 0 : madd_avx2(dIe[0], dJe[0], &deltas[0][0]);
521 0 : madd_avx2(dIe[0], dJe[1], &deltas[0][1]);
522 0 : madd_avx2(dIe[1], dJe[0], &deltas[1][0]);
523 0 : madd_avx2(dIe[1], dJe[1], &deltas[1][1]);
524 0 : }
525 :
526 24000 : static INLINE void derive_square_win5_avx2(
527 : const __m256i dIs[WIENER_WIN_CHROMA - 1],
528 : const __m256i dIe[WIENER_WIN_CHROMA - 1],
529 : const __m256i dJs[WIENER_WIN_CHROMA - 1],
530 : const __m256i dJe[WIENER_WIN_CHROMA - 1],
531 : __m256i deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) {
532 24000 : msub_avx2(dIs[0], dJs[0], &deltas[0][0]);
533 24000 : msub_avx2(dIs[0], dJs[1], &deltas[0][1]);
534 24000 : msub_avx2(dIs[0], dJs[2], &deltas[0][2]);
535 24000 : msub_avx2(dIs[0], dJs[3], &deltas[0][3]);
536 24000 : msub_avx2(dIs[1], dJs[0], &deltas[1][0]);
537 24000 : msub_avx2(dIs[1], dJs[1], &deltas[1][1]);
538 24000 : msub_avx2(dIs[1], dJs[2], &deltas[1][2]);
539 24000 : msub_avx2(dIs[1], dJs[3], &deltas[1][3]);
540 24000 : msub_avx2(dIs[2], dJs[0], &deltas[2][0]);
541 24000 : msub_avx2(dIs[2], dJs[1], &deltas[2][1]);
542 24000 : msub_avx2(dIs[2], dJs[2], &deltas[2][2]);
543 24000 : msub_avx2(dIs[2], dJs[3], &deltas[2][3]);
544 24000 : msub_avx2(dIs[3], dJs[0], &deltas[3][0]);
545 24000 : msub_avx2(dIs[3], dJs[1], &deltas[3][1]);
546 24000 : msub_avx2(dIs[3], dJs[2], &deltas[3][2]);
547 24000 : msub_avx2(dIs[3], dJs[3], &deltas[3][3]);
548 :
549 24000 : madd_avx2(dIe[0], dJe[0], &deltas[0][0]);
550 24000 : madd_avx2(dIe[0], dJe[1], &deltas[0][1]);
551 24000 : madd_avx2(dIe[0], dJe[2], &deltas[0][2]);
552 24000 : madd_avx2(dIe[0], dJe[3], &deltas[0][3]);
553 24000 : madd_avx2(dIe[1], dJe[0], &deltas[1][0]);
554 24000 : madd_avx2(dIe[1], dJe[1], &deltas[1][1]);
555 24000 : madd_avx2(dIe[1], dJe[2], &deltas[1][2]);
556 24000 : madd_avx2(dIe[1], dJe[3], &deltas[1][3]);
557 24000 : madd_avx2(dIe[2], dJe[0], &deltas[2][0]);
558 24000 : madd_avx2(dIe[2], dJe[1], &deltas[2][1]);
559 24000 : madd_avx2(dIe[2], dJe[2], &deltas[2][2]);
560 24000 : madd_avx2(dIe[2], dJe[3], &deltas[2][3]);
561 24000 : madd_avx2(dIe[3], dJe[0], &deltas[3][0]);
562 24000 : madd_avx2(dIe[3], dJe[1], &deltas[3][1]);
563 24000 : madd_avx2(dIe[3], dJe[2], &deltas[3][2]);
564 24000 : madd_avx2(dIe[3], dJe[3], &deltas[3][3]);
565 24000 : }
566 :
567 50400 : static INLINE void derive_square_win7_avx2(
568 : const __m256i dIs[WIENER_WIN - 1], const __m256i dIe[WIENER_WIN - 1],
569 : const __m256i dJs[WIENER_WIN - 1], const __m256i dJe[WIENER_WIN - 1],
570 : __m256i deltas[WIENER_WIN - 1][WIENER_WIN - 1]) {
571 50400 : msub_avx2(dIs[0], dJs[0], &deltas[0][0]);
572 50400 : msub_avx2(dIs[0], dJs[1], &deltas[0][1]);
573 50400 : msub_avx2(dIs[0], dJs[2], &deltas[0][2]);
574 50400 : msub_avx2(dIs[0], dJs[3], &deltas[0][3]);
575 50400 : msub_avx2(dIs[0], dJs[4], &deltas[0][4]);
576 50400 : msub_avx2(dIs[0], dJs[5], &deltas[0][5]);
577 50400 : msub_avx2(dIs[1], dJs[0], &deltas[1][0]);
578 50400 : msub_avx2(dIs[1], dJs[1], &deltas[1][1]);
579 50400 : msub_avx2(dIs[1], dJs[2], &deltas[1][2]);
580 50400 : msub_avx2(dIs[1], dJs[3], &deltas[1][3]);
581 50400 : msub_avx2(dIs[1], dJs[4], &deltas[1][4]);
582 50400 : msub_avx2(dIs[1], dJs[5], &deltas[1][5]);
583 50400 : msub_avx2(dIs[2], dJs[0], &deltas[2][0]);
584 50400 : msub_avx2(dIs[2], dJs[1], &deltas[2][1]);
585 50400 : msub_avx2(dIs[2], dJs[2], &deltas[2][2]);
586 50400 : msub_avx2(dIs[2], dJs[3], &deltas[2][3]);
587 50400 : msub_avx2(dIs[2], dJs[4], &deltas[2][4]);
588 50400 : msub_avx2(dIs[2], dJs[5], &deltas[2][5]);
589 50400 : msub_avx2(dIs[3], dJs[0], &deltas[3][0]);
590 50400 : msub_avx2(dIs[3], dJs[1], &deltas[3][1]);
591 50400 : msub_avx2(dIs[3], dJs[2], &deltas[3][2]);
592 50400 : msub_avx2(dIs[3], dJs[3], &deltas[3][3]);
593 50400 : msub_avx2(dIs[3], dJs[4], &deltas[3][4]);
594 50400 : msub_avx2(dIs[3], dJs[5], &deltas[3][5]);
595 50400 : msub_avx2(dIs[4], dJs[0], &deltas[4][0]);
596 50400 : msub_avx2(dIs[4], dJs[1], &deltas[4][1]);
597 50400 : msub_avx2(dIs[4], dJs[2], &deltas[4][2]);
598 50400 : msub_avx2(dIs[4], dJs[3], &deltas[4][3]);
599 50400 : msub_avx2(dIs[4], dJs[4], &deltas[4][4]);
600 50400 : msub_avx2(dIs[4], dJs[5], &deltas[4][5]);
601 50400 : msub_avx2(dIs[5], dJs[0], &deltas[5][0]);
602 50400 : msub_avx2(dIs[5], dJs[1], &deltas[5][1]);
603 50400 : msub_avx2(dIs[5], dJs[2], &deltas[5][2]);
604 50400 : msub_avx2(dIs[5], dJs[3], &deltas[5][3]);
605 50400 : msub_avx2(dIs[5], dJs[4], &deltas[5][4]);
606 50400 : msub_avx2(dIs[5], dJs[5], &deltas[5][5]);
607 :
608 50400 : madd_avx2(dIe[0], dJe[0], &deltas[0][0]);
609 50400 : madd_avx2(dIe[0], dJe[1], &deltas[0][1]);
610 50400 : madd_avx2(dIe[0], dJe[2], &deltas[0][2]);
611 50400 : madd_avx2(dIe[0], dJe[3], &deltas[0][3]);
612 50400 : madd_avx2(dIe[0], dJe[4], &deltas[0][4]);
613 50400 : madd_avx2(dIe[0], dJe[5], &deltas[0][5]);
614 50400 : madd_avx2(dIe[1], dJe[0], &deltas[1][0]);
615 50400 : madd_avx2(dIe[1], dJe[1], &deltas[1][1]);
616 50400 : madd_avx2(dIe[1], dJe[2], &deltas[1][2]);
617 50400 : madd_avx2(dIe[1], dJe[3], &deltas[1][3]);
618 50400 : madd_avx2(dIe[1], dJe[4], &deltas[1][4]);
619 50400 : madd_avx2(dIe[1], dJe[5], &deltas[1][5]);
620 50400 : madd_avx2(dIe[2], dJe[0], &deltas[2][0]);
621 50400 : madd_avx2(dIe[2], dJe[1], &deltas[2][1]);
622 50400 : madd_avx2(dIe[2], dJe[2], &deltas[2][2]);
623 50400 : madd_avx2(dIe[2], dJe[3], &deltas[2][3]);
624 50400 : madd_avx2(dIe[2], dJe[4], &deltas[2][4]);
625 50400 : madd_avx2(dIe[2], dJe[5], &deltas[2][5]);
626 50400 : madd_avx2(dIe[3], dJe[0], &deltas[3][0]);
627 50400 : madd_avx2(dIe[3], dJe[1], &deltas[3][1]);
628 50400 : madd_avx2(dIe[3], dJe[2], &deltas[3][2]);
629 50400 : madd_avx2(dIe[3], dJe[3], &deltas[3][3]);
630 50400 : madd_avx2(dIe[3], dJe[4], &deltas[3][4]);
631 50400 : madd_avx2(dIe[3], dJe[5], &deltas[3][5]);
632 50400 : madd_avx2(dIe[4], dJe[0], &deltas[4][0]);
633 50400 : madd_avx2(dIe[4], dJe[1], &deltas[4][1]);
634 50400 : madd_avx2(dIe[4], dJe[2], &deltas[4][2]);
635 50400 : madd_avx2(dIe[4], dJe[3], &deltas[4][3]);
636 50400 : madd_avx2(dIe[4], dJe[4], &deltas[4][4]);
637 50400 : madd_avx2(dIe[4], dJe[5], &deltas[4][5]);
638 50400 : madd_avx2(dIe[5], dJe[0], &deltas[5][0]);
639 50400 : madd_avx2(dIe[5], dJe[1], &deltas[5][1]);
640 50400 : madd_avx2(dIe[5], dJe[2], &deltas[5][2]);
641 50400 : madd_avx2(dIe[5], dJe[3], &deltas[5][3]);
642 50400 : madd_avx2(dIe[5], dJe[4], &deltas[5][4]);
643 50400 : madd_avx2(dIe[5], dJe[5], &deltas[5][5]);
644 50400 : }
645 :
646 0 : static INLINE void derive_triangle_win3_avx2(
647 : const __m256i dIs[WIENER_WIN_3TAP - 1],
648 : const __m256i dIe[WIENER_WIN_3TAP - 1],
649 : __m256i deltas[WIENER_WIN_3TAP * (WIENER_WIN_3TAP - 1) / 2]) {
650 0 : msub_avx2(dIs[0], dIs[0], &deltas[0]);
651 0 : msub_avx2(dIs[0], dIs[1], &deltas[1]);
652 0 : msub_avx2(dIs[1], dIs[1], &deltas[2]);
653 :
654 0 : madd_avx2(dIe[0], dIe[0], &deltas[0]);
655 0 : madd_avx2(dIe[0], dIe[1], &deltas[1]);
656 0 : madd_avx2(dIe[1], dIe[1], &deltas[2]);
657 0 : }
658 :
659 12000 : static INLINE void derive_triangle_win5_avx2(
660 : const __m256i dIs[WIENER_WIN_CHROMA - 1],
661 : const __m256i dIe[WIENER_WIN_CHROMA - 1],
662 : __m256i deltas[WIENER_WIN_CHROMA * (WIENER_WIN_CHROMA - 1) / 2]) {
663 12000 : msub_avx2(dIs[0], dIs[0], &deltas[0]);
664 12000 : msub_avx2(dIs[0], dIs[1], &deltas[1]);
665 12000 : msub_avx2(dIs[0], dIs[2], &deltas[2]);
666 12000 : msub_avx2(dIs[0], dIs[3], &deltas[3]);
667 12000 : msub_avx2(dIs[1], dIs[1], &deltas[4]);
668 12000 : msub_avx2(dIs[1], dIs[2], &deltas[5]);
669 12000 : msub_avx2(dIs[1], dIs[3], &deltas[6]);
670 12000 : msub_avx2(dIs[2], dIs[2], &deltas[7]);
671 12000 : msub_avx2(dIs[2], dIs[3], &deltas[8]);
672 12000 : msub_avx2(dIs[3], dIs[3], &deltas[9]);
673 :
674 12000 : madd_avx2(dIe[0], dIe[0], &deltas[0]);
675 12000 : madd_avx2(dIe[0], dIe[1], &deltas[1]);
676 12000 : madd_avx2(dIe[0], dIe[2], &deltas[2]);
677 12000 : madd_avx2(dIe[0], dIe[3], &deltas[3]);
678 12000 : madd_avx2(dIe[1], dIe[1], &deltas[4]);
679 12000 : madd_avx2(dIe[1], dIe[2], &deltas[5]);
680 12000 : madd_avx2(dIe[1], dIe[3], &deltas[6]);
681 12000 : madd_avx2(dIe[2], dIe[2], &deltas[7]);
682 12000 : madd_avx2(dIe[2], dIe[3], &deltas[8]);
683 12000 : madd_avx2(dIe[3], dIe[3], &deltas[9]);
684 12000 : }
685 :
686 16800 : static INLINE void derive_triangle_win7_avx2(
687 : const __m256i dIs[WIENER_WIN - 1], const __m256i dIe[WIENER_WIN - 1],
688 : __m256i deltas[WIENER_WIN * (WIENER_WIN - 1) / 2]) {
689 16800 : msub_avx2(dIs[0], dIs[0], &deltas[0]);
690 16800 : msub_avx2(dIs[0], dIs[1], &deltas[1]);
691 16800 : msub_avx2(dIs[0], dIs[2], &deltas[2]);
692 16800 : msub_avx2(dIs[0], dIs[3], &deltas[3]);
693 16800 : msub_avx2(dIs[0], dIs[4], &deltas[4]);
694 16800 : msub_avx2(dIs[0], dIs[5], &deltas[5]);
695 16800 : msub_avx2(dIs[1], dIs[1], &deltas[6]);
696 16800 : msub_avx2(dIs[1], dIs[2], &deltas[7]);
697 16800 : msub_avx2(dIs[1], dIs[3], &deltas[8]);
698 16800 : msub_avx2(dIs[1], dIs[4], &deltas[9]);
699 16800 : msub_avx2(dIs[1], dIs[5], &deltas[10]);
700 16800 : msub_avx2(dIs[2], dIs[2], &deltas[11]);
701 16800 : msub_avx2(dIs[2], dIs[3], &deltas[12]);
702 16800 : msub_avx2(dIs[2], dIs[4], &deltas[13]);
703 16800 : msub_avx2(dIs[2], dIs[5], &deltas[14]);
704 16800 : msub_avx2(dIs[3], dIs[3], &deltas[15]);
705 16800 : msub_avx2(dIs[3], dIs[4], &deltas[16]);
706 16800 : msub_avx2(dIs[3], dIs[5], &deltas[17]);
707 16800 : msub_avx2(dIs[4], dIs[4], &deltas[18]);
708 16800 : msub_avx2(dIs[4], dIs[5], &deltas[19]);
709 16800 : msub_avx2(dIs[5], dIs[5], &deltas[20]);
710 :
711 16800 : madd_avx2(dIe[0], dIe[0], &deltas[0]);
712 16800 : madd_avx2(dIe[0], dIe[1], &deltas[1]);
713 16800 : madd_avx2(dIe[0], dIe[2], &deltas[2]);
714 16800 : madd_avx2(dIe[0], dIe[3], &deltas[3]);
715 16800 : madd_avx2(dIe[0], dIe[4], &deltas[4]);
716 16800 : madd_avx2(dIe[0], dIe[5], &deltas[5]);
717 16800 : madd_avx2(dIe[1], dIe[1], &deltas[6]);
718 16800 : madd_avx2(dIe[1], dIe[2], &deltas[7]);
719 16800 : madd_avx2(dIe[1], dIe[3], &deltas[8]);
720 16800 : madd_avx2(dIe[1], dIe[4], &deltas[9]);
721 16800 : madd_avx2(dIe[1], dIe[5], &deltas[10]);
722 16800 : madd_avx2(dIe[2], dIe[2], &deltas[11]);
723 16800 : madd_avx2(dIe[2], dIe[3], &deltas[12]);
724 16800 : madd_avx2(dIe[2], dIe[4], &deltas[13]);
725 16800 : madd_avx2(dIe[2], dIe[5], &deltas[14]);
726 16800 : madd_avx2(dIe[3], dIe[3], &deltas[15]);
727 16800 : madd_avx2(dIe[3], dIe[4], &deltas[16]);
728 16800 : madd_avx2(dIe[3], dIe[5], &deltas[17]);
729 16800 : madd_avx2(dIe[4], dIe[4], &deltas[18]);
730 16800 : madd_avx2(dIe[4], dIe[5], &deltas[19]);
731 16800 : madd_avx2(dIe[5], dIe[5], &deltas[20]);
732 16800 : }
733 :
734 0 : static INLINE __m256i div4_avx2(const __m256i src) {
735 : __m256i sign, dst;
736 :
737 : // get sign
738 0 : sign = _mm256_srli_epi64(src, 63);
739 0 : sign = _mm256_sub_epi64(_mm256_setzero_si256(), sign);
740 :
741 : // abs
742 0 : dst = _mm256_xor_si256(src, sign);
743 0 : dst = _mm256_sub_epi64(dst, sign);
744 :
745 : // divide by 4
746 0 : dst = _mm256_srli_epi64(dst, 2);
747 :
748 : // apply sign
749 0 : dst = _mm256_xor_si256(dst, sign);
750 0 : return _mm256_sub_epi64(dst, sign);
751 : }
752 :
753 0 : static INLINE __m256i div16_avx2(const __m256i src) {
754 : __m256i sign, dst;
755 :
756 : // get sign
757 0 : sign = _mm256_srli_epi64(src, 63);
758 0 : sign = _mm256_sub_epi64(_mm256_setzero_si256(), sign);
759 :
760 : // abs
761 0 : dst = _mm256_xor_si256(src, sign);
762 0 : dst = _mm256_sub_epi64(dst, sign);
763 :
764 : // divide by 16
765 0 : dst = _mm256_srli_epi64(dst, 4);
766 :
767 : // apply sign
768 0 : dst = _mm256_xor_si256(dst, sign);
769 0 : return _mm256_sub_epi64(dst, sign);
770 : }
771 :
772 0 : static INLINE void div4_4x4_avx2(const int32_t wiener_win2, int64_t *const H,
773 : __m256i out[4]) {
774 0 : out[0] = _mm256_loadu_si256((__m256i *)(H + 0 * wiener_win2));
775 0 : out[1] = _mm256_loadu_si256((__m256i *)(H + 1 * wiener_win2));
776 0 : out[2] = _mm256_loadu_si256((__m256i *)(H + 2 * wiener_win2));
777 0 : out[3] = _mm256_loadu_si256((__m256i *)(H + 3 * wiener_win2));
778 :
779 0 : out[0] = div4_avx2(out[0]);
780 0 : out[1] = div4_avx2(out[1]);
781 0 : out[2] = div4_avx2(out[2]);
782 0 : out[3] = div4_avx2(out[3]);
783 :
784 0 : _mm256_storeu_si256((__m256i *)(H + 0 * wiener_win2), out[0]);
785 0 : _mm256_storeu_si256((__m256i *)(H + 1 * wiener_win2), out[1]);
786 0 : _mm256_storeu_si256((__m256i *)(H + 2 * wiener_win2), out[2]);
787 0 : _mm256_storeu_si256((__m256i *)(H + 3 * wiener_win2), out[3]);
788 0 : }
789 :
790 0 : static INLINE void div16_4x4_avx2(const int32_t wiener_win2, int64_t *const H,
791 : __m256i out[4]) {
792 0 : out[0] = _mm256_loadu_si256((__m256i *)(H + 0 * wiener_win2));
793 0 : out[1] = _mm256_loadu_si256((__m256i *)(H + 1 * wiener_win2));
794 0 : out[2] = _mm256_loadu_si256((__m256i *)(H + 2 * wiener_win2));
795 0 : out[3] = _mm256_loadu_si256((__m256i *)(H + 3 * wiener_win2));
796 :
797 0 : out[0] = div16_avx2(out[0]);
798 0 : out[1] = div16_avx2(out[1]);
799 0 : out[2] = div16_avx2(out[2]);
800 0 : out[3] = div16_avx2(out[3]);
801 :
802 0 : _mm256_storeu_si256((__m256i *)(H + 0 * wiener_win2), out[0]);
803 0 : _mm256_storeu_si256((__m256i *)(H + 1 * wiener_win2), out[1]);
804 0 : _mm256_storeu_si256((__m256i *)(H + 2 * wiener_win2), out[2]);
805 0 : _mm256_storeu_si256((__m256i *)(H + 3 * wiener_win2), out[3]);
806 0 : }
807 :
808 : // Transpose each 4x4 block starting from the second column, and save the needed
809 : // points only.
810 300 : static INLINE void diagonal_copy_stats_avx2(const int32_t wiener_win2,
811 : int64_t *const H) {
812 3180 : for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
813 : __m256i in[4], out[4];
814 :
815 2880 : in[0] =
816 2880 : _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + i + 1));
817 2880 : in[1] =
818 2880 : _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + i + 1));
819 2880 : in[2] =
820 2880 : _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + i + 1));
821 2880 : in[3] =
822 2880 : _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i + 1));
823 :
824 2880 : transpose_64bit_4x4_avx2(in, out);
825 :
826 5760 : _mm_storel_epi64((__m128i *)(H + (i + 1) * wiener_win2 + i),
827 2880 : _mm256_extracti128_si256(out[0], 0));
828 2880 : _mm_storeu_si128((__m128i *)(H + (i + 2) * wiener_win2 + i),
829 2880 : _mm256_extracti128_si256(out[1], 0));
830 2880 : _mm256_storeu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i), out[2]);
831 2880 : _mm256_storeu_si256((__m256i *)(H + (i + 4) * wiener_win2 + i), out[3]);
832 :
833 16560 : for (int32_t j = i + 5; j < wiener_win2; j += 4) {
834 13680 : in[0] =
835 13680 : _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + j));
836 13680 : in[1] =
837 13680 : _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + j));
838 13680 : in[2] =
839 13680 : _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + j));
840 13680 : in[3] =
841 13680 : _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + j));
842 :
843 13680 : transpose_64bit_4x4_avx2(in, out);
844 :
845 13680 : _mm256_storeu_si256((__m256i *)(H + (j + 0) * wiener_win2 + i),
846 : out[0]);
847 13680 : _mm256_storeu_si256((__m256i *)(H + (j + 1) * wiener_win2 + i),
848 : out[1]);
849 13680 : _mm256_storeu_si256((__m256i *)(H + (j + 2) * wiener_win2 + i),
850 : out[2]);
851 13680 : _mm256_storeu_si256((__m256i *)(H + (j + 3) * wiener_win2 + i),
852 : out[3]);
853 : }
854 : }
855 300 : }
856 :
857 : // Transpose each 4x4 block starting from the second column, and save the needed
858 : // points only.
859 : // H[4 * k * wiener_win2 + 4 * k] on the diagonal is omitted, and must be
860 : // processed separately.
861 0 : static INLINE void div4_diagonal_copy_stats_avx2(const int32_t wiener_win2,
862 : int64_t *const H) {
863 0 : for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
864 : __m256i in[4], out[4];
865 :
866 0 : div4_4x4_avx2(wiener_win2, H + i * wiener_win2 + i + 1, in);
867 0 : transpose_64bit_4x4_avx2(in, out);
868 :
869 0 : _mm_storel_epi64((__m128i *)(H + (i + 1) * wiener_win2 + i),
870 0 : _mm256_extracti128_si256(out[0], 0));
871 0 : _mm_storeu_si128((__m128i *)(H + (i + 2) * wiener_win2 + i),
872 0 : _mm256_extracti128_si256(out[1], 0));
873 0 : _mm256_storeu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i), out[2]);
874 0 : _mm256_storeu_si256((__m256i *)(H + (i + 4) * wiener_win2 + i), out[3]);
875 :
876 0 : for (int32_t j = i + 5; j < wiener_win2; j += 4) {
877 0 : div4_4x4_avx2(wiener_win2, H + i * wiener_win2 + j, in);
878 0 : transpose_64bit_4x4_avx2(in, out);
879 :
880 0 : _mm256_storeu_si256((__m256i *)(H + (j + 0) * wiener_win2 + i),
881 : out[0]);
882 0 : _mm256_storeu_si256((__m256i *)(H + (j + 1) * wiener_win2 + i),
883 : out[1]);
884 0 : _mm256_storeu_si256((__m256i *)(H + (j + 2) * wiener_win2 + i),
885 : out[2]);
886 0 : _mm256_storeu_si256((__m256i *)(H + (j + 3) * wiener_win2 + i),
887 : out[3]);
888 : }
889 : }
890 0 : }
891 :
892 : // Transpose each 4x4 block starting from the second column, and save the needed
893 : // points only.
894 : // H[4 * k * wiener_win2 + 4 * k] on the diagonal is omitted, and must be
895 : // processed separately.
896 0 : static INLINE void div16_diagonal_copy_stats_avx2(const int32_t wiener_win2,
897 : int64_t *const H) {
898 0 : for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
899 : __m256i in[4], out[4];
900 :
901 0 : div16_4x4_avx2(wiener_win2, H + i * wiener_win2 + i + 1, in);
902 0 : transpose_64bit_4x4_avx2(in, out);
903 :
904 0 : _mm_storel_epi64((__m128i *)(H + (i + 1) * wiener_win2 + i),
905 0 : _mm256_extracti128_si256(out[0], 0));
906 0 : _mm_storeu_si128((__m128i *)(H + (i + 2) * wiener_win2 + i),
907 0 : _mm256_extracti128_si256(out[1], 0));
908 0 : _mm256_storeu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i), out[2]);
909 0 : _mm256_storeu_si256((__m256i *)(H + (i + 4) * wiener_win2 + i), out[3]);
910 :
911 0 : for (int32_t j = i + 5; j < wiener_win2; j += 4) {
912 0 : div16_4x4_avx2(wiener_win2, H + i * wiener_win2 + j, in);
913 0 : transpose_64bit_4x4_avx2(in, out);
914 :
915 0 : _mm256_storeu_si256((__m256i *)(H + (j + 0) * wiener_win2 + i),
916 : out[0]);
917 0 : _mm256_storeu_si256((__m256i *)(H + (j + 1) * wiener_win2 + i),
918 : out[1]);
919 0 : _mm256_storeu_si256((__m256i *)(H + (j + 2) * wiener_win2 + i),
920 : out[2]);
921 0 : _mm256_storeu_si256((__m256i *)(H + (j + 3) * wiener_win2 + i),
922 : out[3]);
923 : }
924 : }
925 0 : }
926 :
927 0 : static INLINE void compute_stats_win3_avx2(
928 : const int16_t *const d, const int32_t d_stride, const int16_t *const s,
929 : const int32_t s_stride, const int32_t width, const int32_t height,
930 : int64_t *const M, int64_t *const H, AomBitDepth bit_depth) {
931 0 : const int32_t wiener_win = WIENER_WIN_3TAP;
932 0 : const int32_t wiener_win2 = wiener_win * wiener_win;
933 0 : const int32_t w16 = width & ~15;
934 0 : const int32_t h4 = height & ~3;
935 0 : const int32_t h8 = height & ~7;
936 : const __m256i mask =
937 0 : _mm256_load_si256((__m256i *)(mask_16bit[width - w16]));
938 : int32_t i, j, x, y;
939 :
940 0 : if (bit_depth == AOM_BITS_8) {
941 : // Step 1: Calculate the top edge of the whole matrix, i.e., the top
942 : // edge of each triangle and square on the top row.
943 0 : j = 0;
944 : do {
945 0 : const int16_t *sT = s;
946 0 : const int16_t *dT = d;
947 0 : __m256i sumM[WIENER_WIN_3TAP] = {0};
948 0 : __m256i sumH[WIENER_WIN_3TAP] = {0};
949 :
950 0 : y = height;
951 : do {
952 0 : x = 0;
953 : do {
954 0 : const __m256i src = _mm256_load_si256((__m256i *)(sT + x));
955 0 : const __m256i dgd = _mm256_load_si256((__m256i *)(dT + x));
956 0 : stats_top_win3_avx2(
957 0 : src, dgd, dT + j + x, d_stride, sumM, sumH);
958 0 : x += 16;
959 0 : } while (x < w16);
960 :
961 0 : if (w16 != width) {
962 : const __m256i src =
963 0 : _mm256_load_si256((__m256i *)(sT + w16));
964 : const __m256i dgd =
965 0 : _mm256_load_si256((__m256i *)(dT + w16));
966 0 : const __m256i srcMask = _mm256_and_si256(src, mask);
967 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
968 0 : stats_top_win3_avx2(
969 0 : srcMask, dgdMask, dT + j + w16, d_stride, sumM, sumH);
970 : }
971 :
972 0 : sT += s_stride;
973 0 : dT += d_stride;
974 0 : } while (--y);
975 :
976 : const __m256i sM =
977 0 : hadd_four_32_to_64_avx2(sumM[0], sumM[1], sumM[2], sumM[2]);
978 0 : _mm_storeu_si128((__m128i *)(M + wiener_win * j),
979 0 : _mm256_extracti128_si256(sM, 0));
980 0 : _mm_storel_epi64((__m128i *)&M[wiener_win * j + 2],
981 0 : _mm256_extracti128_si256(sM, 1));
982 :
983 : const __m256i sH =
984 0 : hadd_four_32_to_64_avx2(sumH[0], sumH[1], sumH[2], sumH[2]);
985 : // Writing one more H on the top edge falls to the second row, so it
986 : // won't overflow.
987 0 : _mm256_storeu_si256((__m256i *)(H + wiener_win * j), sH);
988 0 : } while (++j < wiener_win);
989 :
990 : // Step 2: Calculate the left edge of each square on the top row.
991 0 : j = 1;
992 : do {
993 0 : const int16_t *dT = d;
994 0 : __m256i sumH[WIENER_WIN_3TAP - 1] = {0};
995 :
996 0 : y = height;
997 : do {
998 0 : x = 0;
999 : do {
1000 : const __m256i dgd =
1001 0 : _mm256_loadu_si256((__m256i *)(dT + j + x));
1002 0 : stats_left_win3_avx2(dgd, dT + x, d_stride, sumH);
1003 0 : x += 16;
1004 0 : } while (x < w16);
1005 :
1006 0 : if (w16 != width) {
1007 : const __m256i dgd =
1008 0 : _mm256_loadu_si256((__m256i *)(dT + j + x));
1009 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
1010 0 : stats_left_win3_avx2(dgdMask, dT + x, d_stride, sumH);
1011 : }
1012 :
1013 0 : dT += d_stride;
1014 0 : } while (--y);
1015 :
1016 0 : const __m128i sum = hadd_two_32_to_64_avx2(sumH[0], sumH[1]);
1017 0 : _mm_storel_epi64((__m128i *)&H[1 * wiener_win2 + j * wiener_win],
1018 : sum);
1019 0 : _mm_storeh_epi64((__m128i *)&H[2 * wiener_win2 + j * wiener_win],
1020 : sum);
1021 0 : } while (++j < wiener_win);
1022 : } else {
1023 0 : const int32_t numBitLeft =
1024 0 : 32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 3 /* SIMD */;
1025 0 : const int32_t hAllowed =
1026 0 : (1 << numBitLeft) / (w16 + ((w16 != width) ? 16 : 0));
1027 :
1028 : // Step 1: Calculate the top edge of the whole matrix, i.e., the top
1029 : // edge of each triangle and square on the top row.
1030 0 : j = 0;
1031 : do {
1032 0 : const int16_t *sT = s;
1033 0 : const int16_t *dT = d;
1034 0 : int32_t heightT = 0;
1035 0 : __m256i sumM[WIENER_WIN_3TAP] = {0};
1036 0 : __m256i sumH[WIENER_WIN_3TAP] = {0};
1037 :
1038 : do {
1039 0 : const int32_t hT = ((height - heightT) < hAllowed)
1040 : ? (height - heightT)
1041 : : hAllowed;
1042 0 : __m256i rowM[WIENER_WIN_3TAP] = {0};
1043 0 : __m256i rowH[WIENER_WIN_3TAP] = {0};
1044 :
1045 0 : y = hT;
1046 : do {
1047 0 : x = 0;
1048 : do {
1049 : const __m256i src =
1050 0 : _mm256_load_si256((__m256i *)(sT + x));
1051 : const __m256i dgd =
1052 0 : _mm256_load_si256((__m256i *)(dT + x));
1053 0 : stats_top_win3_avx2(
1054 0 : src, dgd, dT + j + x, d_stride, rowM, rowH);
1055 0 : x += 16;
1056 0 : } while (x < w16);
1057 :
1058 0 : if (w16 != width) {
1059 : const __m256i src =
1060 0 : _mm256_load_si256((__m256i *)(sT + w16));
1061 : const __m256i dgd =
1062 0 : _mm256_load_si256((__m256i *)(dT + w16));
1063 0 : const __m256i srcMask = _mm256_and_si256(src, mask);
1064 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
1065 0 : stats_top_win3_avx2(srcMask,
1066 : dgdMask,
1067 0 : dT + j + w16,
1068 : d_stride,
1069 : rowM,
1070 : rowH);
1071 : }
1072 :
1073 0 : sT += s_stride;
1074 0 : dT += d_stride;
1075 0 : } while (--y);
1076 :
1077 0 : add_32_to_64_avx2(rowM[0], &sumM[0]);
1078 0 : add_32_to_64_avx2(rowM[1], &sumM[1]);
1079 0 : add_32_to_64_avx2(rowM[2], &sumM[2]);
1080 0 : add_32_to_64_avx2(rowH[0], &sumH[0]);
1081 0 : add_32_to_64_avx2(rowH[1], &sumH[1]);
1082 0 : add_32_to_64_avx2(rowH[2], &sumH[2]);
1083 :
1084 0 : heightT += hT;
1085 0 : } while (heightT < height);
1086 :
1087 : const __m256i sM =
1088 0 : hadd_four_64_avx2(sumM[0], sumM[1], sumM[2], sumM[2]);
1089 0 : _mm_storeu_si128((__m128i *)(M + wiener_win * j),
1090 0 : _mm256_extracti128_si256(sM, 0));
1091 0 : _mm_storel_epi64((__m128i *)&M[wiener_win * j + 2],
1092 0 : _mm256_extracti128_si256(sM, 1));
1093 :
1094 : const __m256i sH =
1095 0 : hadd_four_64_avx2(sumH[0], sumH[1], sumH[2], sumH[2]);
1096 : // Writing one more H on the top edge falls to the second row, so it
1097 : // won't overflow.
1098 0 : _mm256_storeu_si256((__m256i *)(H + wiener_win * j), sH);
1099 0 : } while (++j < wiener_win);
1100 :
1101 : // Step 2: Calculate the left edge of each square on the top row.
1102 0 : j = 1;
1103 : do {
1104 0 : const int16_t *dT = d;
1105 0 : int32_t heightT = 0;
1106 0 : __m256i sumH[WIENER_WIN_3TAP - 1] = {0};
1107 :
1108 : do {
1109 0 : const int32_t hT = ((height - heightT) < hAllowed)
1110 : ? (height - heightT)
1111 : : hAllowed;
1112 0 : __m256i rowH[WIENER_WIN_3TAP - 1] = {0};
1113 :
1114 0 : y = hT;
1115 : do {
1116 0 : x = 0;
1117 : do {
1118 : const __m256i dgd =
1119 0 : _mm256_loadu_si256((__m256i *)(dT + j + x));
1120 0 : stats_left_win3_avx2(dgd, dT + x, d_stride, rowH);
1121 0 : x += 16;
1122 0 : } while (x < w16);
1123 :
1124 0 : if (w16 != width) {
1125 : const __m256i dgd =
1126 0 : _mm256_loadu_si256((__m256i *)(dT + j + x));
1127 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
1128 0 : stats_left_win3_avx2(dgdMask, dT + x, d_stride, rowH);
1129 : }
1130 :
1131 0 : dT += d_stride;
1132 0 : } while (--y);
1133 :
1134 0 : add_32_to_64_avx2(rowH[0], &sumH[0]);
1135 0 : add_32_to_64_avx2(rowH[1], &sumH[1]);
1136 :
1137 0 : heightT += hT;
1138 0 : } while (heightT < height);
1139 :
1140 0 : const __m128i sum = hadd_two_64_avx2(sumH[0], sumH[1]);
1141 0 : _mm_storel_epi64((__m128i *)&H[1 * wiener_win2 + j * wiener_win],
1142 : sum);
1143 0 : _mm_storeh_epi64((__m128i *)&H[2 * wiener_win2 + j * wiener_win],
1144 : sum);
1145 0 : } while (++j < wiener_win);
1146 : }
1147 :
1148 : // Step 3: Derive the top edge of each triangle along the diagonal. No
1149 : // triangle in top row.
1150 : {
1151 0 : const int16_t *dT = d;
1152 0 : __m256i dd = _mm256_setzero_si256(); // Initialize to avoid warning.
1153 0 : __m256i deltas[4] = {0};
1154 : __m256i delta;
1155 :
1156 0 : dd = _mm256_insert_epi32(dd, *(int32_t *)(dT + 0 * d_stride), 0);
1157 0 : dd =
1158 0 : _mm256_insert_epi32(dd, *(int32_t *)(dT + 0 * d_stride + width), 4);
1159 0 : dd = _mm256_insert_epi32(dd, *(int32_t *)(dT + 1 * d_stride), 1);
1160 0 : dd =
1161 0 : _mm256_insert_epi32(dd, *(int32_t *)(dT + 1 * d_stride + width), 5);
1162 :
1163 0 : if (bit_depth < AOM_BITS_12) {
1164 0 : step3_win3_avx2(&dT, d_stride, width, h4, &dd, deltas);
1165 :
1166 : // 00 00 10 10 00 00 10 10
1167 : // 01 01 11 11 01 01 11 11
1168 : // 02 02 12 12 02 02 12 12
1169 0 : deltas[0] = _mm256_hadd_epi32(
1170 : deltas[0], deltas[1]); // 00 10 01 11 00 10 01 11
1171 0 : deltas[2] = _mm256_hadd_epi32(
1172 : deltas[2], deltas[2]); // 02 12 02 12 02 12 02 12
1173 0 : const __m128i delta0 = sub_hi_lo_32_avx2(deltas[0]); // 00 10 01 11
1174 0 : const __m128i delta1 = sub_hi_lo_32_avx2(deltas[2]); // 02 12 02 12
1175 0 : delta = _mm256_inserti128_si256(_mm256_castsi128_si256(delta0),
1176 : delta1,
1177 : 1); // 00 10 01 11 02 12 02 12
1178 : } else {
1179 0 : int32_t h4T = 0;
1180 :
1181 : do {
1182 0 : __m256i deltasT[WIENER_WIN_3TAP] = {0};
1183 :
1184 0 : const int32_t hT = ((h4 - h4T) < 256) ? (h4 - h4T) : 256;
1185 :
1186 0 : step3_win3_avx2(&dT, d_stride, width, hT, &dd, deltasT);
1187 :
1188 0 : deltasT[0] = hsub_32x8_to_64x4_avx2(deltasT[0]); // 00 00 10 10
1189 0 : deltasT[1] = hsub_32x8_to_64x4_avx2(deltasT[1]); // 01 01 11 11
1190 0 : deltasT[2] = hsub_32x8_to_64x4_avx2(deltasT[2]); // 02 02 12 12
1191 0 : deltasT[0] =
1192 0 : hadd_x_64_avx2(deltasT[0], deltasT[1]); // 00 10 01 11
1193 0 : deltasT[2] =
1194 0 : hadd_x_64_avx2(deltasT[2], deltasT[2]); // 02 12 02 12
1195 0 : deltas[0] = _mm256_add_epi64(deltas[0], deltasT[0]);
1196 0 : deltas[1] = _mm256_add_epi64(deltas[1], deltasT[2]);
1197 :
1198 0 : h4T += hT;
1199 0 : } while (h4T < h4);
1200 :
1201 0 : delta = _mm256_setzero_si256();
1202 : }
1203 :
1204 0 : if (h4 != height) {
1205 : // 16-bit idx: 0, 2, 1, 3, 0, 2, 1, 3
1206 : const __m128i shf0 =
1207 0 : _mm_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7);
1208 : // 16-bit idx: 0, 2, 1, 3, 4, 6, 5, 7, 0, 2, 1, 3, 4, 6, 5, 7
1209 0 : const __m256i shf1 = _mm256_setr_epi8(0,
1210 : 1,
1211 : 4,
1212 : 5,
1213 : 2,
1214 : 3,
1215 : 6,
1216 : 7,
1217 : 8,
1218 : 9,
1219 : 12,
1220 : 13,
1221 : 10,
1222 : 11,
1223 : 14,
1224 : 15,
1225 : 0,
1226 : 1,
1227 : 4,
1228 : 5,
1229 : 2,
1230 : 3,
1231 : 6,
1232 : 7,
1233 : 8,
1234 : 9,
1235 : 12,
1236 : 13,
1237 : 10,
1238 : 11,
1239 : 14,
1240 : 15);
1241 :
1242 0 : dd = _mm256_insert_epi32(dd, *(int32_t *)(dT + 0 * d_stride), 0);
1243 0 : dd = _mm256_insert_epi32(
1244 : dd, *(int32_t *)(dT + 0 * d_stride + width), 1);
1245 0 : dd = _mm256_insert_epi32(dd, *(int32_t *)(dT + 1 * d_stride), 2);
1246 0 : dd = _mm256_insert_epi32(
1247 : dd, *(int32_t *)(dT + 1 * d_stride + width), 3);
1248 :
1249 0 : y = height - h4;
1250 : do {
1251 : __m128i t0;
1252 :
1253 : // -00s -01s 00e 01e
1254 0 : t0 = _mm_cvtsi32_si128(*(int32_t *)dT);
1255 0 : t0 = _mm_sub_epi16(_mm_setzero_si128(), t0);
1256 0 : t0 = _mm_insert_epi32(t0, *(int32_t *)(dT + width), 1);
1257 0 : t0 = _mm_shuffle_epi8(t0, shf0);
1258 : // -00s 00e -01s 01e -00s 00e -01s 01e -00s 00e -01s 01e -00s
1259 : // 00e -01s 01e
1260 : const __m256i t =
1261 0 : _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t0, 1);
1262 :
1263 : // 00s 01s 00e 01e 10s 11s 10e 11e 20s 21s 20e 21e xx xx xx xx
1264 0 : dd =
1265 0 : _mm256_insert_epi32(dd, *(int32_t *)(dT + 2 * d_stride), 4);
1266 0 : dd = _mm256_insert_epi32(
1267 : dd, *(int32_t *)(dT + 2 * d_stride + width), 5);
1268 : // 00s 00e 01s 01e 10s 10e 11s 11e 20s 20e 21e 21s xx xx xx xx
1269 0 : const __m256i ddT = _mm256_shuffle_epi8(dd, shf1);
1270 0 : madd_avx2(t, ddT, &delta);
1271 :
1272 0 : dd = _mm256_permute4x64_epi64(dd, 0x39); // right shift 8 bytes
1273 0 : dT += d_stride;
1274 0 : } while (--y);
1275 : }
1276 :
1277 : // Writing one more H on the top edge of a triangle along the diagonal
1278 : // falls to the next triangle in the same row, which would be calculated
1279 : // later, so it won't overflow.
1280 0 : if (bit_depth < AOM_BITS_12) {
1281 : // 00 01 02 02 10 11 12 12
1282 0 : delta = _mm256_permutevar8x32_epi32(
1283 : delta, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1284 :
1285 0 : update_4_stats_avx2(
1286 : H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
1287 0 : _mm256_extracti128_si256(delta, 0),
1288 0 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
1289 0 : update_4_stats_avx2(
1290 0 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
1291 0 : _mm256_extracti128_si256(delta, 1),
1292 0 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
1293 : } else {
1294 : const __m256i d0 =
1295 0 : _mm256_cvtepi32_epi64(_mm256_extracti128_si256(delta, 0));
1296 : const __m256i d1 =
1297 0 : _mm256_cvtepi32_epi64(_mm256_extracti128_si256(delta, 1));
1298 0 : deltas[0] = _mm256_add_epi64(deltas[0], d0);
1299 0 : deltas[1] = _mm256_add_epi64(deltas[1], d1);
1300 :
1301 0 : deltas[2] =
1302 0 : _mm256_unpacklo_epi64(deltas[0], deltas[1]); // 00 02 01 02
1303 0 : deltas[3] =
1304 0 : _mm256_unpackhi_epi64(deltas[0], deltas[1]); // 10 12 11 12
1305 :
1306 0 : deltas[2] =
1307 0 : _mm256_permute4x64_epi64(deltas[2], 0xD8); // 00 01 02 02
1308 0 : deltas[3] =
1309 0 : _mm256_permute4x64_epi64(deltas[3], 0xD8); // 10 11 12 12
1310 :
1311 0 : update_4_stats_highbd_avx2(
1312 : H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
1313 : deltas[2],
1314 0 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
1315 0 : update_4_stats_highbd_avx2(
1316 0 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
1317 : deltas[3],
1318 0 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
1319 : }
1320 : }
1321 :
1322 : // Step 4: Derive the top and left edge of each square. No square in top and
1323 : // bottom row.
1324 : {
1325 0 : const int16_t *dT = d;
1326 0 : __m256i deltas[2 * WIENER_WIN_3TAP - 1] = {0};
1327 : __m256i dd[WIENER_WIN_3TAP], ds[WIENER_WIN_3TAP];
1328 : __m256i se0, se1, xx, yy;
1329 : __m256i delta;
1330 0 : se0 = _mm256_setzero_si256(); // Initialize to avoid warning.
1331 :
1332 0 : y = 0;
1333 : do {
1334 : // 00s 01s 10s 11s 20s 21s 30s 31s 00e 01e 10e 11e 20e 21e 30e 31e
1335 0 : se0 = _mm256_insert_epi32(se0, *(int32_t *)(dT + 0 * d_stride), 0);
1336 0 : se0 = _mm256_insert_epi32(
1337 : se0, *(int32_t *)(dT + 0 * d_stride + width), 4);
1338 0 : se0 = _mm256_insert_epi32(se0, *(int32_t *)(dT + 1 * d_stride), 1);
1339 0 : se0 = _mm256_insert_epi32(
1340 : se0, *(int32_t *)(dT + 1 * d_stride + width), 5);
1341 0 : se0 = _mm256_insert_epi32(se0, *(int32_t *)(dT + 2 * d_stride), 2);
1342 0 : se0 = _mm256_insert_epi32(
1343 : se0, *(int32_t *)(dT + 2 * d_stride + width), 6);
1344 0 : se0 = _mm256_insert_epi32(se0, *(int32_t *)(dT + 3 * d_stride), 3);
1345 0 : se0 = _mm256_insert_epi32(
1346 : se0, *(int32_t *)(dT + 3 * d_stride + width), 7);
1347 :
1348 : // 40s 41s 50s 51s 60s 61s 70s 71s 40e 41e 50e 51e 60e 61e 70e 71e
1349 0 : se1 = _mm256_insert_epi32(se0, *(int32_t *)(dT + 4 * d_stride), 0);
1350 0 : se1 = _mm256_insert_epi32(
1351 : se1, *(int32_t *)(dT + 4 * d_stride + width), 4);
1352 0 : se1 = _mm256_insert_epi32(se1, *(int32_t *)(dT + 5 * d_stride), 1);
1353 0 : se1 = _mm256_insert_epi32(
1354 : se1, *(int32_t *)(dT + 5 * d_stride + width), 5);
1355 0 : se1 = _mm256_insert_epi32(se1, *(int32_t *)(dT + 6 * d_stride), 2);
1356 0 : se1 = _mm256_insert_epi32(
1357 : se1, *(int32_t *)(dT + 6 * d_stride + width), 6);
1358 0 : se1 = _mm256_insert_epi32(se1, *(int32_t *)(dT + 7 * d_stride), 3);
1359 0 : se1 = _mm256_insert_epi32(
1360 : se1, *(int32_t *)(dT + 7 * d_stride + width), 7);
1361 :
1362 : // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e
1363 0 : xx = _mm256_slli_epi32(se0, 16);
1364 0 : yy = _mm256_slli_epi32(se1, 16);
1365 0 : xx = _mm256_srai_epi32(xx, 16);
1366 0 : yy = _mm256_srai_epi32(yy, 16);
1367 0 : dd[0] = _mm256_packs_epi32(xx, yy);
1368 :
1369 : // 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e 61e 71e
1370 0 : se0 = _mm256_srai_epi32(se0, 16);
1371 0 : se1 = _mm256_srai_epi32(se1, 16);
1372 0 : ds[0] = _mm256_packs_epi32(se0, se1);
1373 :
1374 0 : load_more_16_avx2(dT + 8 * d_stride + 0, width, dd[0], &dd[1]);
1375 0 : load_more_16_avx2(dT + 8 * d_stride + 1, width, ds[0], &ds[1]);
1376 0 : load_more_16_avx2(dT + 9 * d_stride + 0, width, dd[1], &dd[2]);
1377 0 : load_more_16_avx2(dT + 9 * d_stride + 1, width, ds[1], &ds[2]);
1378 :
1379 0 : madd_avx2(dd[0], ds[0], &deltas[0]);
1380 0 : madd_avx2(dd[0], ds[1], &deltas[1]);
1381 0 : madd_avx2(dd[0], ds[2], &deltas[2]);
1382 0 : madd_avx2(dd[1], ds[0], &deltas[3]);
1383 0 : madd_avx2(dd[2], ds[0], &deltas[4]);
1384 :
1385 0 : dT += 8 * d_stride;
1386 0 : y += 8;
1387 0 : } while (y < h8);
1388 :
1389 0 : if (bit_depth < AOM_BITS_12) {
1390 0 : deltas[0] = _mm256_hadd_epi32(
1391 : deltas[0], deltas[1]); // T0 T0 T1 T1 T0 T0 T1 T1
1392 0 : deltas[2] = _mm256_hadd_epi32(
1393 : deltas[2], deltas[2]); // T2 T2 T2 T2 T2 T2 T2 T2
1394 0 : deltas[3] = _mm256_hadd_epi32(
1395 : deltas[3], deltas[4]); // L0 L0 L1 L1 L0 L0 L1 L1
1396 0 : deltas[0] = _mm256_hadd_epi32(
1397 : deltas[0], deltas[2]); // T0 T1 T2 T2 T0 T1 T2 T2
1398 0 : deltas[3] = _mm256_hadd_epi32(
1399 : deltas[3], deltas[3]); // L0 L1 L0 L1 L0 L1 L0 L1
1400 0 : const __m128i delta0 = sub_hi_lo_32_avx2(deltas[0]); // T0 T1 T2 T2
1401 0 : const __m128i delta1 = sub_hi_lo_32_avx2(deltas[3]); // L0 L1 L0 L1
1402 0 : delta = _mm256_inserti128_si256(
1403 : _mm256_castsi128_si256(delta0), delta1, 1);
1404 : } else {
1405 0 : deltas[0] = hsub_32x8_to_64x4_avx2(deltas[0]); // T0 T0 T0 T0
1406 0 : deltas[1] = hsub_32x8_to_64x4_avx2(deltas[1]); // T1 T1 T1 T1
1407 0 : deltas[2] = hsub_32x8_to_64x4_avx2(deltas[2]); // T2 T2 T2 T2
1408 0 : deltas[3] = hsub_32x8_to_64x4_avx2(deltas[3]); // L0 L0 L0 L0
1409 0 : deltas[4] = hsub_32x8_to_64x4_avx2(deltas[4]); // L1 L1 L1 L1
1410 0 : deltas[0] = hadd_x_64_avx2(deltas[0], deltas[1]); // T0 T0 T1 T1
1411 0 : deltas[2] = hadd_x_64_avx2(deltas[2], deltas[2]); // T2 T2 T2 T2
1412 0 : deltas[3] = hadd_x_64_avx2(deltas[3], deltas[4]); // L0 L0 L1 L1
1413 0 : deltas[0] = hadd_x_64_avx2(deltas[0], deltas[2]); // T0 T1 T2 T2
1414 0 : deltas[1] = hadd_x_64_avx2(deltas[3], deltas[3]); // L0 L1 L0 L1
1415 0 : delta = _mm256_setzero_si256();
1416 : }
1417 :
1418 0 : if (h8 != height) {
1419 0 : const __m256i perm = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
1420 :
1421 0 : ds[0] = _mm256_insert_epi16(ds[0], dT[0 * d_stride + 1], 0);
1422 0 : ds[0] = _mm256_insert_epi16(ds[0], dT[0 * d_stride + 1 + width], 1);
1423 :
1424 0 : dd[0] = _mm256_insert_epi16(dd[0], -dT[1 * d_stride], 8);
1425 0 : ds[0] = _mm256_insert_epi16(ds[0], dT[1 * d_stride + 1], 2);
1426 0 : dd[0] = _mm256_insert_epi16(dd[0], dT[1 * d_stride + width], 9);
1427 0 : ds[0] = _mm256_insert_epi16(ds[0], dT[1 * d_stride + 1 + width], 3);
1428 :
1429 : do {
1430 0 : dd[0] = _mm256_insert_epi16(dd[0], -dT[0 * d_stride], 0);
1431 0 : dd[0] = _mm256_insert_epi16(dd[0], dT[0 * d_stride + width], 1);
1432 0 : dd[0] = _mm256_unpacklo_epi32(dd[0], dd[0]);
1433 0 : dd[0] = _mm256_unpacklo_epi32(dd[0], dd[0]);
1434 :
1435 0 : ds[0] = _mm256_insert_epi16(ds[0], dT[0 * d_stride + 1], 8);
1436 0 : ds[0] = _mm256_insert_epi16(ds[0], dT[0 * d_stride + 1], 10);
1437 0 : ds[0] =
1438 0 : _mm256_insert_epi16(ds[0], dT[0 * d_stride + 1 + width], 9);
1439 0 : ds[0] = _mm256_insert_epi16(
1440 : ds[0], dT[0 * d_stride + 1 + width], 11);
1441 :
1442 0 : dd[0] = _mm256_insert_epi16(dd[0], -dT[2 * d_stride], 10);
1443 0 : ds[0] = _mm256_insert_epi16(ds[0], dT[2 * d_stride + 1], 4);
1444 0 : dd[0] =
1445 0 : _mm256_insert_epi16(dd[0], dT[2 * d_stride + width], 11);
1446 0 : ds[0] =
1447 0 : _mm256_insert_epi16(ds[0], dT[2 * d_stride + 1 + width], 5);
1448 :
1449 0 : madd_avx2(dd[0], ds[0], &delta);
1450 :
1451 : // right shift 4 bytes
1452 0 : dd[0] = _mm256_permutevar8x32_epi32(dd[0], perm);
1453 0 : ds[0] = _mm256_permutevar8x32_epi32(ds[0], perm);
1454 0 : dT += d_stride;
1455 0 : } while (++y < height);
1456 : }
1457 :
1458 : // Writing one more H on the top edge of a square falls to the next
1459 : // square in the same row or the first H in the next row, which would be
1460 : // calculated later, so it won't overflow.
1461 0 : if (bit_depth < AOM_BITS_12) {
1462 0 : update_4_stats_avx2(
1463 0 : H + 0 * wiener_win * wiener_win2 + 1 * wiener_win,
1464 0 : _mm256_extracti128_si256(delta, 0),
1465 0 : H + 1 * wiener_win * wiener_win2 + 2 * wiener_win);
1466 0 : H[(1 * wiener_win + 1) * wiener_win2 + 2 * wiener_win] =
1467 0 : H[(0 * wiener_win + 1) * wiener_win2 + 1 * wiener_win] +
1468 0 : _mm256_extract_epi32(delta, 4);
1469 0 : H[(1 * wiener_win + 2) * wiener_win2 + 2 * wiener_win] =
1470 0 : H[(0 * wiener_win + 2) * wiener_win2 + 1 * wiener_win] +
1471 0 : _mm256_extract_epi32(delta, 5);
1472 : } else {
1473 : const __m256i d0 =
1474 0 : _mm256_cvtepi32_epi64(_mm256_extracti128_si256(delta, 0));
1475 : const __m256i d1 =
1476 0 : _mm256_cvtepi32_epi64(_mm256_extracti128_si256(delta, 1));
1477 0 : deltas[0] = _mm256_add_epi64(deltas[0], d0);
1478 0 : deltas[1] = _mm256_add_epi64(deltas[1], d1);
1479 :
1480 0 : update_4_stats_highbd_avx2(
1481 0 : H + 0 * wiener_win * wiener_win2 + 1 * wiener_win,
1482 : deltas[0],
1483 0 : H + 1 * wiener_win * wiener_win2 + 2 * wiener_win);
1484 0 : H[(1 * wiener_win + 1) * wiener_win2 + 2 * wiener_win] =
1485 0 : H[(0 * wiener_win + 1) * wiener_win2 + 1 * wiener_win] +
1486 0 : _mm256_extract_epi64(deltas[1], 0);
1487 0 : H[(1 * wiener_win + 2) * wiener_win2 + 2 * wiener_win] =
1488 0 : H[(0 * wiener_win + 2) * wiener_win2 + 1 * wiener_win] +
1489 0 : _mm256_extract_epi64(deltas[1], 1);
1490 : }
1491 : }
1492 :
1493 : // Step 5: Derive other points of each square. No square in bottom row.
1494 0 : i = 0;
1495 : do {
1496 0 : const int16_t *const dI = d + i;
1497 :
1498 0 : j = i + 1;
1499 : do {
1500 0 : const int16_t *const dJ = d + j;
1501 0 : __m256i deltas[WIENER_WIN_3TAP - 1][WIENER_WIN_3TAP - 1] = {{{0}},{{0}}};
1502 : __m256i dIs[WIENER_WIN_3TAP - 1], dIe[WIENER_WIN_3TAP - 1];
1503 : __m256i dJs[WIENER_WIN_3TAP - 1], dJe[WIENER_WIN_3TAP - 1];
1504 :
1505 0 : x = 0;
1506 : do {
1507 0 : load_square_win3_avx2(
1508 0 : dI + x, dJ + x, d_stride, height, dIs, dIe, dJs, dJe);
1509 0 : derive_square_win3_avx2(dIs, dIe, dJs, dJe, deltas);
1510 :
1511 0 : x += 16;
1512 0 : } while (x < w16);
1513 :
1514 0 : if (w16 != width) {
1515 0 : load_square_win3_avx2(
1516 0 : dI + x, dJ + x, d_stride, height, dIs, dIe, dJs, dJe);
1517 :
1518 0 : dIs[0] = _mm256_and_si256(dIs[0], mask);
1519 0 : dIs[1] = _mm256_and_si256(dIs[1], mask);
1520 0 : dIe[0] = _mm256_and_si256(dIe[0], mask);
1521 0 : dIe[1] = _mm256_and_si256(dIe[1], mask);
1522 :
1523 0 : derive_square_win3_avx2(dIs, dIe, dJs, dJe, deltas);
1524 : }
1525 :
1526 : __m256i delta64;
1527 0 : if (bit_depth < AOM_BITS_12) {
1528 0 : const __m128i delta32 = hadd_four_32_avx2(
1529 : deltas[0][0], deltas[0][1], deltas[1][0], deltas[1][1]);
1530 0 : delta64 = _mm256_cvtepi32_epi64(delta32);
1531 : } else {
1532 0 : delta64 = hadd_four_31_to_64_avx2(
1533 : deltas[0][0], deltas[0][1], deltas[1][0], deltas[1][1]);
1534 : }
1535 0 : update_2_stats_sse2(
1536 0 : H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win,
1537 0 : _mm256_extracti128_si256(delta64, 0),
1538 0 : H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
1539 0 : update_2_stats_sse2(
1540 0 : H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win,
1541 0 : _mm256_extracti128_si256(delta64, 1),
1542 0 : H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
1543 0 : } while (++j < wiener_win);
1544 0 : } while (++i < wiener_win - 1);
1545 :
1546 : // Step 6: Derive other points of each upper triangle along the diagonal.
1547 0 : i = 0;
1548 : do {
1549 0 : const int16_t *const dI = d + i;
1550 0 : __m256i deltas[WIENER_WIN_3TAP * (WIENER_WIN_3TAP - 1) / 2] = {0};
1551 : __m256i dIs[WIENER_WIN_3TAP - 1], dIe[WIENER_WIN_3TAP - 1];
1552 :
1553 0 : x = 0;
1554 : do {
1555 0 : load_triangle_win3_avx2(dI + x, d_stride, height, dIs, dIe);
1556 0 : derive_triangle_win3_avx2(dIs, dIe, deltas);
1557 :
1558 0 : x += 16;
1559 0 : } while (x < w16);
1560 :
1561 0 : if (w16 != width) {
1562 0 : load_triangle_win3_avx2(dI + x, d_stride, height, dIs, dIe);
1563 :
1564 0 : dIs[0] = _mm256_and_si256(dIs[0], mask);
1565 0 : dIs[1] = _mm256_and_si256(dIs[1], mask);
1566 0 : dIe[0] = _mm256_and_si256(dIe[0], mask);
1567 0 : dIe[1] = _mm256_and_si256(dIe[1], mask);
1568 :
1569 0 : derive_triangle_win3_avx2(dIs, dIe, deltas);
1570 : }
1571 :
1572 : __m128i delta01;
1573 : int64_t delta2;
1574 :
1575 0 : if (bit_depth < AOM_BITS_12) {
1576 : const __m128i delta32 =
1577 0 : hadd_four_32_avx2(deltas[0], deltas[1], deltas[2], deltas[2]);
1578 0 : delta01 = _mm_cvtepi32_epi64(delta32);
1579 0 : delta2 = _mm_extract_epi32(delta32, 2);
1580 : } else {
1581 0 : const __m256i delta64 = hadd_four_31_to_64_avx2(
1582 : deltas[0], deltas[1], deltas[2], deltas[2]);
1583 0 : delta01 = _mm256_extracti128_si256(delta64, 0);
1584 0 : delta2 = _mm256_extract_epi64(delta64, 2);
1585 : }
1586 :
1587 0 : update_2_stats_sse2(
1588 0 : H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win,
1589 : delta01,
1590 0 : H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
1591 0 : H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2] =
1592 0 : H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1] + delta2;
1593 0 : } while (++i < wiener_win);
1594 0 : }
1595 :
1596 120 : static INLINE void compute_stats_win5_avx2(
1597 : const int16_t *const d, const int32_t d_stride, const int16_t *const s,
1598 : const int32_t s_stride, const int32_t width, const int32_t height,
1599 : int64_t *const M, int64_t *const H, AomBitDepth bit_depth) {
1600 120 : const int32_t wiener_win = WIENER_WIN_CHROMA;
1601 120 : const int32_t wiener_win2 = wiener_win * wiener_win;
1602 120 : const int32_t w16 = width & ~15;
1603 120 : const int32_t h8 = height & ~7;
1604 : const __m256i mask =
1605 120 : _mm256_load_si256((__m256i *)(mask_16bit[width - w16]));
1606 : int32_t i, j, x, y;
1607 :
1608 120 : if (bit_depth == AOM_BITS_8) {
1609 : // Step 1: Calculate the top edge of the whole matrix, i.e., the top
1610 : // edge of each triangle and square on the top row.
1611 120 : j = 0;
1612 : do {
1613 600 : const int16_t *sT = s;
1614 600 : const int16_t *dT = d;
1615 600 : __m256i sumM[WIENER_WIN_CHROMA] = {0};
1616 600 : __m256i sumH[WIENER_WIN_CHROMA] = {0};
1617 :
1618 600 : y = height;
1619 : do {
1620 108000 : x = 0;
1621 : do {
1622 2160000 : const __m256i src = _mm256_load_si256((__m256i *)(sT + x));
1623 2160000 : const __m256i dgd = _mm256_load_si256((__m256i *)(dT + x));
1624 2160000 : stats_top_win5_avx2(
1625 2160000 : src, dgd, dT + j + x, d_stride, sumM, sumH);
1626 2160000 : x += 16;
1627 2160000 : } while (x < w16);
1628 :
1629 108000 : if (w16 != width) {
1630 : const __m256i src =
1631 0 : _mm256_load_si256((__m256i *)(sT + w16));
1632 : const __m256i dgd =
1633 0 : _mm256_load_si256((__m256i *)(dT + w16));
1634 0 : const __m256i srcMask = _mm256_and_si256(src, mask);
1635 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
1636 0 : stats_top_win5_avx2(
1637 0 : srcMask, dgdMask, dT + j + w16, d_stride, sumM, sumH);
1638 : }
1639 :
1640 108000 : sT += s_stride;
1641 108000 : dT += d_stride;
1642 108000 : } while (--y);
1643 :
1644 : const __m256i sM =
1645 600 : hadd_four_32_to_64_avx2(sumM[0], sumM[1], sumM[2], sumM[3]);
1646 600 : const __m128i sMH = hadd_two_32_to_64_avx2(sumM[4], sumH[4]);
1647 600 : _mm256_storeu_si256((__m256i *)(M + wiener_win * j), sM);
1648 600 : _mm_storel_epi64((__m128i *)&M[wiener_win * j + 4], sMH);
1649 :
1650 : const __m256i sH =
1651 600 : hadd_four_32_to_64_avx2(sumH[0], sumH[1], sumH[2], sumH[3]);
1652 600 : _mm256_storeu_si256((__m256i *)(H + wiener_win * j), sH);
1653 600 : _mm_storeh_epi64((__m128i *)&H[wiener_win * j + 4], sMH);
1654 600 : } while (++j < wiener_win);
1655 :
1656 : // Step 2: Calculate the left edge of each square on the top row.
1657 120 : j = 1;
1658 : do {
1659 480 : const int16_t *dT = d;
1660 480 : __m256i sumH[WIENER_WIN_CHROMA - 1] = {0};
1661 :
1662 480 : y = height;
1663 : do {
1664 86400 : x = 0;
1665 : do {
1666 : const __m256i dgd =
1667 1728000 : _mm256_loadu_si256((__m256i *)(dT + j + x));
1668 1728000 : stats_left_win5_avx2(dgd, dT + x, d_stride, sumH);
1669 1728000 : x += 16;
1670 1728000 : } while (x < w16);
1671 :
1672 86400 : if (w16 != width) {
1673 : const __m256i dgd =
1674 0 : _mm256_loadu_si256((__m256i *)(dT + j + x));
1675 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
1676 0 : stats_left_win5_avx2(dgdMask, dT + x, d_stride, sumH);
1677 : }
1678 :
1679 86400 : dT += d_stride;
1680 86400 : } while (--y);
1681 :
1682 : const __m256i sum =
1683 480 : hadd_four_32_to_64_avx2(sumH[0], sumH[1], sumH[2], sumH[3]);
1684 960 : _mm_storel_epi64((__m128i *)&H[1 * wiener_win2 + j * wiener_win],
1685 480 : _mm256_extracti128_si256(sum, 0));
1686 480 : _mm_storeh_epi64((__m128i *)&H[2 * wiener_win2 + j * wiener_win],
1687 480 : _mm256_extracti128_si256(sum, 0));
1688 960 : _mm_storel_epi64((__m128i *)&H[3 * wiener_win2 + j * wiener_win],
1689 480 : _mm256_extracti128_si256(sum, 1));
1690 480 : _mm_storeh_epi64((__m128i *)&H[4 * wiener_win2 + j * wiener_win],
1691 480 : _mm256_extracti128_si256(sum, 1));
1692 480 : } while (++j < wiener_win);
1693 : } else {
1694 0 : const int32_t numBitLeft =
1695 0 : 32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 3 /* SIMD */;
1696 0 : const int32_t hAllowed =
1697 0 : (1 << numBitLeft) / (w16 + ((w16 != width) ? 16 : 0));
1698 :
1699 : // Step 1: Calculate the top edge of the whole matrix, i.e., the top
1700 : // edge of each triangle and square on the top row.
1701 0 : j = 0;
1702 : do {
1703 0 : const int16_t *sT = s;
1704 0 : const int16_t *dT = d;
1705 0 : int32_t heightT = 0;
1706 0 : __m256i sumM[WIENER_WIN_CHROMA] = {0};
1707 0 : __m256i sumH[WIENER_WIN_CHROMA] = {0};
1708 :
1709 : do {
1710 0 : const int32_t hT = ((height - heightT) < hAllowed)
1711 : ? (height - heightT)
1712 : : hAllowed;
1713 0 : __m256i rowM[WIENER_WIN_CHROMA] = {0};
1714 0 : __m256i rowH[WIENER_WIN_CHROMA] = {0};
1715 :
1716 0 : y = hT;
1717 : do {
1718 0 : x = 0;
1719 : do {
1720 : const __m256i src =
1721 0 : _mm256_load_si256((__m256i *)(sT + x));
1722 : const __m256i dgd =
1723 0 : _mm256_load_si256((__m256i *)(dT + x));
1724 0 : stats_top_win5_avx2(
1725 0 : src, dgd, dT + j + x, d_stride, rowM, rowH);
1726 0 : x += 16;
1727 0 : } while (x < w16);
1728 :
1729 0 : if (w16 != width) {
1730 : const __m256i src =
1731 0 : _mm256_load_si256((__m256i *)(sT + w16));
1732 : const __m256i dgd =
1733 0 : _mm256_load_si256((__m256i *)(dT + w16));
1734 0 : const __m256i srcMask = _mm256_and_si256(src, mask);
1735 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
1736 0 : stats_top_win5_avx2(srcMask,
1737 : dgdMask,
1738 0 : dT + j + w16,
1739 : d_stride,
1740 : rowM,
1741 : rowH);
1742 : }
1743 :
1744 0 : sT += s_stride;
1745 0 : dT += d_stride;
1746 0 : } while (--y);
1747 :
1748 0 : add_32_to_64_avx2(rowM[0], &sumM[0]);
1749 0 : add_32_to_64_avx2(rowM[1], &sumM[1]);
1750 0 : add_32_to_64_avx2(rowM[2], &sumM[2]);
1751 0 : add_32_to_64_avx2(rowM[3], &sumM[3]);
1752 0 : add_32_to_64_avx2(rowM[4], &sumM[4]);
1753 0 : add_32_to_64_avx2(rowH[0], &sumH[0]);
1754 0 : add_32_to_64_avx2(rowH[1], &sumH[1]);
1755 0 : add_32_to_64_avx2(rowH[2], &sumH[2]);
1756 0 : add_32_to_64_avx2(rowH[3], &sumH[3]);
1757 0 : add_32_to_64_avx2(rowH[4], &sumH[4]);
1758 :
1759 0 : heightT += hT;
1760 0 : } while (heightT < height);
1761 :
1762 : const __m256i sM =
1763 0 : hadd_four_64_avx2(sumM[0], sumM[1], sumM[2], sumM[3]);
1764 0 : const __m128i sMH = hadd_two_64_avx2(sumM[4], sumH[4]);
1765 0 : _mm256_storeu_si256((__m256i *)(M + wiener_win * j), sM);
1766 0 : M[wiener_win * j + 4] = _mm_cvtsi128_si64(sMH);
1767 :
1768 : const __m256i sH =
1769 0 : hadd_four_64_avx2(sumH[0], sumH[1], sumH[2], sumH[3]);
1770 0 : _mm256_storeu_si256((__m256i *)(H + wiener_win * j), sH);
1771 0 : _mm_storeh_epi64((__m128i *)&H[wiener_win * j + 4], sMH);
1772 0 : } while (++j < wiener_win);
1773 :
1774 : // Step 2: Calculate the left edge of each square on the top row.
1775 0 : j = 1;
1776 : do {
1777 0 : const int16_t *dT = d;
1778 0 : int32_t heightT = 0;
1779 0 : __m256i sumH[WIENER_WIN_CHROMA - 1] = {0};
1780 :
1781 : do {
1782 0 : const int32_t hT = ((height - heightT) < hAllowed)
1783 : ? (height - heightT)
1784 : : hAllowed;
1785 0 : __m256i rowH[WIENER_WIN_CHROMA - 1] = {0};
1786 :
1787 0 : y = hT;
1788 : do {
1789 0 : x = 0;
1790 : do {
1791 : const __m256i dgd =
1792 0 : _mm256_loadu_si256((__m256i *)(dT + j + x));
1793 0 : stats_left_win5_avx2(dgd, dT + x, d_stride, rowH);
1794 0 : x += 16;
1795 0 : } while (x < w16);
1796 :
1797 0 : if (w16 != width) {
1798 : const __m256i dgd =
1799 0 : _mm256_loadu_si256((__m256i *)(dT + j + x));
1800 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
1801 0 : stats_left_win5_avx2(dgdMask, dT + x, d_stride, rowH);
1802 : }
1803 :
1804 0 : dT += d_stride;
1805 0 : } while (--y);
1806 :
1807 0 : add_32_to_64_avx2(rowH[0], &sumH[0]);
1808 0 : add_32_to_64_avx2(rowH[1], &sumH[1]);
1809 0 : add_32_to_64_avx2(rowH[2], &sumH[2]);
1810 0 : add_32_to_64_avx2(rowH[3], &sumH[3]);
1811 :
1812 0 : heightT += hT;
1813 0 : } while (heightT < height);
1814 :
1815 : const __m256i sum =
1816 0 : hadd_four_64_avx2(sumH[0], sumH[1], sumH[2], sumH[3]);
1817 0 : _mm_storel_epi64((__m128i *)&H[1 * wiener_win2 + j * wiener_win],
1818 0 : _mm256_extracti128_si256(sum, 0));
1819 0 : _mm_storeh_epi64((__m128i *)&H[2 * wiener_win2 + j * wiener_win],
1820 0 : _mm256_extracti128_si256(sum, 0));
1821 0 : _mm_storel_epi64((__m128i *)&H[3 * wiener_win2 + j * wiener_win],
1822 0 : _mm256_extracti128_si256(sum, 1));
1823 0 : _mm_storeh_epi64((__m128i *)&H[4 * wiener_win2 + j * wiener_win],
1824 0 : _mm256_extracti128_si256(sum, 1));
1825 0 : } while (++j < wiener_win);
1826 : }
1827 :
1828 : // Step 3: Derive the top edge of each triangle along the diagonal. No
1829 : // triangle in top row.
1830 : {
1831 120 : const int16_t *dT = d;
1832 : // 16-bit idx: 0, 4, 1, 5, 2, 6, 3, 7
1833 120 : const __m256i shf = _mm256_setr_epi8(0,
1834 : 1,
1835 : 8,
1836 : 9,
1837 : 2,
1838 : 3,
1839 : 10,
1840 : 11,
1841 : 4,
1842 : 5,
1843 : 12,
1844 : 13,
1845 : 6,
1846 : 7,
1847 : 14,
1848 : 15,
1849 : 0,
1850 : 1,
1851 : 8,
1852 : 9,
1853 : 2,
1854 : 3,
1855 : 10,
1856 : 11,
1857 : 4,
1858 : 5,
1859 : 12,
1860 : 13,
1861 : 6,
1862 : 7,
1863 : 14,
1864 : 15);
1865 120 : __m256i deltas[WIENER_WIN_CHROMA] = {0};
1866 120 : __m256i dd = _mm256_setzero_si256(); // Initialize to avoid warning.
1867 : __m256i ds[WIENER_WIN_CHROMA];
1868 :
1869 : // 00s 01s 02s 03s 10s 11s 12s 13s 00e 01e 02e 03e 10e 11e 12e 13e
1870 120 : dd = _mm256_insert_epi64(dd, *(int64_t *)(dT + 0 * d_stride), 0);
1871 120 : dd =
1872 120 : _mm256_insert_epi64(dd, *(int64_t *)(dT + 0 * d_stride + width), 2);
1873 120 : dd = _mm256_insert_epi64(dd, *(int64_t *)(dT + 1 * d_stride), 1);
1874 120 : dd =
1875 120 : _mm256_insert_epi64(dd, *(int64_t *)(dT + 1 * d_stride + width), 3);
1876 : // 00s 10s 01s 11s 02s 12s 03s 13s 00e 10e 01e 11e 02e 12e 03e 13e
1877 120 : ds[0] = _mm256_shuffle_epi8(dd, shf);
1878 :
1879 : // 10s 11s 12s 13s 20s 21s 22s 23s 10e 11e 12e 13e 20e 21e 22e 23e
1880 120 : load_more_64_avx2(dT + 2 * d_stride, width, &dd);
1881 : // 10s 20s 11s 21s 12s 22s 13s 23s 10e 20e 11e 21e 12e 22e 13e 23e
1882 120 : ds[1] = _mm256_shuffle_epi8(dd, shf);
1883 :
1884 : // 20s 21s 22s 23s 30s 31s 32s 33s 20e 21e 22e 23e 30e 31e 32e 33e
1885 120 : load_more_64_avx2(dT + 3 * d_stride, width, &dd);
1886 : // 20s 30s 21s 31s 22s 32s 23s 33s 20e 30e 21e 31e 22e 32e 23e 33e
1887 120 : ds[2] = _mm256_shuffle_epi8(dd, shf);
1888 :
1889 120 : if (bit_depth < AOM_BITS_12) {
1890 : __m128i dlts[WIENER_WIN_CHROMA];
1891 :
1892 120 : step3_win5_avx2(&dT, d_stride, width, height, &dd, ds, deltas);
1893 :
1894 120 : dlts[0] = sub_hi_lo_32_avx2(deltas[0]);
1895 120 : dlts[1] = sub_hi_lo_32_avx2(deltas[1]);
1896 120 : dlts[2] = sub_hi_lo_32_avx2(deltas[2]);
1897 120 : dlts[3] = sub_hi_lo_32_avx2(deltas[3]);
1898 120 : dlts[4] = sub_hi_lo_32_avx2(deltas[4]);
1899 :
1900 120 : transpose_32bit_4x4(dlts, dlts);
1901 120 : deltas[4] = _mm256_cvtepi32_epi64(dlts[4]);
1902 :
1903 120 : update_5_stats_avx2(
1904 : H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
1905 : dlts[0],
1906 120 : _mm256_extract_epi64(deltas[4], 0),
1907 120 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
1908 :
1909 120 : update_5_stats_avx2(
1910 120 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
1911 : dlts[1],
1912 120 : _mm256_extract_epi64(deltas[4], 1),
1913 120 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
1914 :
1915 120 : update_5_stats_avx2(
1916 120 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
1917 : dlts[2],
1918 120 : _mm256_extract_epi64(deltas[4], 2),
1919 120 : H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
1920 :
1921 120 : update_5_stats_avx2(
1922 120 : H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
1923 : dlts[3],
1924 120 : _mm256_extract_epi64(deltas[4], 3),
1925 120 : H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
1926 : } else {
1927 0 : int32_t heightT = 0;
1928 :
1929 : do {
1930 0 : __m256i deltasT[WIENER_WIN_CHROMA] = {0};
1931 0 : const int32_t hT =
1932 0 : ((height - heightT) < 128) ? (height - heightT) : 128;
1933 :
1934 0 : step3_win5_avx2(&dT, d_stride, width, hT, &dd, ds, deltasT);
1935 :
1936 0 : deltasT[0] = hsub_32x8_to_64x4_avx2(deltasT[0]);
1937 0 : deltasT[1] = hsub_32x8_to_64x4_avx2(deltasT[1]);
1938 0 : deltasT[2] = hsub_32x8_to_64x4_avx2(deltasT[2]);
1939 0 : deltasT[3] = hsub_32x8_to_64x4_avx2(deltasT[3]);
1940 0 : deltasT[4] = hsub_32x8_to_64x4_avx2(deltasT[4]);
1941 0 : deltas[0] = _mm256_add_epi64(deltas[0], deltasT[0]);
1942 0 : deltas[1] = _mm256_add_epi64(deltas[1], deltasT[1]);
1943 0 : deltas[2] = _mm256_add_epi64(deltas[2], deltasT[2]);
1944 0 : deltas[3] = _mm256_add_epi64(deltas[3], deltasT[3]);
1945 0 : deltas[4] = _mm256_add_epi64(deltas[4], deltasT[4]);
1946 :
1947 0 : heightT += hT;
1948 0 : } while (heightT < height);
1949 :
1950 0 : transpose_64bit_4x4_avx2(deltas, deltas);
1951 :
1952 0 : update_5_stats_highbd_avx2(
1953 : H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
1954 : deltas[0],
1955 0 : _mm256_extract_epi64(deltas[4], 0),
1956 0 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
1957 :
1958 0 : update_5_stats_highbd_avx2(
1959 0 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
1960 : deltas[1],
1961 0 : _mm256_extract_epi64(deltas[4], 1),
1962 0 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
1963 :
1964 0 : update_5_stats_highbd_avx2(
1965 0 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
1966 : deltas[2],
1967 0 : _mm256_extract_epi64(deltas[4], 2),
1968 0 : H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
1969 :
1970 0 : update_5_stats_highbd_avx2(
1971 0 : H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
1972 : deltas[3],
1973 0 : _mm256_extract_epi64(deltas[4], 3),
1974 0 : H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
1975 : }
1976 : }
1977 :
1978 : // Step 4: Derive the top and left edge of each square. No square in top and
1979 : // bottom row.
1980 120 : i = 1;
1981 : do {
1982 360 : j = i + 1;
1983 : do {
1984 720 : const int16_t *dI = d + i - 1;
1985 720 : const int16_t *dJ = d + j - 1;
1986 : __m128i delta128, delta4;
1987 : __m256i delta;
1988 720 : __m256i deltas[2 * WIENER_WIN_CHROMA - 1] = {0};
1989 : __m256i dd[WIENER_WIN_CHROMA], ds[WIENER_WIN_CHROMA];
1990 :
1991 720 : dd[0] = _mm256_setzero_si256(); // Initialize to avoid warning.
1992 720 : ds[0] = _mm256_setzero_si256(); // Initialize to avoid warning.
1993 :
1994 720 : dd[0] = _mm256_insert_epi16(dd[0], dI[0 * d_stride], 0);
1995 720 : dd[0] = _mm256_insert_epi16(dd[0], dI[0 * d_stride + width], 8);
1996 720 : dd[0] = _mm256_insert_epi16(dd[0], dI[1 * d_stride], 1);
1997 720 : dd[0] = _mm256_insert_epi16(dd[0], dI[1 * d_stride + width], 9);
1998 720 : dd[0] = _mm256_insert_epi16(dd[0], dI[2 * d_stride], 2);
1999 720 : dd[0] = _mm256_insert_epi16(dd[0], dI[2 * d_stride + width], 10);
2000 720 : dd[0] = _mm256_insert_epi16(dd[0], dI[3 * d_stride], 3);
2001 720 : dd[0] = _mm256_insert_epi16(dd[0], dI[3 * d_stride + width], 11);
2002 :
2003 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[0 * d_stride], 0);
2004 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[0 * d_stride + width], 8);
2005 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[1 * d_stride], 1);
2006 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[1 * d_stride + width], 9);
2007 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[2 * d_stride], 2);
2008 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[2 * d_stride + width], 10);
2009 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[3 * d_stride], 3);
2010 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[3 * d_stride + width], 11);
2011 :
2012 720 : y = 0;
2013 : do {
2014 : // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e
2015 : // 70e
2016 15840 : dd[0] = _mm256_insert_epi16(dd[0], dI[4 * d_stride], 4);
2017 15840 : dd[0] =
2018 15840 : _mm256_insert_epi16(dd[0], dI[4 * d_stride + width], 12);
2019 15840 : dd[0] = _mm256_insert_epi16(dd[0], dI[5 * d_stride], 5);
2020 15840 : dd[0] =
2021 15840 : _mm256_insert_epi16(dd[0], dI[5 * d_stride + width], 13);
2022 15840 : dd[0] = _mm256_insert_epi16(dd[0], dI[6 * d_stride], 6);
2023 15840 : dd[0] =
2024 15840 : _mm256_insert_epi16(dd[0], dI[6 * d_stride + width], 14);
2025 15840 : dd[0] = _mm256_insert_epi16(dd[0], dI[7 * d_stride], 7);
2026 15840 : dd[0] =
2027 15840 : _mm256_insert_epi16(dd[0], dI[7 * d_stride + width], 15);
2028 :
2029 : // 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e 61e
2030 : // 71e
2031 15840 : ds[0] = _mm256_insert_epi16(ds[0], dJ[4 * d_stride], 4);
2032 15840 : ds[0] =
2033 15840 : _mm256_insert_epi16(ds[0], dJ[4 * d_stride + width], 12);
2034 15840 : ds[0] = _mm256_insert_epi16(ds[0], dJ[5 * d_stride], 5);
2035 15840 : ds[0] =
2036 15840 : _mm256_insert_epi16(ds[0], dJ[5 * d_stride + width], 13);
2037 15840 : ds[0] = _mm256_insert_epi16(ds[0], dJ[6 * d_stride], 6);
2038 15840 : ds[0] =
2039 15840 : _mm256_insert_epi16(ds[0], dJ[6 * d_stride + width], 14);
2040 15840 : ds[0] = _mm256_insert_epi16(ds[0], dJ[7 * d_stride], 7);
2041 15840 : ds[0] =
2042 15840 : _mm256_insert_epi16(ds[0], dJ[7 * d_stride + width], 15);
2043 :
2044 15840 : load_more_16_avx2(dI + 8 * d_stride, width, dd[0], &dd[1]);
2045 15840 : load_more_16_avx2(dJ + 8 * d_stride, width, ds[0], &ds[1]);
2046 15840 : load_more_16_avx2(dI + 9 * d_stride, width, dd[1], &dd[2]);
2047 15840 : load_more_16_avx2(dJ + 9 * d_stride, width, ds[1], &ds[2]);
2048 15840 : load_more_16_avx2(dI + 10 * d_stride, width, dd[2], &dd[3]);
2049 15840 : load_more_16_avx2(dJ + 10 * d_stride, width, ds[2], &ds[3]);
2050 15840 : load_more_16_avx2(dI + 11 * d_stride, width, dd[3], &dd[4]);
2051 15840 : load_more_16_avx2(dJ + 11 * d_stride, width, ds[3], &ds[4]);
2052 :
2053 15840 : madd_avx2(dd[0], ds[0], &deltas[0]);
2054 15840 : madd_avx2(dd[0], ds[1], &deltas[1]);
2055 15840 : madd_avx2(dd[0], ds[2], &deltas[2]);
2056 15840 : madd_avx2(dd[0], ds[3], &deltas[3]);
2057 15840 : madd_avx2(dd[0], ds[4], &deltas[4]);
2058 15840 : madd_avx2(dd[1], ds[0], &deltas[5]);
2059 15840 : madd_avx2(dd[2], ds[0], &deltas[6]);
2060 15840 : madd_avx2(dd[3], ds[0], &deltas[7]);
2061 15840 : madd_avx2(dd[4], ds[0], &deltas[8]);
2062 :
2063 15840 : dd[0] = _mm256_srli_si256(dd[4], 8);
2064 15840 : ds[0] = _mm256_srli_si256(ds[4], 8);
2065 15840 : dI += 8 * d_stride;
2066 15840 : dJ += 8 * d_stride;
2067 15840 : y += 8;
2068 15840 : } while (y < h8);
2069 :
2070 720 : if (bit_depth < AOM_BITS_12) {
2071 720 : deltas[0] = _mm256_hadd_epi32(deltas[0], deltas[1]);
2072 720 : deltas[2] = _mm256_hadd_epi32(deltas[2], deltas[3]);
2073 720 : deltas[4] = _mm256_hadd_epi32(deltas[4], deltas[4]);
2074 720 : deltas[5] = _mm256_hadd_epi32(deltas[5], deltas[6]);
2075 720 : deltas[7] = _mm256_hadd_epi32(deltas[7], deltas[8]);
2076 720 : deltas[0] = _mm256_hadd_epi32(deltas[0], deltas[2]);
2077 720 : deltas[4] = _mm256_hadd_epi32(deltas[4], deltas[4]);
2078 720 : deltas[5] = _mm256_hadd_epi32(deltas[5], deltas[7]);
2079 720 : const __m128i delta0 = sub_hi_lo_32_avx2(deltas[0]);
2080 720 : const __m128i delta1 = sub_hi_lo_32_avx2(deltas[4]);
2081 720 : delta128 = sub_hi_lo_32_avx2(deltas[5]);
2082 720 : delta = _mm256_inserti128_si256(
2083 : _mm256_castsi128_si256(delta0), delta1, 1);
2084 : } else {
2085 0 : deltas[0] = hsub_32x8_to_64x4_avx2(deltas[0]);
2086 0 : deltas[1] = hsub_32x8_to_64x4_avx2(deltas[1]);
2087 0 : deltas[2] = hsub_32x8_to_64x4_avx2(deltas[2]);
2088 0 : deltas[3] = hsub_32x8_to_64x4_avx2(deltas[3]);
2089 0 : deltas[4] = hsub_32x8_to_64x4_avx2(deltas[4]);
2090 0 : deltas[5] = hsub_32x8_to_64x4_avx2(deltas[5]);
2091 0 : deltas[6] = hsub_32x8_to_64x4_avx2(deltas[6]);
2092 0 : deltas[7] = hsub_32x8_to_64x4_avx2(deltas[7]);
2093 0 : deltas[8] = hsub_32x8_to_64x4_avx2(deltas[8]);
2094 :
2095 0 : transpose_64bit_4x4_avx2(deltas + 0, deltas + 0);
2096 0 : transpose_64bit_4x4_avx2(deltas + 5, deltas + 5);
2097 :
2098 0 : deltas[0] = _mm256_add_epi64(deltas[0], deltas[1]);
2099 0 : deltas[2] = _mm256_add_epi64(deltas[2], deltas[3]);
2100 0 : deltas[0] = _mm256_add_epi64(deltas[0], deltas[2]);
2101 0 : deltas[5] = _mm256_add_epi64(deltas[5], deltas[6]);
2102 0 : deltas[7] = _mm256_add_epi64(deltas[7], deltas[8]);
2103 0 : deltas[5] = _mm256_add_epi64(deltas[5], deltas[7]);
2104 0 : delta4 = hadd_64_avx2(deltas[4]);
2105 0 : delta128 = _mm_setzero_si128();
2106 0 : delta = _mm256_setzero_si256();
2107 : }
2108 :
2109 720 : if (h8 != height) {
2110 720 : const __m256i perm = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
2111 : __m128i dd128, ds128;
2112 :
2113 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[0 * d_stride], 0);
2114 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[0 * d_stride + width], 1);
2115 :
2116 720 : dd128 = _mm_cvtsi32_si128(-dI[1 * d_stride]);
2117 720 : dd128 = _mm_insert_epi16(dd128, dI[1 * d_stride + width], 1);
2118 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[1 * d_stride], 2);
2119 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[1 * d_stride + width], 3);
2120 :
2121 720 : dd128 = _mm_insert_epi16(dd128, -dI[2 * d_stride], 2);
2122 720 : dd128 = _mm_insert_epi16(dd128, dI[2 * d_stride + width], 3);
2123 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[2 * d_stride], 4);
2124 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[2 * d_stride + width], 5);
2125 :
2126 720 : dd128 = _mm_insert_epi16(dd128, -dI[3 * d_stride], 4);
2127 720 : dd128 = _mm_insert_epi16(dd128, dI[3 * d_stride + width], 5);
2128 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[3 * d_stride], 6);
2129 720 : ds[0] = _mm256_insert_epi16(ds[0], dJ[3 * d_stride + width], 7);
2130 :
2131 : do {
2132 : __m128i t;
2133 :
2134 2880 : t = _mm_cvtsi32_si128(-dI[0 * d_stride]);
2135 2880 : t = _mm_insert_epi16(t, dI[0 * d_stride + width], 1);
2136 2880 : dd[0] = _mm256_broadcastd_epi32(t);
2137 :
2138 2880 : ds128 = _mm_cvtsi32_si128(dJ[0 * d_stride]);
2139 2880 : ds128 =
2140 2880 : _mm_insert_epi16(ds128, dJ[0 * d_stride + width], 1);
2141 2880 : ds128 = _mm_unpacklo_epi32(ds128, ds128);
2142 2880 : ds128 = _mm_unpacklo_epi32(ds128, ds128);
2143 :
2144 2880 : dd128 = _mm_insert_epi16(dd128, -dI[4 * d_stride], 6);
2145 2880 : dd128 =
2146 2880 : _mm_insert_epi16(dd128, dI[4 * d_stride + width], 7);
2147 2880 : ds[0] = _mm256_insert_epi16(ds[0], dJ[4 * d_stride], 8);
2148 2880 : ds[0] =
2149 2880 : _mm256_insert_epi16(ds[0], dJ[4 * d_stride + width], 9);
2150 :
2151 2880 : madd_avx2(dd[0], ds[0], &delta);
2152 2880 : madd_sse2(dd128, ds128, &delta128);
2153 :
2154 : // right shift 4 bytes
2155 2880 : ds[0] = _mm256_permutevar8x32_epi32(ds[0], perm);
2156 2880 : dd128 = _mm_srli_si128(dd128, 4);
2157 2880 : dI += d_stride;
2158 2880 : dJ += d_stride;
2159 2880 : } while (++y < height);
2160 : }
2161 :
2162 720 : if (bit_depth < AOM_BITS_12) {
2163 720 : update_4_stats_avx2(
2164 720 : H + (i - 1) * wiener_win * wiener_win2 +
2165 720 : (j - 1) * wiener_win,
2166 720 : _mm256_extracti128_si256(delta, 0),
2167 720 : H + i * wiener_win * wiener_win2 + j * wiener_win);
2168 720 : H[i * wiener_win * wiener_win2 + j * wiener_win + 4] =
2169 720 : H[(i - 1) * wiener_win * wiener_win2 +
2170 720 : (j - 1) * wiener_win + 4] +
2171 720 : _mm256_extract_epi32(delta, 4);
2172 :
2173 720 : H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
2174 720 : H[((i - 1) * wiener_win + 1) * wiener_win2 +
2175 720 : (j - 1) * wiener_win] +
2176 720 : _mm_extract_epi32(delta128, 0);
2177 720 : H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
2178 720 : H[((i - 1) * wiener_win + 2) * wiener_win2 +
2179 720 : (j - 1) * wiener_win] +
2180 720 : _mm_extract_epi32(delta128, 1);
2181 720 : H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
2182 720 : H[((i - 1) * wiener_win + 3) * wiener_win2 +
2183 720 : (j - 1) * wiener_win] +
2184 720 : _mm_extract_epi32(delta128, 2);
2185 720 : H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
2186 720 : H[((i - 1) * wiener_win + 4) * wiener_win2 +
2187 720 : (j - 1) * wiener_win] +
2188 720 : _mm_extract_epi32(delta128, 3);
2189 : } else {
2190 : const __m256i d0 =
2191 0 : _mm256_cvtepi32_epi64(_mm256_extracti128_si256(delta, 0));
2192 : const __m128i d1 =
2193 0 : _mm_cvtepi32_epi64(_mm256_extracti128_si256(delta, 1));
2194 0 : const __m256i d2 = _mm256_cvtepi32_epi64(delta128);
2195 0 : deltas[0] = _mm256_add_epi64(deltas[0], d0);
2196 0 : delta4 = _mm_add_epi64(delta4, d1);
2197 0 : deltas[5] = _mm256_add_epi64(deltas[5], d2);
2198 :
2199 0 : update_4_stats_highbd_avx2(
2200 0 : H + (i - 1) * wiener_win * wiener_win2 +
2201 0 : (j - 1) * wiener_win,
2202 : deltas[0],
2203 0 : H + i * wiener_win * wiener_win2 + j * wiener_win);
2204 0 : H[i * wiener_win * wiener_win2 + j * wiener_win + 4] =
2205 0 : H[(i - 1) * wiener_win * wiener_win2 +
2206 0 : (j - 1) * wiener_win + 4] +
2207 0 : _mm_extract_epi64(delta4, 0);
2208 :
2209 0 : H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
2210 0 : H[((i - 1) * wiener_win + 1) * wiener_win2 +
2211 0 : (j - 1) * wiener_win] +
2212 0 : _mm256_extract_epi64(deltas[5], 0);
2213 0 : H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
2214 0 : H[((i - 1) * wiener_win + 2) * wiener_win2 +
2215 0 : (j - 1) * wiener_win] +
2216 0 : _mm256_extract_epi64(deltas[5], 1);
2217 0 : H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
2218 0 : H[((i - 1) * wiener_win + 3) * wiener_win2 +
2219 0 : (j - 1) * wiener_win] +
2220 0 : _mm256_extract_epi64(deltas[5], 2);
2221 0 : H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
2222 0 : H[((i - 1) * wiener_win + 4) * wiener_win2 +
2223 0 : (j - 1) * wiener_win] +
2224 0 : _mm256_extract_epi64(deltas[5], 3);
2225 : }
2226 720 : } while (++j < wiener_win);
2227 360 : } while (++i < wiener_win - 1);
2228 :
2229 : // Step 5: Derive other points of each square. No square in bottom row.
2230 120 : i = 0;
2231 : do {
2232 480 : const int16_t *const dI = d + i;
2233 :
2234 480 : j = i + 1;
2235 : do {
2236 1200 : const int16_t *const dJ = d + j;
2237 1200 : __m256i deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = {{{0}},{{0}}};
2238 : __m256i dIs[WIENER_WIN_CHROMA - 1], dIe[WIENER_WIN_CHROMA - 1];
2239 : __m256i dJs[WIENER_WIN_CHROMA - 1], dJe[WIENER_WIN_CHROMA - 1];
2240 :
2241 1200 : x = 0;
2242 : do {
2243 24000 : load_square_win5_avx2(
2244 24000 : dI + x, dJ + x, d_stride, height, dIs, dIe, dJs, dJe);
2245 24000 : derive_square_win5_avx2(dIs, dIe, dJs, dJe, deltas);
2246 :
2247 24000 : x += 16;
2248 24000 : } while (x < w16);
2249 :
2250 1200 : if (w16 != width) {
2251 0 : load_square_win5_avx2(
2252 0 : dI + x, dJ + x, d_stride, height, dIs, dIe, dJs, dJe);
2253 :
2254 0 : dIs[0] = _mm256_and_si256(dIs[0], mask);
2255 0 : dIs[1] = _mm256_and_si256(dIs[1], mask);
2256 0 : dIs[2] = _mm256_and_si256(dIs[2], mask);
2257 0 : dIs[3] = _mm256_and_si256(dIs[3], mask);
2258 0 : dIe[0] = _mm256_and_si256(dIe[0], mask);
2259 0 : dIe[1] = _mm256_and_si256(dIe[1], mask);
2260 0 : dIe[2] = _mm256_and_si256(dIe[2], mask);
2261 0 : dIe[3] = _mm256_and_si256(dIe[3], mask);
2262 :
2263 0 : derive_square_win5_avx2(dIs, dIe, dJs, dJe, deltas);
2264 : }
2265 :
2266 1200 : if (bit_depth < AOM_BITS_12) {
2267 1200 : hadd_update_4_stats_avx2(
2268 1200 : H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win,
2269 : deltas[0],
2270 1200 : H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win +
2271 : 1);
2272 1200 : hadd_update_4_stats_avx2(
2273 1200 : H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win,
2274 : deltas[1],
2275 1200 : H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win +
2276 : 1);
2277 1200 : hadd_update_4_stats_avx2(
2278 1200 : H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win,
2279 : deltas[2],
2280 1200 : H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win +
2281 : 1);
2282 1200 : hadd_update_4_stats_avx2(
2283 1200 : H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win,
2284 : deltas[3],
2285 1200 : H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win +
2286 : 1);
2287 : } else {
2288 0 : hadd_update_4_stats_highbd_avx2(
2289 0 : H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win,
2290 : deltas[0],
2291 0 : H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win +
2292 : 1);
2293 0 : hadd_update_4_stats_highbd_avx2(
2294 0 : H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win,
2295 : deltas[1],
2296 0 : H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win +
2297 : 1);
2298 0 : hadd_update_4_stats_highbd_avx2(
2299 0 : H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win,
2300 : deltas[2],
2301 0 : H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win +
2302 : 1);
2303 0 : hadd_update_4_stats_highbd_avx2(
2304 0 : H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win,
2305 : deltas[3],
2306 0 : H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win +
2307 : 1);
2308 : }
2309 1200 : } while (++j < wiener_win);
2310 480 : } while (++i < wiener_win - 1);
2311 :
2312 : // Step 6: Derive other points of each upper triangle along the diagonal.
2313 120 : i = 0;
2314 : do {
2315 600 : const int16_t *const dI = d + i;
2316 600 : __m256i deltas[WIENER_WIN_CHROMA * (WIENER_WIN_CHROMA - 1) / 2] = {0};
2317 : __m256i dIs[WIENER_WIN_CHROMA - 1], dIe[WIENER_WIN_CHROMA - 1];
2318 :
2319 600 : x = 0;
2320 : do {
2321 12000 : load_triangle_win5_avx2(dI + x, d_stride, height, dIs, dIe);
2322 12000 : derive_triangle_win5_avx2(dIs, dIe, deltas);
2323 :
2324 12000 : x += 16;
2325 12000 : } while (x < w16);
2326 :
2327 600 : if (w16 != width) {
2328 0 : load_triangle_win5_avx2(dI + x, d_stride, height, dIs, dIe);
2329 :
2330 0 : dIs[0] = _mm256_and_si256(dIs[0], mask);
2331 0 : dIs[1] = _mm256_and_si256(dIs[1], mask);
2332 0 : dIs[2] = _mm256_and_si256(dIs[2], mask);
2333 0 : dIs[3] = _mm256_and_si256(dIs[3], mask);
2334 0 : dIe[0] = _mm256_and_si256(dIe[0], mask);
2335 0 : dIe[1] = _mm256_and_si256(dIe[1], mask);
2336 0 : dIe[2] = _mm256_and_si256(dIe[2], mask);
2337 0 : dIe[3] = _mm256_and_si256(dIe[3], mask);
2338 :
2339 0 : derive_triangle_win5_avx2(dIs, dIe, deltas);
2340 : }
2341 :
2342 600 : if (bit_depth < AOM_BITS_12) {
2343 600 : hadd_update_4_stats_avx2(
2344 600 : H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win,
2345 : deltas,
2346 600 : H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
2347 :
2348 : const __m128i delta32 =
2349 600 : hadd_four_32_avx2(deltas[4], deltas[5], deltas[6], deltas[9]);
2350 600 : const __m128i delta64 = _mm_cvtepi32_epi64(delta32);
2351 :
2352 600 : update_2_stats_sse2(
2353 600 : H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1,
2354 : delta64,
2355 600 : H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
2356 600 : H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] =
2357 600 : H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] +
2358 600 : _mm_extract_epi32(delta32, 2);
2359 :
2360 600 : const __m128i d32 = hadd_two_32_avx2(deltas[7], deltas[8]);
2361 600 : const __m128i d64 = _mm_cvtepi32_epi64(d32);
2362 :
2363 600 : update_2_stats_sse2(
2364 600 : H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
2365 : d64,
2366 600 : H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
2367 600 : H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] =
2368 600 : H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] +
2369 600 : _mm_extract_epi32(delta32, 3);
2370 : } else {
2371 0 : hadd_update_4_stats_highbd_avx2(
2372 0 : H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win,
2373 : deltas,
2374 0 : H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
2375 :
2376 0 : const __m256i delta64 = hadd_four_31_to_64_avx2(
2377 : deltas[4], deltas[5], deltas[6], deltas[9]);
2378 :
2379 0 : update_2_stats_sse2(
2380 0 : H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1,
2381 0 : _mm256_extracti128_si256(delta64, 0),
2382 0 : H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
2383 0 : H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] =
2384 0 : H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] +
2385 0 : _mm256_extract_epi64(delta64, 2);
2386 :
2387 0 : const __m128i d64 = hadd_two_31_to_64_avx2(deltas[7], deltas[8]);
2388 :
2389 0 : update_2_stats_sse2(
2390 0 : H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
2391 : d64,
2392 0 : H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
2393 0 : H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] =
2394 0 : H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] +
2395 0 : _mm256_extract_epi64(delta64, 3);
2396 : }
2397 600 : } while (++i < wiener_win);
2398 120 : }
2399 :
2400 180 : static INLINE void compute_stats_win7_avx2(
2401 : const int16_t *const d, const int32_t d_stride, const int16_t *const s,
2402 : const int32_t s_stride, const int32_t width, const int32_t height,
2403 : int64_t *const M, int64_t *const H, AomBitDepth bit_depth) {
2404 180 : const int32_t wiener_win = WIENER_WIN;
2405 180 : const int32_t wiener_win2 = wiener_win * wiener_win;
2406 180 : const int32_t w16 = width & ~15;
2407 180 : const int32_t h8 = height & ~7;
2408 : const __m256i mask =
2409 180 : _mm256_load_si256((__m256i *)(mask_16bit[width - w16]));
2410 : int32_t i, j, x, y;
2411 :
2412 180 : if (bit_depth == AOM_BITS_8) {
2413 : // Step 1: Calculate the top edge of the whole matrix, i.e., the top
2414 : // edge of each triangle and square on the top row.
2415 180 : j = 0;
2416 : do {
2417 1260 : const int16_t *sT = s;
2418 1260 : const int16_t *dT = d;
2419 1260 : __m256i sumM[WIENER_WIN] = {0};
2420 1260 : __m256i sumH[WIENER_WIN] = {0};
2421 :
2422 1260 : y = height;
2423 : do {
2424 453600 : x = 0;
2425 : do {
2426 6048000 : const __m256i src = _mm256_load_si256((__m256i *)(sT + x));
2427 6048000 : const __m256i dgd = _mm256_load_si256((__m256i *)(dT + x));
2428 6048000 : stats_top_win7_avx2(
2429 6048000 : src, dgd, dT + j + x, d_stride, sumM, sumH);
2430 6048000 : x += 16;
2431 6048000 : } while (x < w16);
2432 :
2433 453600 : if (w16 != width) {
2434 : const __m256i src =
2435 0 : _mm256_load_si256((__m256i *)(sT + w16));
2436 : const __m256i dgd =
2437 0 : _mm256_load_si256((__m256i *)(dT + w16));
2438 0 : const __m256i srcMask = _mm256_and_si256(src, mask);
2439 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
2440 0 : stats_top_win7_avx2(
2441 0 : srcMask, dgdMask, dT + j + w16, d_stride, sumM, sumH);
2442 : }
2443 :
2444 453600 : sT += s_stride;
2445 453600 : dT += d_stride;
2446 453600 : } while (--y);
2447 :
2448 : const __m256i sM0 =
2449 1260 : hadd_four_32_to_64_avx2(sumM[0], sumM[1], sumM[2], sumM[3]);
2450 : const __m256i sM1 =
2451 1260 : hadd_four_32_to_64_avx2(sumM[4], sumM[5], sumM[6], sumM[6]);
2452 1260 : _mm256_storeu_si256((__m256i *)(M + wiener_win * j + 0), sM0);
2453 1260 : _mm_storeu_si128((__m128i *)(M + wiener_win * j + 4),
2454 1260 : _mm256_extracti128_si256(sM1, 0));
2455 2520 : _mm_storel_epi64((__m128i *)&M[wiener_win * j + 6],
2456 1260 : _mm256_extracti128_si256(sM1, 1));
2457 :
2458 : const __m256i sH0 =
2459 1260 : hadd_four_32_to_64_avx2(sumH[0], sumH[1], sumH[2], sumH[3]);
2460 : const __m256i sH1 =
2461 1260 : hadd_four_32_to_64_avx2(sumH[4], sumH[5], sumH[6], sumH[6]);
2462 1260 : _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 0), sH0);
2463 : // Writing one more H on the top edge falls to the second row, so it
2464 : // won't overflow.
2465 1260 : _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 4), sH1);
2466 1260 : } while (++j < wiener_win);
2467 :
2468 : // Step 2: Calculate the left edge of each square on the top row.
2469 180 : j = 1;
2470 : do {
2471 1080 : const int16_t *dT = d;
2472 1080 : __m256i sumH[WIENER_WIN - 1] = {0};
2473 :
2474 1080 : y = height;
2475 : do {
2476 388800 : x = 0;
2477 : do {
2478 : const __m256i dgd =
2479 5184000 : _mm256_loadu_si256((__m256i *)(dT + j + x));
2480 5184000 : stats_left_win7_avx2(dgd, dT + x, d_stride, sumH);
2481 5184000 : x += 16;
2482 5184000 : } while (x < w16);
2483 :
2484 388800 : if (w16 != width) {
2485 : const __m256i dgd =
2486 0 : _mm256_loadu_si256((__m256i *)(dT + j + x));
2487 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
2488 0 : stats_left_win7_avx2(dgdMask, dT + x, d_stride, sumH);
2489 : }
2490 :
2491 388800 : dT += d_stride;
2492 388800 : } while (--y);
2493 :
2494 : const __m256i sum0123 =
2495 1080 : hadd_four_32_to_64_avx2(sumH[0], sumH[1], sumH[2], sumH[3]);
2496 2160 : _mm_storel_epi64((__m128i *)&H[1 * wiener_win2 + j * wiener_win],
2497 1080 : _mm256_extracti128_si256(sum0123, 0));
2498 1080 : _mm_storeh_epi64((__m128i *)&H[2 * wiener_win2 + j * wiener_win],
2499 1080 : _mm256_extracti128_si256(sum0123, 0));
2500 2160 : _mm_storel_epi64((__m128i *)&H[3 * wiener_win2 + j * wiener_win],
2501 1080 : _mm256_extracti128_si256(sum0123, 1));
2502 1080 : _mm_storeh_epi64((__m128i *)&H[4 * wiener_win2 + j * wiener_win],
2503 1080 : _mm256_extracti128_si256(sum0123, 1));
2504 :
2505 1080 : const __m128i sum45 = hadd_two_32_to_64_avx2(sumH[4], sumH[5]);
2506 1080 : _mm_storel_epi64((__m128i *)&H[5 * wiener_win2 + j * wiener_win],
2507 : sum45);
2508 1080 : _mm_storeh_epi64((__m128i *)&H[6 * wiener_win2 + j * wiener_win],
2509 : sum45);
2510 1080 : } while (++j < wiener_win);
2511 : } else {
2512 0 : const int32_t numBitLeft =
2513 0 : 32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 3 /* SIMD */;
2514 0 : const int32_t hAllowed =
2515 0 : (1 << numBitLeft) / (w16 + ((w16 != width) ? 16 : 0));
2516 :
2517 : // Step 1: Calculate the top edge of the whole matrix, i.e., the top
2518 : // edge of each triangle and square on the top row.
2519 0 : j = 0;
2520 : do {
2521 0 : const int16_t *sT = s;
2522 0 : const int16_t *dT = d;
2523 0 : int32_t heightT = 0;
2524 0 : __m256i sumM[WIENER_WIN] = {0};
2525 0 : __m256i sumH[WIENER_WIN] = {0};
2526 :
2527 : do {
2528 0 : const int32_t hT = ((height - heightT) < hAllowed)
2529 : ? (height - heightT)
2530 : : hAllowed;
2531 0 : __m256i rowM[WIENER_WIN] = {0};
2532 0 : __m256i rowH[WIENER_WIN] = {0};
2533 :
2534 0 : y = hT;
2535 : do {
2536 0 : x = 0;
2537 : do {
2538 : const __m256i src =
2539 0 : _mm256_load_si256((__m256i *)(sT + x));
2540 : const __m256i dgd =
2541 0 : _mm256_load_si256((__m256i *)(dT + x));
2542 0 : stats_top_win7_avx2(
2543 0 : src, dgd, dT + j + x, d_stride, rowM, rowH);
2544 0 : x += 16;
2545 0 : } while (x < w16);
2546 :
2547 0 : if (w16 != width) {
2548 : const __m256i src =
2549 0 : _mm256_load_si256((__m256i *)(sT + w16));
2550 : const __m256i dgd =
2551 0 : _mm256_load_si256((__m256i *)(dT + w16));
2552 0 : const __m256i srcMask = _mm256_and_si256(src, mask);
2553 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
2554 0 : stats_top_win7_avx2(srcMask,
2555 : dgdMask,
2556 0 : dT + j + w16,
2557 : d_stride,
2558 : rowM,
2559 : rowH);
2560 : }
2561 :
2562 0 : sT += s_stride;
2563 0 : dT += d_stride;
2564 0 : } while (--y);
2565 :
2566 0 : add_32_to_64_avx2(rowM[0], &sumM[0]);
2567 0 : add_32_to_64_avx2(rowM[1], &sumM[1]);
2568 0 : add_32_to_64_avx2(rowM[2], &sumM[2]);
2569 0 : add_32_to_64_avx2(rowM[3], &sumM[3]);
2570 0 : add_32_to_64_avx2(rowM[4], &sumM[4]);
2571 0 : add_32_to_64_avx2(rowM[5], &sumM[5]);
2572 0 : add_32_to_64_avx2(rowM[6], &sumM[6]);
2573 0 : add_32_to_64_avx2(rowH[0], &sumH[0]);
2574 0 : add_32_to_64_avx2(rowH[1], &sumH[1]);
2575 0 : add_32_to_64_avx2(rowH[2], &sumH[2]);
2576 0 : add_32_to_64_avx2(rowH[3], &sumH[3]);
2577 0 : add_32_to_64_avx2(rowH[4], &sumH[4]);
2578 0 : add_32_to_64_avx2(rowH[5], &sumH[5]);
2579 0 : add_32_to_64_avx2(rowH[6], &sumH[6]);
2580 :
2581 0 : heightT += hT;
2582 0 : } while (heightT < height);
2583 :
2584 : const __m256i sM0 =
2585 0 : hadd_four_64_avx2(sumM[0], sumM[1], sumM[2], sumM[3]);
2586 : const __m256i sM1 =
2587 0 : hadd_four_64_avx2(sumM[4], sumM[5], sumM[6], sumM[6]);
2588 0 : _mm256_storeu_si256((__m256i *)(M + wiener_win * j + 0), sM0);
2589 0 : _mm_storeu_si128((__m128i *)(M + wiener_win * j + 4),
2590 0 : _mm256_extracti128_si256(sM1, 0));
2591 0 : _mm_storel_epi64((__m128i *)&M[wiener_win * j + 6],
2592 0 : _mm256_extracti128_si256(sM1, 1));
2593 :
2594 : const __m256i sH0 =
2595 0 : hadd_four_64_avx2(sumH[0], sumH[1], sumH[2], sumH[3]);
2596 : const __m256i sH1 =
2597 0 : hadd_four_64_avx2(sumH[4], sumH[5], sumH[6], sumH[6]);
2598 0 : _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 0), sH0);
2599 : // Writing one more H on the top edge falls to the second row, so it
2600 : // won't overflow.
2601 0 : _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 4), sH1);
2602 0 : } while (++j < wiener_win);
2603 :
2604 : // Step 2: Calculate the left edge of each square on the top row.
2605 0 : j = 1;
2606 : do {
2607 0 : const int16_t *dT = d;
2608 0 : int32_t heightT = 0;
2609 0 : __m256i sumH[WIENER_WIN - 1] = {0};
2610 :
2611 : do {
2612 0 : const int32_t hT = ((height - heightT) < hAllowed)
2613 : ? (height - heightT)
2614 : : hAllowed;
2615 0 : __m256i rowH[WIENER_WIN - 1] = {0};
2616 :
2617 0 : y = hT;
2618 : do {
2619 0 : x = 0;
2620 : do {
2621 : const __m256i dgd =
2622 0 : _mm256_loadu_si256((__m256i *)(dT + j + x));
2623 0 : stats_left_win7_avx2(dgd, dT + x, d_stride, rowH);
2624 0 : x += 16;
2625 0 : } while (x < w16);
2626 :
2627 0 : if (w16 != width) {
2628 : const __m256i dgd =
2629 0 : _mm256_loadu_si256((__m256i *)(dT + j + x));
2630 0 : const __m256i dgdMask = _mm256_and_si256(dgd, mask);
2631 0 : stats_left_win7_avx2(dgdMask, dT + x, d_stride, rowH);
2632 : }
2633 :
2634 0 : dT += d_stride;
2635 0 : } while (--y);
2636 :
2637 0 : add_32_to_64_avx2(rowH[0], &sumH[0]);
2638 0 : add_32_to_64_avx2(rowH[1], &sumH[1]);
2639 0 : add_32_to_64_avx2(rowH[2], &sumH[2]);
2640 0 : add_32_to_64_avx2(rowH[3], &sumH[3]);
2641 0 : add_32_to_64_avx2(rowH[4], &sumH[4]);
2642 0 : add_32_to_64_avx2(rowH[5], &sumH[5]);
2643 :
2644 0 : heightT += hT;
2645 0 : } while (heightT < height);
2646 :
2647 : const __m256i sum0123 =
2648 0 : hadd_four_64_avx2(sumH[0], sumH[1], sumH[2], sumH[3]);
2649 0 : _mm_storel_epi64((__m128i *)&H[1 * wiener_win2 + j * wiener_win],
2650 0 : _mm256_extracti128_si256(sum0123, 0));
2651 0 : _mm_storeh_epi64((__m128i *)&H[2 * wiener_win2 + j * wiener_win],
2652 0 : _mm256_extracti128_si256(sum0123, 0));
2653 0 : _mm_storel_epi64((__m128i *)&H[3 * wiener_win2 + j * wiener_win],
2654 0 : _mm256_extracti128_si256(sum0123, 1));
2655 0 : _mm_storeh_epi64((__m128i *)&H[4 * wiener_win2 + j * wiener_win],
2656 0 : _mm256_extracti128_si256(sum0123, 1));
2657 :
2658 0 : const __m128i sum45 = hadd_two_64_avx2(sumH[4], sumH[5]);
2659 0 : _mm_storel_epi64((__m128i *)&H[5 * wiener_win2 + j * wiener_win],
2660 : sum45);
2661 0 : _mm_storeh_epi64((__m128i *)&H[6 * wiener_win2 + j * wiener_win],
2662 : sum45);
2663 0 : } while (++j < wiener_win);
2664 : }
2665 :
2666 : // Step 3: Derive the top edge of each triangle along the diagonal. No
2667 : // triangle in top row.
2668 : {
2669 180 : const int16_t *dT = d;
2670 : // Pad to call transpose function.
2671 180 : __m256i deltas[WIENER_WIN + 1] = {0};
2672 : __m256i ds[WIENER_WIN];
2673 :
2674 : // 00s 00e 01s 01e 02s 02e 03s 03e 04s 04e 05s 05e 06s 06e 07s 07e
2675 : // 10s 10e 11s 11e 12s 12e 13s 13e 14s 14e 15s 15e 16s 16e 17s 17e
2676 : // 20s 20e 21s 21e 22s 22e 23s 23e 24s 24e 25s 25e 26s 26e 27s 27e
2677 : // 30s 30e 31s 31e 32s 32e 33s 33e 34s 34e 35s 35e 36s 36e 37s 37e
2678 : // 40s 40e 41s 41e 42s 42e 43s 43e 44s 44e 45s 45e 46s 46e 47s 47e
2679 : // 50s 50e 51s 51e 52s 52e 53s 53e 54s 54e 55s 55e 56s 56e 57s 57e
2680 180 : ds[0] = load_win7_avx2(dT + 0 * d_stride, width);
2681 180 : ds[1] = load_win7_avx2(dT + 1 * d_stride, width);
2682 180 : ds[2] = load_win7_avx2(dT + 2 * d_stride, width);
2683 180 : ds[3] = load_win7_avx2(dT + 3 * d_stride, width);
2684 180 : ds[4] = load_win7_avx2(dT + 4 * d_stride, width);
2685 180 : ds[5] = load_win7_avx2(dT + 5 * d_stride, width);
2686 180 : dT += 6 * d_stride;
2687 :
2688 180 : if (bit_depth < AOM_BITS_12) {
2689 180 : step3_win7_avx2(&dT, d_stride, width, height, ds, deltas);
2690 :
2691 180 : transpose_32bit_8x8_avx2(deltas, deltas);
2692 :
2693 180 : update_8_stats_avx2(
2694 : H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
2695 : deltas[0],
2696 180 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
2697 180 : update_8_stats_avx2(
2698 180 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
2699 : deltas[1],
2700 180 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
2701 180 : update_8_stats_avx2(
2702 180 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
2703 : deltas[2],
2704 180 : H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
2705 180 : update_8_stats_avx2(
2706 180 : H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
2707 : deltas[3],
2708 180 : H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
2709 180 : update_8_stats_avx2(
2710 180 : H + 4 * wiener_win * wiener_win2 + 4 * wiener_win,
2711 : deltas[4],
2712 180 : H + 5 * wiener_win * wiener_win2 + 5 * wiener_win);
2713 180 : update_8_stats_avx2(
2714 180 : H + 5 * wiener_win * wiener_win2 + 5 * wiener_win,
2715 : deltas[5],
2716 180 : H + 6 * wiener_win * wiener_win2 + 6 * wiener_win);
2717 : } else {
2718 0 : __m128i deltas128[WIENER_WIN] = {0};
2719 0 : int32_t heightT = 0;
2720 :
2721 : do {
2722 0 : __m256i deltasT[WIENER_WIN] = {0};
2723 0 : const int32_t hT =
2724 0 : ((height - heightT) < 128) ? (height - heightT) : 128;
2725 :
2726 0 : step3_win7_avx2(&dT, d_stride, width, hT, ds, deltasT);
2727 :
2728 0 : add_six_32_to_64_avx2(deltasT[0], &deltas[0], &deltas128[0]);
2729 0 : add_six_32_to_64_avx2(deltasT[1], &deltas[1], &deltas128[1]);
2730 0 : add_six_32_to_64_avx2(deltasT[2], &deltas[2], &deltas128[2]);
2731 0 : add_six_32_to_64_avx2(deltasT[3], &deltas[3], &deltas128[3]);
2732 0 : add_six_32_to_64_avx2(deltasT[4], &deltas[4], &deltas128[4]);
2733 0 : add_six_32_to_64_avx2(deltasT[5], &deltas[5], &deltas128[5]);
2734 0 : add_six_32_to_64_avx2(deltasT[6], &deltas[6], &deltas128[6]);
2735 :
2736 0 : heightT += hT;
2737 0 : } while (heightT < height);
2738 :
2739 0 : transpose_64bit_4x8_avx2(deltas, deltas);
2740 0 : update_4_stats_highbd_avx2(
2741 : H + 0 * wiener_win * wiener_win2 + 0 * wiener_win + 0,
2742 : deltas[0],
2743 0 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win + 0);
2744 0 : update_4_stats_highbd_avx2(
2745 0 : H + 0 * wiener_win * wiener_win2 + 0 * wiener_win + 4,
2746 : deltas[1],
2747 0 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win + 4);
2748 0 : update_4_stats_highbd_avx2(
2749 0 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win + 0,
2750 : deltas[2],
2751 0 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win + 0);
2752 0 : update_4_stats_highbd_avx2(
2753 0 : H + 1 * wiener_win * wiener_win2 + 1 * wiener_win + 4,
2754 : deltas[3],
2755 0 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win + 4);
2756 0 : update_4_stats_highbd_avx2(
2757 0 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win + 0,
2758 : deltas[4],
2759 0 : H + 3 * wiener_win * wiener_win2 + 3 * wiener_win + 0);
2760 0 : update_4_stats_highbd_avx2(
2761 0 : H + 2 * wiener_win * wiener_win2 + 2 * wiener_win + 4,
2762 : deltas[5],
2763 0 : H + 3 * wiener_win * wiener_win2 + 3 * wiener_win + 4);
2764 0 : update_4_stats_highbd_avx2(
2765 0 : H + 3 * wiener_win * wiener_win2 + 3 * wiener_win + 0,
2766 : deltas[6],
2767 0 : H + 4 * wiener_win * wiener_win2 + 4 * wiener_win + 0);
2768 0 : update_4_stats_highbd_avx2(
2769 0 : H + 3 * wiener_win * wiener_win2 + 3 * wiener_win + 4,
2770 : deltas[7],
2771 0 : H + 4 * wiener_win * wiener_win2 + 4 * wiener_win + 4);
2772 :
2773 0 : const __m128i d0 = _mm_unpacklo_epi64(deltas128[0], deltas128[1]);
2774 0 : const __m128i d1 = _mm_unpacklo_epi64(deltas128[2], deltas128[3]);
2775 0 : const __m128i d2 = _mm_unpacklo_epi64(deltas128[4], deltas128[5]);
2776 0 : const __m128i d3 = _mm_unpacklo_epi64(deltas128[6], deltas128[6]);
2777 0 : const __m128i d4 = _mm_unpackhi_epi64(deltas128[0], deltas128[1]);
2778 0 : const __m128i d5 = _mm_unpackhi_epi64(deltas128[2], deltas128[3]);
2779 0 : const __m128i d6 = _mm_unpackhi_epi64(deltas128[4], deltas128[5]);
2780 0 : const __m128i d7 = _mm_unpackhi_epi64(deltas128[6], deltas128[6]);
2781 :
2782 0 : deltas[0] =
2783 0 : _mm256_inserti128_si256(_mm256_castsi128_si256(d0), d1, 1);
2784 0 : deltas[1] =
2785 0 : _mm256_inserti128_si256(_mm256_castsi128_si256(d2), d3, 1);
2786 0 : deltas[2] =
2787 0 : _mm256_inserti128_si256(_mm256_castsi128_si256(d4), d5, 1);
2788 0 : deltas[3] =
2789 0 : _mm256_inserti128_si256(_mm256_castsi128_si256(d6), d7, 1);
2790 0 : update_4_stats_highbd_avx2(
2791 0 : H + 4 * wiener_win * wiener_win2 + 4 * wiener_win + 0,
2792 : deltas[0],
2793 0 : H + 5 * wiener_win * wiener_win2 + 5 * wiener_win + 0);
2794 0 : update_4_stats_highbd_avx2(
2795 0 : H + 4 * wiener_win * wiener_win2 + 4 * wiener_win + 4,
2796 : deltas[1],
2797 0 : H + 5 * wiener_win * wiener_win2 + 5 * wiener_win + 4);
2798 0 : update_4_stats_highbd_avx2(
2799 0 : H + 5 * wiener_win * wiener_win2 + 5 * wiener_win + 0,
2800 : deltas[2],
2801 0 : H + 6 * wiener_win * wiener_win2 + 6 * wiener_win + 0);
2802 0 : update_4_stats_highbd_avx2(
2803 0 : H + 5 * wiener_win * wiener_win2 + 5 * wiener_win + 4,
2804 : deltas[3],
2805 0 : H + 6 * wiener_win * wiener_win2 + 6 * wiener_win + 4);
2806 : }
2807 : }
2808 :
2809 : // Step 4: Derive the top and left edge of each square. No square in top and
2810 : // bottom row.
2811 180 : i = 1;
2812 : do {
2813 900 : j = i + 1;
2814 : do {
2815 2700 : const int16_t *dI = d + i - 1;
2816 2700 : const int16_t *dJ = d + j - 1;
2817 2700 : __m256i deltas[2 * WIENER_WIN - 1] = {0};
2818 : __m256i deltasT[8], deltasTT[4];
2819 : __m256i dd[WIENER_WIN], ds[WIENER_WIN];
2820 2700 : dd[0] = _mm256_setzero_si256(); // Initialize to avoid warning.
2821 2700 : ds[0] = _mm256_setzero_si256(); // Initialize to avoid warning.
2822 :
2823 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[0 * d_stride], 0);
2824 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[0 * d_stride + width], 8);
2825 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[1 * d_stride], 1);
2826 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[1 * d_stride + width], 9);
2827 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[2 * d_stride], 2);
2828 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[2 * d_stride + width], 10);
2829 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[3 * d_stride], 3);
2830 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[3 * d_stride + width], 11);
2831 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[4 * d_stride], 4);
2832 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[4 * d_stride + width], 12);
2833 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[5 * d_stride], 5);
2834 2700 : dd[0] = _mm256_insert_epi16(dd[0], dI[5 * d_stride + width], 13);
2835 :
2836 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[0 * d_stride], 0);
2837 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[0 * d_stride + width], 8);
2838 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[1 * d_stride], 1);
2839 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[1 * d_stride + width], 9);
2840 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[2 * d_stride], 2);
2841 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[2 * d_stride + width], 10);
2842 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[3 * d_stride], 3);
2843 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[3 * d_stride + width], 11);
2844 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[4 * d_stride], 4);
2845 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[4 * d_stride + width], 12);
2846 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[5 * d_stride], 5);
2847 2700 : ds[0] = _mm256_insert_epi16(ds[0], dJ[5 * d_stride + width], 13);
2848 :
2849 2700 : y = 0;
2850 : do {
2851 : // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e
2852 : // 70e
2853 121500 : dd[0] = _mm256_insert_epi16(dd[0], dI[6 * d_stride], 6);
2854 121500 : dd[0] =
2855 121500 : _mm256_insert_epi16(dd[0], dI[6 * d_stride + width], 14);
2856 121500 : dd[0] = _mm256_insert_epi16(dd[0], dI[7 * d_stride], 7);
2857 121500 : dd[0] =
2858 121500 : _mm256_insert_epi16(dd[0], dI[7 * d_stride + width], 15);
2859 :
2860 : // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e
2861 : // 70e 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e
2862 : // 61e 71e
2863 121500 : ds[0] = _mm256_insert_epi16(ds[0], dJ[6 * d_stride], 6);
2864 121500 : ds[0] =
2865 121500 : _mm256_insert_epi16(ds[0], dJ[6 * d_stride + width], 14);
2866 121500 : ds[0] = _mm256_insert_epi16(ds[0], dJ[7 * d_stride], 7);
2867 121500 : ds[0] =
2868 121500 : _mm256_insert_epi16(ds[0], dJ[7 * d_stride + width], 15);
2869 :
2870 121500 : load_more_16_avx2(dI + 8 * d_stride, width, dd[0], &dd[1]);
2871 121500 : load_more_16_avx2(dJ + 8 * d_stride, width, ds[0], &ds[1]);
2872 121500 : load_more_16_avx2(dI + 9 * d_stride, width, dd[1], &dd[2]);
2873 121500 : load_more_16_avx2(dJ + 9 * d_stride, width, ds[1], &ds[2]);
2874 121500 : load_more_16_avx2(dI + 10 * d_stride, width, dd[2], &dd[3]);
2875 121500 : load_more_16_avx2(dJ + 10 * d_stride, width, ds[2], &ds[3]);
2876 121500 : load_more_16_avx2(dI + 11 * d_stride, width, dd[3], &dd[4]);
2877 121500 : load_more_16_avx2(dJ + 11 * d_stride, width, ds[3], &ds[4]);
2878 121500 : load_more_16_avx2(dI + 12 * d_stride, width, dd[4], &dd[5]);
2879 121500 : load_more_16_avx2(dJ + 12 * d_stride, width, ds[4], &ds[5]);
2880 121500 : load_more_16_avx2(dI + 13 * d_stride, width, dd[5], &dd[6]);
2881 121500 : load_more_16_avx2(dJ + 13 * d_stride, width, ds[5], &ds[6]);
2882 :
2883 121500 : madd_avx2(dd[0], ds[0], &deltas[0]);
2884 121500 : madd_avx2(dd[0], ds[1], &deltas[1]);
2885 121500 : madd_avx2(dd[0], ds[2], &deltas[2]);
2886 121500 : madd_avx2(dd[0], ds[3], &deltas[3]);
2887 121500 : madd_avx2(dd[0], ds[4], &deltas[4]);
2888 121500 : madd_avx2(dd[0], ds[5], &deltas[5]);
2889 121500 : madd_avx2(dd[0], ds[6], &deltas[6]);
2890 121500 : madd_avx2(dd[1], ds[0], &deltas[7]);
2891 121500 : madd_avx2(dd[2], ds[0], &deltas[8]);
2892 121500 : madd_avx2(dd[3], ds[0], &deltas[9]);
2893 121500 : madd_avx2(dd[4], ds[0], &deltas[10]);
2894 121500 : madd_avx2(dd[5], ds[0], &deltas[11]);
2895 121500 : madd_avx2(dd[6], ds[0], &deltas[12]);
2896 :
2897 121500 : dd[0] = _mm256_srli_si256(dd[6], 4);
2898 121500 : ds[0] = _mm256_srli_si256(ds[6], 4);
2899 121500 : dI += 8 * d_stride;
2900 121500 : dJ += 8 * d_stride;
2901 121500 : y += 8;
2902 121500 : } while (y < h8);
2903 :
2904 2700 : if (bit_depth < AOM_BITS_12) {
2905 2700 : deltas[0] = _mm256_hadd_epi32(deltas[0], deltas[1]);
2906 2700 : deltas[2] = _mm256_hadd_epi32(deltas[2], deltas[3]);
2907 2700 : deltas[4] = _mm256_hadd_epi32(deltas[4], deltas[5]);
2908 2700 : deltas[6] = _mm256_hadd_epi32(deltas[6], deltas[6]);
2909 2700 : deltas[7] = _mm256_hadd_epi32(deltas[7], deltas[8]);
2910 2700 : deltas[9] = _mm256_hadd_epi32(deltas[9], deltas[10]);
2911 2700 : deltas[11] = _mm256_hadd_epi32(deltas[11], deltas[12]);
2912 2700 : deltas[0] = _mm256_hadd_epi32(deltas[0], deltas[2]);
2913 2700 : deltas[4] = _mm256_hadd_epi32(deltas[4], deltas[6]);
2914 2700 : deltas[7] = _mm256_hadd_epi32(deltas[7], deltas[9]);
2915 2700 : deltas[11] = _mm256_hadd_epi32(deltas[11], deltas[11]);
2916 2700 : const __m128i delta0 = sub_hi_lo_32_avx2(deltas[0]);
2917 2700 : const __m128i delta1 = sub_hi_lo_32_avx2(deltas[4]);
2918 2700 : const __m128i delta2 = sub_hi_lo_32_avx2(deltas[7]);
2919 2700 : const __m128i delta3 = sub_hi_lo_32_avx2(deltas[11]);
2920 2700 : deltas[0] = _mm256_inserti128_si256(
2921 : _mm256_castsi128_si256(delta0), delta1, 1);
2922 2700 : deltas[1] = _mm256_inserti128_si256(
2923 : _mm256_castsi128_si256(delta2), delta3, 1);
2924 : } else {
2925 0 : deltas[0] = hsub_32x8_to_64x4_avx2(deltas[0]);
2926 0 : deltas[1] = hsub_32x8_to_64x4_avx2(deltas[1]);
2927 0 : deltas[2] = hsub_32x8_to_64x4_avx2(deltas[2]);
2928 0 : deltas[3] = hsub_32x8_to_64x4_avx2(deltas[3]);
2929 0 : deltas[4] = hsub_32x8_to_64x4_avx2(deltas[4]);
2930 0 : deltas[5] = hsub_32x8_to_64x4_avx2(deltas[5]);
2931 0 : deltas[6] = hsub_32x8_to_64x4_avx2(deltas[6]);
2932 0 : deltas[7] = hsub_32x8_to_64x4_avx2(deltas[7]);
2933 0 : deltas[8] = hsub_32x8_to_64x4_avx2(deltas[8]);
2934 0 : deltas[9] = hsub_32x8_to_64x4_avx2(deltas[9]);
2935 0 : deltas[10] = hsub_32x8_to_64x4_avx2(deltas[10]);
2936 0 : deltas[11] = hsub_32x8_to_64x4_avx2(deltas[11]);
2937 0 : deltas[12] = hsub_32x8_to_64x4_avx2(deltas[12]);
2938 :
2939 0 : transpose_64bit_4x8_avx2(deltas + 0, deltasT);
2940 0 : deltasT[0] = _mm256_add_epi64(deltasT[0], deltasT[2]);
2941 0 : deltasT[4] = _mm256_add_epi64(deltasT[4], deltasT[6]);
2942 0 : deltasT[1] = _mm256_add_epi64(deltasT[1], deltasT[3]);
2943 0 : deltasT[5] = _mm256_add_epi64(deltasT[5], deltasT[7]);
2944 0 : deltasTT[0] = _mm256_add_epi64(deltasT[0], deltasT[4]);
2945 0 : deltasTT[1] = _mm256_add_epi64(deltasT[1], deltasT[5]);
2946 :
2947 0 : transpose_64bit_4x6_avx2(deltas + 7, deltasT);
2948 0 : deltasT[0] = _mm256_add_epi64(deltasT[0], deltasT[2]);
2949 0 : deltasT[4] = _mm256_add_epi64(deltasT[4], deltasT[6]);
2950 0 : deltasT[1] = _mm256_add_epi64(deltasT[1], deltasT[3]);
2951 0 : deltasT[5] = _mm256_add_epi64(deltasT[5], deltasT[7]);
2952 0 : deltasTT[2] = _mm256_add_epi64(deltasT[0], deltasT[4]);
2953 0 : deltasTT[3] = _mm256_add_epi64(deltasT[1], deltasT[5]);
2954 :
2955 0 : deltas[0] = _mm256_setzero_si256();
2956 0 : deltas[1] = _mm256_setzero_si256();
2957 : }
2958 :
2959 2700 : if (h8 != height) {
2960 0 : const __m256i perm = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
2961 :
2962 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[0 * d_stride], 0);
2963 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[0 * d_stride + width], 1);
2964 :
2965 0 : dd[2] = _mm256_insert_epi16(dd[2], -dI[1 * d_stride], 0);
2966 0 : dd[2] = _mm256_insert_epi16(dd[2], dI[1 * d_stride + width], 1);
2967 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[1 * d_stride], 2);
2968 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[1 * d_stride + width], 3);
2969 :
2970 0 : dd[2] = _mm256_insert_epi16(dd[2], -dI[2 * d_stride], 2);
2971 0 : dd[2] = _mm256_insert_epi16(dd[2], dI[2 * d_stride + width], 3);
2972 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[2 * d_stride], 4);
2973 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[2 * d_stride + width], 5);
2974 :
2975 0 : dd[2] = _mm256_insert_epi16(dd[2], -dI[3 * d_stride], 4);
2976 0 : dd[2] = _mm256_insert_epi16(dd[2], dI[3 * d_stride + width], 5);
2977 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[3 * d_stride], 6);
2978 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[3 * d_stride + width], 7);
2979 :
2980 0 : dd[2] = _mm256_insert_epi16(dd[2], -dI[4 * d_stride], 6);
2981 0 : dd[2] = _mm256_insert_epi16(dd[2], dI[4 * d_stride + width], 7);
2982 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[4 * d_stride], 8);
2983 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[4 * d_stride + width], 9);
2984 :
2985 0 : dd[2] = _mm256_insert_epi16(dd[2], -dI[5 * d_stride], 8);
2986 0 : dd[2] = _mm256_insert_epi16(dd[2], dI[5 * d_stride + width], 9);
2987 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[5 * d_stride], 10);
2988 0 : ds[0] =
2989 0 : _mm256_insert_epi16(ds[0], dJ[5 * d_stride + width], 11);
2990 :
2991 : do {
2992 0 : dd[0] = _mm256_set1_epi16(-dI[0 * d_stride]);
2993 0 : dd[1] = _mm256_set1_epi16(dI[0 * d_stride + width]);
2994 0 : dd[0] = _mm256_unpacklo_epi16(dd[0], dd[1]);
2995 :
2996 0 : ds[2] = _mm256_set1_epi16(dJ[0 * d_stride]);
2997 0 : ds[3] = _mm256_set1_epi16(dJ[0 * d_stride + width]);
2998 0 : ds[2] = _mm256_unpacklo_epi16(ds[2], ds[3]);
2999 :
3000 0 : dd[2] = _mm256_insert_epi16(dd[2], -dI[6 * d_stride], 10);
3001 0 : dd[2] = _mm256_insert_epi16(
3002 : dd[2], dI[6 * d_stride + width], 11);
3003 0 : ds[0] = _mm256_insert_epi16(ds[0], dJ[6 * d_stride], 12);
3004 0 : ds[0] = _mm256_insert_epi16(
3005 : ds[0], dJ[6 * d_stride + width], 13);
3006 :
3007 0 : madd_avx2(dd[0], ds[0], &deltas[0]);
3008 0 : madd_avx2(dd[2], ds[2], &deltas[1]);
3009 :
3010 : // right shift 4 bytes
3011 0 : dd[2] = _mm256_permutevar8x32_epi32(dd[2], perm);
3012 0 : ds[0] = _mm256_permutevar8x32_epi32(ds[0], perm);
3013 0 : dI += d_stride;
3014 0 : dJ += d_stride;
3015 0 : } while (++y < height);
3016 : }
3017 :
3018 : // Writing one more H on the top edge of a square falls to the next
3019 : // square in the same row or the first H in the next row, which
3020 : // would be calculated later, so it won't overflow.
3021 2700 : if (bit_depth < AOM_BITS_12) {
3022 2700 : update_8_stats_avx2(
3023 2700 : H + (i - 1) * wiener_win * wiener_win2 +
3024 2700 : (j - 1) * wiener_win,
3025 : deltas[0],
3026 2700 : H + i * wiener_win * wiener_win2 + j * wiener_win);
3027 :
3028 2700 : H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
3029 2700 : H[((i - 1) * wiener_win + 1) * wiener_win2 +
3030 2700 : (j - 1) * wiener_win] +
3031 2700 : _mm256_extract_epi32(deltas[1], 0);
3032 2700 : H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
3033 2700 : H[((i - 1) * wiener_win + 2) * wiener_win2 +
3034 2700 : (j - 1) * wiener_win] +
3035 2700 : _mm256_extract_epi32(deltas[1], 1);
3036 2700 : H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
3037 2700 : H[((i - 1) * wiener_win + 3) * wiener_win2 +
3038 2700 : (j - 1) * wiener_win] +
3039 2700 : _mm256_extract_epi32(deltas[1], 2);
3040 2700 : H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
3041 2700 : H[((i - 1) * wiener_win + 4) * wiener_win2 +
3042 2700 : (j - 1) * wiener_win] +
3043 2700 : _mm256_extract_epi32(deltas[1], 3);
3044 2700 : H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] =
3045 2700 : H[((i - 1) * wiener_win + 5) * wiener_win2 +
3046 2700 : (j - 1) * wiener_win] +
3047 2700 : _mm256_extract_epi32(deltas[1], 4);
3048 2700 : H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] =
3049 2700 : H[((i - 1) * wiener_win + 6) * wiener_win2 +
3050 2700 : (j - 1) * wiener_win] +
3051 2700 : _mm256_extract_epi32(deltas[1], 5);
3052 : } else {
3053 0 : const __m256i d0 = _mm256_cvtepi32_epi64(
3054 0 : _mm256_extracti128_si256(deltas[0], 0));
3055 0 : const __m256i d1 = _mm256_cvtepi32_epi64(
3056 0 : _mm256_extracti128_si256(deltas[0], 1));
3057 0 : const __m256i d2 = _mm256_cvtepi32_epi64(
3058 0 : _mm256_extracti128_si256(deltas[1], 0));
3059 0 : const __m256i d3 = _mm256_cvtepi32_epi64(
3060 0 : _mm256_extracti128_si256(deltas[1], 1));
3061 :
3062 0 : deltas[0] = _mm256_add_epi64(deltasTT[0], d0);
3063 0 : deltas[1] = _mm256_add_epi64(deltasTT[1], d1);
3064 0 : deltas[2] = _mm256_add_epi64(deltasTT[2], d2);
3065 0 : deltas[3] = _mm256_add_epi64(deltasTT[3], d3);
3066 :
3067 0 : update_4_stats_highbd_avx2(
3068 0 : H + (i - 1) * wiener_win * wiener_win2 +
3069 0 : (j - 1) * wiener_win + 0,
3070 : deltas[0],
3071 0 : H + i * wiener_win * wiener_win2 + j * wiener_win + 0);
3072 0 : update_4_stats_highbd_avx2(
3073 0 : H + (i - 1) * wiener_win * wiener_win2 +
3074 0 : (j - 1) * wiener_win + 4,
3075 : deltas[1],
3076 0 : H + i * wiener_win * wiener_win2 + j * wiener_win + 4);
3077 :
3078 0 : H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
3079 0 : H[((i - 1) * wiener_win + 1) * wiener_win2 +
3080 0 : (j - 1) * wiener_win] +
3081 0 : _mm256_extract_epi64(deltas[2], 0);
3082 0 : H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
3083 0 : H[((i - 1) * wiener_win + 2) * wiener_win2 +
3084 0 : (j - 1) * wiener_win] +
3085 0 : _mm256_extract_epi64(deltas[2], 1);
3086 0 : H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
3087 0 : H[((i - 1) * wiener_win + 3) * wiener_win2 +
3088 0 : (j - 1) * wiener_win] +
3089 0 : _mm256_extract_epi64(deltas[2], 2);
3090 0 : H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
3091 0 : H[((i - 1) * wiener_win + 4) * wiener_win2 +
3092 0 : (j - 1) * wiener_win] +
3093 0 : _mm256_extract_epi64(deltas[2], 3);
3094 0 : H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] =
3095 0 : H[((i - 1) * wiener_win + 5) * wiener_win2 +
3096 0 : (j - 1) * wiener_win] +
3097 0 : _mm256_extract_epi64(deltas[3], 0);
3098 0 : H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] =
3099 0 : H[((i - 1) * wiener_win + 6) * wiener_win2 +
3100 0 : (j - 1) * wiener_win] +
3101 0 : _mm256_extract_epi64(deltas[3], 1);
3102 : }
3103 2700 : } while (++j < wiener_win);
3104 900 : } while (++i < wiener_win - 1);
3105 :
3106 : // Step 5: Derive other points of each square. No square in bottom row.
3107 180 : i = 0;
3108 : do {
3109 1080 : const int16_t *const dI = d + i;
3110 :
3111 1080 : j = i + 1;
3112 : do {
3113 3780 : const int16_t *const dJ = d + j;
3114 3780 : __m256i deltas[WIENER_WIN - 1][WIENER_WIN - 1] = {{{0}},{{0}}};
3115 : __m256i dIs[WIENER_WIN - 1], dIe[WIENER_WIN - 1];
3116 : __m256i dJs[WIENER_WIN - 1], dJe[WIENER_WIN - 1];
3117 :
3118 3780 : x = 0;
3119 : do {
3120 50400 : load_square_win7_avx2(
3121 50400 : dI + x, dJ + x, d_stride, height, dIs, dIe, dJs, dJe);
3122 50400 : derive_square_win7_avx2(dIs, dIe, dJs, dJe, deltas);
3123 :
3124 50400 : x += 16;
3125 50400 : } while (x < w16);
3126 :
3127 3780 : if (w16 != width) {
3128 0 : load_square_win7_avx2(
3129 0 : dI + x, dJ + x, d_stride, height, dIs, dIe, dJs, dJe);
3130 :
3131 0 : dIs[0] = _mm256_and_si256(dIs[0], mask);
3132 0 : dIs[1] = _mm256_and_si256(dIs[1], mask);
3133 0 : dIs[2] = _mm256_and_si256(dIs[2], mask);
3134 0 : dIs[3] = _mm256_and_si256(dIs[3], mask);
3135 0 : dIs[4] = _mm256_and_si256(dIs[4], mask);
3136 0 : dIs[5] = _mm256_and_si256(dIs[5], mask);
3137 0 : dIe[0] = _mm256_and_si256(dIe[0], mask);
3138 0 : dIe[1] = _mm256_and_si256(dIe[1], mask);
3139 0 : dIe[2] = _mm256_and_si256(dIe[2], mask);
3140 0 : dIe[3] = _mm256_and_si256(dIe[3], mask);
3141 0 : dIe[4] = _mm256_and_si256(dIe[4], mask);
3142 0 : dIe[5] = _mm256_and_si256(dIe[5], mask);
3143 :
3144 0 : derive_square_win7_avx2(dIs, dIe, dJs, dJe, deltas);
3145 : }
3146 :
3147 3780 : if (bit_depth < AOM_BITS_12) {
3148 3780 : hadd_update_6_stats_avx2(
3149 3780 : H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win,
3150 : deltas[0],
3151 3780 : H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win +
3152 : 1);
3153 3780 : hadd_update_6_stats_avx2(
3154 3780 : H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win,
3155 : deltas[1],
3156 3780 : H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win +
3157 : 1);
3158 3780 : hadd_update_6_stats_avx2(
3159 3780 : H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win,
3160 : deltas[2],
3161 3780 : H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win +
3162 : 1);
3163 3780 : hadd_update_6_stats_avx2(
3164 3780 : H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win,
3165 : deltas[3],
3166 3780 : H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win +
3167 : 1);
3168 3780 : hadd_update_6_stats_avx2(
3169 3780 : H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win,
3170 : deltas[4],
3171 3780 : H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win +
3172 : 1);
3173 3780 : hadd_update_6_stats_avx2(
3174 3780 : H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win,
3175 : deltas[5],
3176 3780 : H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win +
3177 : 1);
3178 : } else {
3179 0 : hadd_update_6_stats_highbd_avx2(
3180 0 : H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win,
3181 : deltas[0],
3182 0 : H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win +
3183 : 1);
3184 0 : hadd_update_6_stats_highbd_avx2(
3185 0 : H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win,
3186 : deltas[1],
3187 0 : H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win +
3188 : 1);
3189 0 : hadd_update_6_stats_highbd_avx2(
3190 0 : H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win,
3191 : deltas[2],
3192 0 : H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win +
3193 : 1);
3194 0 : hadd_update_6_stats_highbd_avx2(
3195 0 : H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win,
3196 : deltas[3],
3197 0 : H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win +
3198 : 1);
3199 0 : hadd_update_6_stats_highbd_avx2(
3200 0 : H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win,
3201 : deltas[4],
3202 0 : H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win +
3203 : 1);
3204 0 : hadd_update_6_stats_highbd_avx2(
3205 0 : H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win,
3206 : deltas[5],
3207 0 : H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win +
3208 : 1);
3209 : }
3210 3780 : } while (++j < wiener_win);
3211 1080 : } while (++i < wiener_win - 1);
3212 :
3213 : // Step 6: Derive other points of each upper triangle along the diagonal.
3214 180 : i = 0;
3215 : do {
3216 1260 : const int16_t *const dI = d + i;
3217 1260 : __m256i deltas[WIENER_WIN * (WIENER_WIN - 1) / 2] = {0};
3218 : __m256i dIs[WIENER_WIN - 1], dIe[WIENER_WIN - 1];
3219 :
3220 1260 : x = 0;
3221 : do {
3222 16800 : load_triangle_win7_avx2(dI + x, d_stride, height, dIs, dIe);
3223 16800 : derive_triangle_win7_avx2(dIs, dIe, deltas);
3224 :
3225 16800 : x += 16;
3226 16800 : } while (x < w16);
3227 :
3228 1260 : if (w16 != width) {
3229 0 : load_triangle_win7_avx2(dI + x, d_stride, height, dIs, dIe);
3230 :
3231 0 : dIs[0] = _mm256_and_si256(dIs[0], mask);
3232 0 : dIs[1] = _mm256_and_si256(dIs[1], mask);
3233 0 : dIs[2] = _mm256_and_si256(dIs[2], mask);
3234 0 : dIs[3] = _mm256_and_si256(dIs[3], mask);
3235 0 : dIs[4] = _mm256_and_si256(dIs[4], mask);
3236 0 : dIs[5] = _mm256_and_si256(dIs[5], mask);
3237 0 : dIe[0] = _mm256_and_si256(dIe[0], mask);
3238 0 : dIe[1] = _mm256_and_si256(dIe[1], mask);
3239 0 : dIe[2] = _mm256_and_si256(dIe[2], mask);
3240 0 : dIe[3] = _mm256_and_si256(dIe[3], mask);
3241 0 : dIe[4] = _mm256_and_si256(dIe[4], mask);
3242 0 : dIe[5] = _mm256_and_si256(dIe[5], mask);
3243 :
3244 0 : derive_triangle_win7_avx2(dIs, dIe, deltas);
3245 : }
3246 :
3247 1260 : if (bit_depth < AOM_BITS_12) {
3248 : // Row 1: 6 points
3249 1260 : hadd_update_6_stats_avx2(
3250 1260 : H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win,
3251 : deltas,
3252 1260 : H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
3253 :
3254 1260 : const __m128i delta0 = hadd_four_32_avx2(
3255 : deltas[15], deltas[16], deltas[17], deltas[10]);
3256 1260 : const __m128i delta1 = hadd_four_32_avx2(
3257 : deltas[18], deltas[19], deltas[20], deltas[20]);
3258 1260 : const __m128i delta2 = _mm_cvtepi32_epi64(delta0);
3259 1260 : const __m128i delta3 = _mm_cvtepi32_epi64(delta1);
3260 :
3261 : // Row 2: 5 points
3262 1260 : hadd_update_4_stats_avx2(
3263 1260 : H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1,
3264 : deltas + 6,
3265 1260 : H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
3266 1260 : H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] =
3267 1260 : H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] +
3268 1260 : _mm_extract_epi32(delta0, 3);
3269 :
3270 : // Row 3: 4 points
3271 1260 : hadd_update_4_stats_avx2(
3272 1260 : H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
3273 : deltas + 11,
3274 1260 : H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
3275 :
3276 : // Row 4: 3 points
3277 1260 : update_2_stats_sse2(
3278 1260 : H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3,
3279 : delta2,
3280 1260 : H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4);
3281 1260 : H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] =
3282 1260 : H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] +
3283 1260 : _mm_extract_epi32(delta0, 2);
3284 :
3285 : // Row 5: 2 points
3286 1260 : update_2_stats_sse2(
3287 1260 : H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4,
3288 : delta3,
3289 1260 : H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5);
3290 :
3291 : // Row 6: 1 points
3292 1260 : H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] =
3293 1260 : H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] +
3294 1260 : _mm_extract_epi32(delta1, 2);
3295 : } else {
3296 : // Row 1: 6 points
3297 0 : hadd_update_6_stats_highbd_avx2(
3298 0 : H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win,
3299 : deltas,
3300 0 : H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
3301 :
3302 0 : const __m256i delta0 = hadd_four_31_to_64_avx2(
3303 : deltas[15], deltas[16], deltas[17], deltas[10]);
3304 0 : const __m256i delta1 = hadd_four_31_to_64_avx2(
3305 : deltas[18], deltas[19], deltas[20], deltas[20]);
3306 :
3307 : // Row 2: 5 points
3308 0 : hadd_update_4_stats_highbd_avx2(
3309 0 : H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1,
3310 : deltas + 6,
3311 0 : H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
3312 0 : H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] =
3313 0 : H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] +
3314 0 : _mm256_extract_epi64(delta0, 3);
3315 :
3316 : // Row 3: 4 points
3317 0 : hadd_update_4_stats_highbd_avx2(
3318 0 : H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
3319 : deltas + 11,
3320 0 : H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
3321 :
3322 : // Row 4: 3 points
3323 0 : update_2_stats_sse2(
3324 0 : H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3,
3325 0 : _mm256_extracti128_si256(delta0, 0),
3326 0 : H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4);
3327 0 : H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] =
3328 0 : H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] +
3329 0 : _mm256_extract_epi64(delta0, 2);
3330 :
3331 : // Row 5: 2 points
3332 0 : update_2_stats_sse2(
3333 0 : H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4,
3334 0 : _mm256_extracti128_si256(delta1, 0),
3335 0 : H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5);
3336 :
3337 : // Row 6: 1 points
3338 0 : H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] =
3339 0 : H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] +
3340 0 : _mm256_extract_epi64(delta1, 2);
3341 : }
3342 1260 : } while (++i < wiener_win);
3343 180 : }
3344 :
3345 300 : void eb_av1_compute_stats_avx2(int32_t wiener_win, const uint8_t *dgd,
3346 : const uint8_t *src, int32_t h_start, int32_t h_end, int32_t v_start,
3347 : int32_t v_end, int32_t dgd_stride, int32_t src_stride, int64_t *M,
3348 : int64_t *H) {
3349 300 : const int32_t wiener_win2 = wiener_win * wiener_win;
3350 300 : const int32_t wiener_halfwin = wiener_win >> 1;
3351 : const uint8_t avg =
3352 300 : find_average_avx2(dgd, h_start, h_end, v_start, v_end, dgd_stride);
3353 300 : const int32_t width = h_end - h_start;
3354 300 : const int32_t height = v_end - v_start;
3355 300 : const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15;
3356 300 : const int32_t s_stride = (width + 15) & ~15;
3357 : int16_t *d, *s;
3358 :
3359 : // The maximum input size is width * height, which is
3360 : // (9 / 4) * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX. Enlarge to
3361 : // 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX considering
3362 : // paddings.
3363 300 : d = eb_aom_memalign(32,
3364 : sizeof(*d) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX);
3365 300 : s = d + 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
3366 :
3367 : assert(!(height % 2));
3368 :
3369 300 : sub_avg_block_avx2(src + v_start * src_stride + h_start,
3370 : src_stride,
3371 : avg,
3372 : width,
3373 : height,
3374 : s,
3375 : s_stride);
3376 300 : sub_avg_block_avx2(dgd + (v_start - wiener_halfwin) * dgd_stride + h_start -
3377 : wiener_halfwin,
3378 : dgd_stride,
3379 : avg,
3380 300 : width + 2 * wiener_halfwin,
3381 300 : height + 2 * wiener_halfwin,
3382 : d,
3383 : d_stride);
3384 :
3385 300 : if (wiener_win == WIENER_WIN) {
3386 180 : compute_stats_win7_avx2(
3387 : d, d_stride, s, s_stride, width, height, M, H, 8);
3388 120 : } else if (wiener_win == WIENER_WIN_CHROMA) {
3389 120 : compute_stats_win5_avx2(
3390 : d, d_stride, s, s_stride, width, height, M, H, 8);
3391 : } else {
3392 : assert(wiener_win == WIENER_WIN_3TAP);
3393 0 : compute_stats_win3_avx2(
3394 : d, d_stride, s, s_stride, width, height, M, H, 8);
3395 : }
3396 :
3397 : // H is a symmetric matrix, so we only need to fill out the upper triangle.
3398 : // We can copy it down to the lower triangle outside the (i, j) loops.
3399 300 : diagonal_copy_stats_avx2(wiener_win2, H);
3400 :
3401 300 : eb_aom_free(d);
3402 300 : }
3403 :
3404 0 : void eb_av1_compute_stats_highbd_avx2(int32_t wiener_win, const uint8_t *dgd8,
3405 : const uint8_t *src8, int32_t h_start,
3406 : int32_t h_end, int32_t v_start,
3407 : int32_t v_end, int32_t dgd_stride,
3408 : int32_t src_stride, int64_t *M, int64_t *H,
3409 : AomBitDepth bit_depth) {
3410 0 : const int32_t wiener_win2 = wiener_win * wiener_win;
3411 0 : const int32_t wiener_halfwin = (wiener_win >> 1);
3412 0 : const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
3413 0 : const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
3414 0 : const uint16_t avg = find_average_highbd_avx2(
3415 : dgd, h_start, h_end, v_start, v_end, dgd_stride, bit_depth);
3416 0 : const int32_t width = h_end - h_start;
3417 0 : const int32_t height = v_end - v_start;
3418 0 : const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15;
3419 0 : const int32_t s_stride = (width + 15) & ~15;
3420 : int32_t k;
3421 : int16_t *d, *s;
3422 :
3423 : assert(!(height % 2));
3424 :
3425 : // The maximum input size is width * height, which is
3426 : // (9 / 4) * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX. Enlarge to
3427 : // 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX considering
3428 : // paddings.
3429 0 : d = eb_aom_memalign(32,
3430 : sizeof(*d) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX);
3431 0 : s = d + 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
3432 :
3433 0 : sub_avg_block_highbd_avx2(src + v_start * src_stride + h_start,
3434 : src_stride,
3435 : avg,
3436 : width,
3437 : height,
3438 : s,
3439 : s_stride);
3440 0 : sub_avg_block_highbd_avx2(dgd + (v_start - wiener_halfwin) * dgd_stride +
3441 0 : h_start - wiener_halfwin,
3442 : dgd_stride,
3443 : avg,
3444 0 : width + 2 * wiener_halfwin,
3445 0 : height + 2 * wiener_halfwin,
3446 : d,
3447 : d_stride);
3448 :
3449 0 : if (wiener_win == WIENER_WIN) {
3450 0 : compute_stats_win7_avx2(
3451 : d, d_stride, s, s_stride, width, height, M, H, bit_depth);
3452 0 : } else if (wiener_win == WIENER_WIN_CHROMA) {
3453 0 : compute_stats_win5_avx2(
3454 : d, d_stride, s, s_stride, width, height, M, H, bit_depth);
3455 : } else {
3456 : assert(wiener_win == WIENER_WIN_3TAP);
3457 0 : compute_stats_win3_avx2(
3458 : d, d_stride, s, s_stride, width, height, M, H, bit_depth);
3459 : }
3460 :
3461 : // H is a symmetric matrix, so we only need to fill out the upper triangle.
3462 : // We can copy it down to the lower triangle outside the (i, j) loops.
3463 0 : if (bit_depth == AOM_BITS_8) {
3464 0 : diagonal_copy_stats_avx2(wiener_win2, H);
3465 0 : } else if (bit_depth == AOM_BITS_10) {
3466 0 : const int32_t k4 = wiener_win2 & ~3;
3467 :
3468 0 : k = 0;
3469 : do {
3470 0 : const __m256i src = _mm256_loadu_si256((__m256i *)(M + k));
3471 0 : const __m256i dst = div4_avx2(src);
3472 0 : _mm256_storeu_si256((__m256i *)(M + k), dst);
3473 0 : H[k * wiener_win2 + k] /= 4;
3474 0 : k += 4;
3475 0 : } while (k < k4);
3476 :
3477 0 : H[k * wiener_win2 + k] /= 4;
3478 :
3479 0 : for (; k < wiener_win2; ++k) {
3480 0 : M[k] /= 4;
3481 : }
3482 :
3483 0 : div4_diagonal_copy_stats_avx2(wiener_win2, H);
3484 : } else {
3485 0 : const int32_t k4 = wiener_win2 & ~3;
3486 :
3487 0 : k = 0;
3488 : do {
3489 0 : const __m256i src = _mm256_loadu_si256((__m256i *)(M + k));
3490 0 : const __m256i dst = div16_avx2(src);
3491 0 : _mm256_storeu_si256((__m256i *)(M + k), dst);
3492 0 : H[k * wiener_win2 + k] /= 16;
3493 0 : k += 4;
3494 0 : } while (k < k4);
3495 :
3496 0 : H[k * wiener_win2 + k] /= 16;
3497 :
3498 0 : for (; k < wiener_win2; ++k) {
3499 0 : M[k] /= 16;
3500 : }
3501 :
3502 0 : div16_diagonal_copy_stats_avx2(wiener_win2, H);
3503 : }
3504 :
3505 0 : eb_aom_free(d);
3506 0 : }
3507 :
3508 36589 : static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) {
3509 36589 : return _mm256_set1_epi32(
3510 36589 : (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
3511 : }
3512 :
3513 36589 : int64_t eb_av1_lowbd_pixel_proj_error_avx2(const uint8_t *src8, int32_t width,
3514 : int32_t height, int32_t src_stride,
3515 : const uint8_t *dat8,
3516 : int32_t dat_stride, int32_t *flt0,
3517 : int32_t flt0_stride, int32_t *flt1,
3518 : int32_t flt1_stride, int32_t xq[2],
3519 : const SgrParamsType *params) {
3520 36589 : const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
3521 36589 : const uint8_t *src = src8;
3522 36589 : const uint8_t *dat = dat8;
3523 36589 : int64_t err = 0;
3524 36589 : int32_t y = height;
3525 : int32_t j;
3526 73178 : const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
3527 36589 : __m256i sum64 = _mm256_setzero_si256();
3528 :
3529 62845 : if (params->r[0] > 0 && params->r[1] > 0) {
3530 26256 : const __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
3531 :
3532 : do {
3533 7727110 : __m256i sum32 = _mm256_setzero_si256();
3534 :
3535 123613000 : for (j = 0; j <= width - 16; j += 16) {
3536 116070000 : const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
3537 116072000 : const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
3538 115972000 : const __m256i flt0_16b = _mm256_permute4x64_epi64(
3539 : _mm256_packs_epi32(yy_loadu_256(flt0 + j),
3540 : yy_loadu_256(flt0 + j + 8)),
3541 : 0xd8);
3542 231801000 : const __m256i flt1_16b = _mm256_permute4x64_epi64(
3543 : _mm256_packs_epi32(yy_loadu_256(flt1 + j),
3544 : yy_loadu_256(flt1 + j + 8)),
3545 : 0xd8);
3546 115886000 : const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
3547 115886000 : const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
3548 115886000 : const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
3549 231771000 : const __m256i v0 = _mm256_madd_epi16(
3550 : xq_coeff,
3551 : _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
3552 231771000 : const __m256i v1 = _mm256_madd_epi16(
3553 : xq_coeff,
3554 : _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
3555 : const __m256i vr0 =
3556 231771000 : _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
3557 : const __m256i vr1 =
3558 231771000 : _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
3559 347657000 : const __m256i e0 = _mm256_sub_epi16(
3560 : _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
3561 115886000 : const __m256i err0 = _mm256_madd_epi16(e0, e0);
3562 115886000 : sum32 = _mm256_add_epi32(sum32, err0);
3563 : }
3564 :
3565 7542180 : for (; j < width; ++j) {
3566 0 : const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
3567 0 : int32_t v = xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
3568 0 : const int32_t e =
3569 0 : ROUND_POWER_OF_TWO(v, shift) + dat[j] - src[j];
3570 0 : err += e * e;
3571 : }
3572 :
3573 7542180 : dat += dat_stride;
3574 7542180 : src += src_stride;
3575 7542180 : flt0 += flt0_stride;
3576 7542180 : flt1 += flt1_stride;
3577 : const __m256i sum64_0 =
3578 7542180 : _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
3579 : const __m256i sum64_1 =
3580 15084400 : _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
3581 7542180 : sum64 = _mm256_add_epi64(sum64, sum64_0);
3582 7542180 : sum64 = _mm256_add_epi64(sum64, sum64_1);
3583 7542180 : } while (--y);
3584 : }
3585 13487 : else if (params->r[0] > 0 || params->r[1] > 0) {
3586 10333 : const int32_t xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
3587 : const __m256i xq_coeff =
3588 10333 : pair_set_epi16(xq_active, (-xq_active * (1 << SGRPROJ_RST_BITS)));
3589 10333 : const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
3590 10333 : const int32_t flt_stride =
3591 10333 : (params->r[0] > 0) ? flt0_stride : flt1_stride;
3592 :
3593 : do {
3594 3177620 : __m256i sum32 = _mm256_setzero_si256();
3595 :
3596 49613300 : for (j = 0; j <= width - 16; j += 16) {
3597 46442800 : const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
3598 46442200 : const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
3599 92874800 : const __m256i flt_16b = _mm256_permute4x64_epi64(
3600 : _mm256_packs_epi32(yy_loadu_256(flt + j),
3601 : yy_loadu_256(flt + j + 8)),
3602 : 0xd8);
3603 92871300 : const __m256i v0 = _mm256_madd_epi16(
3604 : xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0));
3605 92871300 : const __m256i v1 = _mm256_madd_epi16(
3606 : xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0));
3607 : const __m256i vr0 =
3608 92871300 : _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
3609 : const __m256i vr1 =
3610 92871300 : _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
3611 139307000 : const __m256i e0 = _mm256_sub_epi16(
3612 : _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
3613 46435600 : const __m256i err0 = _mm256_madd_epi16(e0, e0);
3614 46435600 : sum32 = _mm256_add_epi32(sum32, err0);
3615 : }
3616 :
3617 3170440 : for (; j < width; ++j) {
3618 0 : const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
3619 0 : int32_t v = xq_active * (flt[j] - u);
3620 0 : const int32_t e =
3621 0 : ROUND_POWER_OF_TWO(v, shift) + dat[j] - src[j];
3622 0 : err += e * e;
3623 : }
3624 :
3625 3170440 : dat += dat_stride;
3626 3170440 : src += src_stride;
3627 3170440 : flt += flt_stride;
3628 : const __m256i sum64_0 =
3629 3170440 : _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
3630 : const __m256i sum64_1 =
3631 6340880 : _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
3632 3170440 : sum64 = _mm256_add_epi64(sum64, sum64_0);
3633 3170440 : sum64 = _mm256_add_epi64(sum64, sum64_1);
3634 3170440 : } while (--y);
3635 : }
3636 : else {
3637 0 : __m256i sum32 = _mm256_setzero_si256();
3638 :
3639 : do {
3640 0 : for (j = 0; j <= width - 16; j += 16) {
3641 0 : const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
3642 0 : const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
3643 0 : const __m256i diff0 = _mm256_sub_epi16(d0, s0);
3644 0 : const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
3645 0 : sum32 = _mm256_add_epi32(sum32, err0);
3646 : }
3647 :
3648 0 : for (; j < width; ++j) {
3649 0 : const int32_t e = (int32_t)(dat[j]) - src[j];
3650 0 : err += e * e;
3651 : }
3652 :
3653 0 : dat += dat_stride;
3654 0 : src += src_stride;
3655 0 : } while (--y);
3656 :
3657 : const __m256i sum64_0 =
3658 0 : _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
3659 : const __m256i sum64_1 =
3660 0 : _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
3661 0 : sum64 = _mm256_add_epi64(sum64_0, sum64_1);
3662 : }
3663 :
3664 65999 : return err + _mm_cvtsi128_si64(hadd_64_avx2(sum64));
3665 : }
3666 :
3667 0 : int64_t eb_av1_highbd_pixel_proj_error_avx2(
3668 : const uint8_t *src8, int32_t width, int32_t height, int32_t src_stride,
3669 : const uint8_t *dat8, int32_t dat_stride, int32_t *flt0, int32_t flt0_stride,
3670 : int32_t *flt1, int32_t flt1_stride, int32_t xq[2], const SgrParamsType *params) {
3671 : int32_t i, j, k;
3672 0 : const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
3673 0 : const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
3674 0 : __m256i sum64 = _mm256_setzero_si256();
3675 0 : const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
3676 0 : const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
3677 0 : int64_t err = 0;
3678 0 : if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled
3679 0 : const __m256i xq0 = _mm256_set1_epi32(xq[0]);
3680 0 : const __m256i xq1 = _mm256_set1_epi32(xq[1]);
3681 0 : for (i = 0; i < height; ++i) {
3682 0 : __m256i sum32 = _mm256_setzero_si256();
3683 0 : for (j = 0; j <= width - 16; j += 16) { // Process 16 pixels at a time
3684 : // Load 16 pixels each from source image and corrupted image
3685 0 : const __m256i s0 = yy_loadu_256(src + j);
3686 0 : const __m256i d0 = yy_loadu_256(dat + j);
3687 : // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 (indices)
3688 :
3689 : // Shift-up each pixel to match filtered image scaling
3690 0 : const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
3691 :
3692 : // Split u0 into two halves and pad each from u16 to i32
3693 0 : const __m256i u0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(u0));
3694 : const __m256i u0h =
3695 0 : _mm256_cvtepu16_epi32(_mm256_extracti128_si256(u0, 1));
3696 : // u0h, u0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32
3697 :
3698 : // Load 16 pixels from each filtered image
3699 0 : const __m256i flt0l = yy_loadu_256(flt0 + j);
3700 0 : const __m256i flt0h = yy_loadu_256(flt0 + j + 8);
3701 0 : const __m256i flt1l = yy_loadu_256(flt1 + j);
3702 0 : const __m256i flt1h = yy_loadu_256(flt1 + j + 8);
3703 : // flt?l, flt?h = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32
3704 :
3705 : // Subtract shifted corrupt image from each filtered image
3706 0 : const __m256i flt0l_subu = _mm256_sub_epi32(flt0l, u0l);
3707 0 : const __m256i flt0h_subu = _mm256_sub_epi32(flt0h, u0h);
3708 0 : const __m256i flt1l_subu = _mm256_sub_epi32(flt1l, u0l);
3709 0 : const __m256i flt1h_subu = _mm256_sub_epi32(flt1h, u0h);
3710 :
3711 : // Multiply basis vectors by appropriate coefficients
3712 0 : const __m256i v0l = _mm256_mullo_epi32(flt0l_subu, xq0);
3713 0 : const __m256i v0h = _mm256_mullo_epi32(flt0h_subu, xq0);
3714 0 : const __m256i v1l = _mm256_mullo_epi32(flt1l_subu, xq1);
3715 0 : const __m256i v1h = _mm256_mullo_epi32(flt1h_subu, xq1);
3716 :
3717 : // Add together the contributions from the two basis vectors
3718 0 : const __m256i vl = _mm256_add_epi32(v0l, v1l);
3719 0 : const __m256i vh = _mm256_add_epi32(v0h, v1h);
3720 :
3721 : // Right-shift v with appropriate rounding
3722 : const __m256i vrl =
3723 0 : _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift);
3724 : const __m256i vrh =
3725 0 : _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift);
3726 : // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0]
3727 :
3728 : // Saturate each i32 to an i16 then combine both halves
3729 : // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes
3730 : const __m256i vr =
3731 0 : _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8);
3732 : // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0]
3733 : // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0]
3734 :
3735 : // Add twin-subspace-sgr-filter to corrupt image then subtract source
3736 0 : const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0);
3737 :
3738 : // Calculate squared error and add adjacent values
3739 0 : const __m256i err0 = _mm256_madd_epi16(e0, e0);
3740 :
3741 0 : sum32 = _mm256_add_epi32(sum32, err0);
3742 : }
3743 :
3744 : const __m256i sum32l =
3745 0 : _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
3746 0 : sum64 = _mm256_add_epi64(sum64, sum32l);
3747 : const __m256i sum32h =
3748 0 : _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
3749 0 : sum64 = _mm256_add_epi64(sum64, sum32h);
3750 :
3751 : // Process remaining pixels in this row (modulo 16)
3752 0 : for (k = j; k < width; ++k) {
3753 0 : const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
3754 0 : int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
3755 0 : const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
3756 0 : err += e * e;
3757 : }
3758 0 : dat += dat_stride;
3759 0 : src += src_stride;
3760 0 : flt0 += flt0_stride;
3761 0 : flt1 += flt1_stride;
3762 : }
3763 : }
3764 0 : else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled
3765 0 : const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1];
3766 0 : const __m256i xq_active = _mm256_set1_epi32(xq_on);
3767 : const __m256i xq_inactive =
3768 0 : _mm256_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS));
3769 0 : const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
3770 0 : const int32_t flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
3771 0 : for (i = 0; i < height; ++i) {
3772 0 : __m256i sum32 = _mm256_setzero_si256();
3773 0 : for (j = 0; j <= width - 16; j += 16) {
3774 : // Load 16 pixels from source image
3775 0 : const __m256i s0 = yy_loadu_256(src + j);
3776 : // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
3777 :
3778 : // Load 16 pixels from corrupted image and pad each u16 to i32
3779 0 : const __m256i d0 = yy_loadu_256(dat + j);
3780 : const __m256i d0h =
3781 0 : _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d0, 1));
3782 0 : const __m256i d0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d0));
3783 : // d0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
3784 : // d0h, d0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
3785 :
3786 : // Load 16 pixels from the filtered image
3787 0 : const __m256i flth = yy_loadu_256(flt + j + 8);
3788 0 : const __m256i fltl = yy_loadu_256(flt + j);
3789 : // flth, fltl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
3790 :
3791 0 : const __m256i flth_xq = _mm256_mullo_epi32(flth, xq_active);
3792 0 : const __m256i fltl_xq = _mm256_mullo_epi32(fltl, xq_active);
3793 0 : const __m256i d0h_xq = _mm256_mullo_epi32(d0h, xq_inactive);
3794 0 : const __m256i d0l_xq = _mm256_mullo_epi32(d0l, xq_inactive);
3795 :
3796 0 : const __m256i vh = _mm256_add_epi32(flth_xq, d0h_xq);
3797 0 : const __m256i vl = _mm256_add_epi32(fltl_xq, d0l_xq);
3798 :
3799 : // Shift this down with appropriate rounding
3800 : const __m256i vrh =
3801 0 : _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift);
3802 : const __m256i vrl =
3803 0 : _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift);
3804 : // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
3805 :
3806 : // Saturate each i32 to an i16 then combine both halves
3807 : // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes
3808 : const __m256i vr =
3809 0 : _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8);
3810 : // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] as u16
3811 : // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
3812 :
3813 : // Subtract twin-subspace-sgr filtered from source image to get error
3814 0 : const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0);
3815 :
3816 : // Calculate squared error and add adjacent values
3817 0 : const __m256i err0 = _mm256_madd_epi16(e0, e0);
3818 :
3819 0 : sum32 = _mm256_add_epi32(sum32, err0);
3820 : }
3821 :
3822 : const __m256i sum32l =
3823 0 : _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
3824 0 : sum64 = _mm256_add_epi64(sum64, sum32l);
3825 : const __m256i sum32h =
3826 0 : _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
3827 0 : sum64 = _mm256_add_epi64(sum64, sum32h);
3828 :
3829 : // Process remaining pixels in this row (modulo 16)
3830 0 : for (k = j; k < width; ++k) {
3831 0 : const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
3832 0 : int32_t v = xq_on * (flt[k] - u);
3833 0 : const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
3834 0 : err += e * e;
3835 : }
3836 0 : dat += dat_stride;
3837 0 : src += src_stride;
3838 0 : flt += flt_stride;
3839 : }
3840 : }
3841 : else { // Neither filter is enabled
3842 0 : for (i = 0; i < height; ++i) {
3843 0 : __m256i sum32 = _mm256_setzero_si256();
3844 0 : for (j = 0; j <= width - 32; j += 32) {
3845 : // Load 2x16 u16 from source image
3846 0 : const __m256i s0l = yy_loadu_256(src + j);
3847 0 : const __m256i s0h = yy_loadu_256(src + j + 16);
3848 :
3849 : // Load 2x16 u16 from corrupted image
3850 0 : const __m256i d0l = yy_loadu_256(dat + j);
3851 0 : const __m256i d0h = yy_loadu_256(dat + j + 16);
3852 :
3853 : // Subtract corrupted image from source image
3854 0 : const __m256i diffl = _mm256_sub_epi16(d0l, s0l);
3855 0 : const __m256i diffh = _mm256_sub_epi16(d0h, s0h);
3856 :
3857 : // Square error and add adjacent values
3858 0 : const __m256i err0l = _mm256_madd_epi16(diffl, diffl);
3859 0 : const __m256i err0h = _mm256_madd_epi16(diffh, diffh);
3860 :
3861 0 : sum32 = _mm256_add_epi32(sum32, err0l);
3862 0 : sum32 = _mm256_add_epi32(sum32, err0h);
3863 : }
3864 :
3865 : const __m256i sum32l =
3866 0 : _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
3867 0 : sum64 = _mm256_add_epi64(sum64, sum32l);
3868 : const __m256i sum32h =
3869 0 : _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
3870 0 : sum64 = _mm256_add_epi64(sum64, sum32h);
3871 :
3872 : // Process remaining pixels (modulu 16)
3873 0 : for (k = j; k < width; ++k) {
3874 0 : const int32_t e = (int32_t)(dat[k]) - src[k];
3875 0 : err += e * e;
3876 : }
3877 0 : dat += dat_stride;
3878 0 : src += src_stride;
3879 : }
3880 : }
3881 :
3882 : // Sum 4 values from sum64l and sum64h into err
3883 : int64_t sum[4];
3884 0 : yy_storeu_256(sum, sum64);
3885 0 : err += sum[0] + sum[1] + sum[2] + sum[3];
3886 0 : return err;
3887 : }
|