Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include <string.h>
7 :
8 : #include "EbDefinitions.h"
9 : #include "immintrin.h"
10 : #include "EbIntrinMacros_SSE2.h"
11 : #include "EbIntraPrediction_AVX2.h"
12 : #include "lpf_common_sse2.h"
13 : #include "txfm_common_avx2.h"
14 : #include "aom_dsp_rtcd.h"
15 :
16 : // Indices are sign, integer, and fractional part of the gradient value
17 : static const uint8_t gradient_to_angle_bin[2][7][16] = {
18 : {
19 : { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 },
20 : { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
21 : { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
22 : { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
23 : { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
24 : { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
25 : { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
26 : },
27 : {
28 : { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 },
29 : { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 },
30 : { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
31 : { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
32 : { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
33 : { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
34 : { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
35 : },
36 : };
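// Worked example (added for illustration, not part of the original file): for
// dx = 10, dy = 3 with the same sign (sn = 0), quot16 = (10 << 4) / 3 = 53,
// quot = 53 >> 4 = 3, remd = 53 & 15 = 5, and gradient_to_angle_bin[0][3][5]
// selects angle bin 1.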
37 :
38 8323770 : static INLINE __m256i __m256i_div_epi32(const __m256i *a, const __m256i *b)
39 : {
40 24971300 : __m256 d_f = _mm256_div_ps(_mm256_cvtepi32_ps(*a), _mm256_cvtepi32_ps(*b));
41 : // Integer divide, rounding down
42 16647500 : return _mm256_cvtps_epi32(_mm256_floor_ps(d_f));
43 : }
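// Illustrative sketch, not part of the original file: the per-lane integer
// floor division that __m256i_div_epi32() above approximates via the float
// path (ignoring float precision limits for very large operands). For the
// non-negative operands used below ((dx << 4) and dy) it is plain a / b.
static INLINE int32_t div_floor_i32_sketch(int32_t a, int32_t b) {
    int32_t q = a / b;                         // C division truncates toward zero
    if ((a % b != 0) && ((a ^ b) < 0)) --q;    // adjust to floor for mixed signs
    return q;
}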
44 :
45 4163900 : static INLINE void get_gradient_hist_avx2_internal(const __m256i *src1,
46 : const __m256i *src2, const __m256i *src3, int16_t *dy_mask_array,
47 : int16_t *quot_array, int16_t *remd_array, int16_t * sn_array,
48 : int32_t *temp_array) {
49 :
50 4163900 : const __m256i zero = _mm256_setzero_si256();
51 4163900 : const __m256i val_15_i16 = _mm256_set1_epi16(15);
52 4163900 : const __m256i val_6_i16 = _mm256_set1_epi16(6);
53 : __m256i dx, dy;
54 : __m256i tmp1_32, tmp2_32;
55 : __m256i dx1_32, dx2_32;
56 : __m256i dy1_32, dy2_32;
57 : __m256i sn;
58 : __m256i remd;
59 : __m256i quot;
60 : __m256i dy_mask;
61 :
62 4163900 : dx = _mm256_sub_epi16(*src1, *src2);
63 8327800 : dy = _mm256_sub_epi16(*src1, *src3);
64 :
65 : //sn = (dx > 0) ^ (dy > 0);
66 4163900 : sn = _mm256_xor_si256(dx, dy); //only the sign bit matters: set where dx and dy have different signs
67 4163900 : sn = _mm256_srli_epi16(sn, 15); //keep only the sign bit, giving 0 or 1 per lane
68 :
69 : //mask marking lanes where dy == 0 (1 where zero, 0 elsewhere)
70 8327800 : dy_mask = _mm256_srli_epi16(_mm256_cmpeq_epi16(dy, zero), 15);
71 :
72 : //dx = abs(dx); dy = abs(dy);
73 4163900 : dx = _mm256_abs_epi16(dx);
74 4163900 : dy = _mm256_abs_epi16(dy);
75 :
76 : dy = _mm256_add_epi16(dy, dy_mask); //make dy non-zero where dy == 0 to avoid dividing by zero; those lanes go to hist[2] later
77 :
78 : // temp = dx * dx + dy * dy;
79 8327800 : dx1_32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(dx)); //dx
80 4163900 : dy1_32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(dy)); //dy
81 :
82 12491700 : tmp1_32 = _mm256_add_epi32(
83 : _mm256_mullo_epi32(dx1_32, dx1_32),
84 : _mm256_mullo_epi32(dy1_32, dy1_32));
85 :
86 4163900 : dx2_32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dx, 1));
87 4163900 : dy2_32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dy, 1));
88 :
89 12491700 : tmp2_32 = _mm256_add_epi32(
90 : _mm256_mullo_epi32(dx2_32, dx2_32),
91 : _mm256_mullo_epi32(dy2_32, dy2_32));
92 :
93 : /* Code:
94 : quot16 = (dx << 4) / dy;
95 : quot = quot16 >> 4;
96 : remd = (quot16 & (15));
97 : Equivalent of:
98 : quot = dx / dy;
99 : remd = (dx % dy) * 16 / dy;*/
100 :
101 : //quot16 = (dx << 4) / dy;
102 4163900 : dx1_32 = _mm256_slli_epi32(dx1_32, 4);
103 4163900 : dx2_32 = _mm256_slli_epi32(dx2_32, 4);
104 4163900 : const __m256i d1_i32 = __m256i_div_epi32(&dx1_32, &dy1_32);
105 4173550 : const __m256i d2_i32 = __m256i_div_epi32(&dx2_32, &dy2_32);
106 4166010 : __m256i quot16 = _mm256_permute4x64_epi64(
107 : _mm256_packs_epi32(d1_i32, d2_i32), 0xD8);
108 :
109 4166010 : quot = _mm256_srli_epi16(quot16, 4);
110 :
111 : //remd = (quot16 & (15));
112 4166010 : remd = _mm256_and_si256(quot16, val_15_i16);
113 :
114 : //AOMMIN(remdA, 15)
115 4166010 : remd = _mm256_min_epi16(remd, val_15_i16);
116 : //AOMMIN(quotA, 6)
117 4166010 : quot = _mm256_min_epi16(quot, val_6_i16);
118 :
119 : _mm256_store_si256((__m256i *)dy_mask_array, dy_mask);
120 : _mm256_store_si256((__m256i *)quot_array, quot);
121 : _mm256_store_si256((__m256i *)remd_array, remd);
122 : _mm256_store_si256((__m256i *)sn_array, sn);
123 : _mm256_store_si256((__m256i *)temp_array, tmp1_32);
124 4166010 : _mm256_store_si256((__m256i *)&temp_array[8], tmp2_32);
125 4166010 : }
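// Illustrative sketch, not part of the original file: the per-pixel scalar
// computation that get_gradient_hist_avx2_internal() vectorizes, reconstructed
// from the comments above. Lanes with dy == 0 are flagged via dy_mask and
// accumulated into hist[2] by the caller.
static INLINE void gradient_hist_pixel_sketch(const uint8_t *px, int stride,
                                              uint64_t *hist) {
    const int dx = px[0] - px[-1];       // src[c] - src[c - 1]
    const int dy = px[0] - px[-stride];  // src[c] - src[c - stride]
    const int temp = dx * dx + dy * dy;
    if (dy == 0) {
        hist[2] += temp;
        return;
    }
    const int sn = (dx > 0) ^ (dy > 0);
    const int adx = dx < 0 ? -dx : dx;
    const int ady = dy < 0 ? -dy : dy;
    const int quot16 = (adx << 4) / ady; // quot and remd from a single division
    int quot = quot16 >> 4;
    if (quot > 6) quot = 6;              // cap as the AVX2 path does
    const int remd = quot16 & 15;        // already in [0, 15]
    hist[gradient_to_angle_bin[sn][quot][remd]] += temp;
}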
126 :
127 159253 : void av1_get_gradient_hist_avx2(const uint8_t *src, int src_stride, int rows,
128 : int cols, uint64_t *hist) {
129 159253 : src += src_stride;
130 :
131 : __m128i tmp_src;
132 : __m256i src1; //src[c]
133 : __m256i src2; //src[c-1]
134 : __m256i src3; //src[c - src_stride]
135 :
136 : DECLARE_ALIGNED(64, int16_t, dy_mask_array[16]);
137 : DECLARE_ALIGNED(64, int16_t, quot_array[16]);
138 : DECLARE_ALIGNED(64, int16_t, remd_array[16]);
139 : DECLARE_ALIGNED(64, int16_t, sn_array[16]);
140 : DECLARE_ALIGNED(64, int32_t, temp_array[16]);
141 :
142 159253 : if (cols < 8) { //i.e cols ==4
143 19295 : for (int r = 1; r < rows; r += 4) {
144 15436 : if ((r + 3) >= rows) {
145 3859 : tmp_src = _mm_set_epi32(
146 : 0,
147 3859 : *(uint32_t*)(src + 1),
148 3859 : *(uint32_t*)(src + 1 + src_stride),
149 3859 : *(uint32_t*)(src + 1 + 2 * src_stride));
150 3859 : src1 = _mm256_cvtepu8_epi16(tmp_src);
151 :
152 3859 : tmp_src = _mm_set_epi32(
153 : 0,
154 3859 : *(uint32_t*)(src),
155 3859 : *(uint32_t*)(src + src_stride),
156 3859 : *(uint32_t*)(src + 2 * src_stride));
157 3859 : src2 = _mm256_cvtepu8_epi16(tmp_src);
158 :
159 3859 : tmp_src = _mm_set_epi32(
160 : 0,
161 3859 : *(uint32_t*)(src + 1 - src_stride),
162 3859 : *(uint32_t*)(src + 1),
163 3859 : *(uint32_t*)(src + 1 + src_stride));
164 3859 : src3 = _mm256_cvtepu8_epi16(tmp_src);
165 : }
166 : else {
167 11577 : tmp_src = _mm_set_epi32(
168 11577 : *(uint32_t*)(src + 1),
169 11577 : *(uint32_t*)(src + 1 + src_stride),
170 11577 : *(uint32_t*)(src + 1 + 2 * src_stride),
171 11577 : *(uint32_t*)(src + 1 + 3 * src_stride));
172 11577 : src1 = _mm256_cvtepu8_epi16(tmp_src);
173 :
174 11577 : tmp_src = _mm_set_epi32(
175 11577 : *(uint32_t*)(src),
176 11577 : *(uint32_t*)(src + src_stride),
177 11577 : *(uint32_t*)(src + 2 * src_stride),
178 11577 : *(uint32_t*)(src + 3 * src_stride));
179 11577 : src2 = _mm256_cvtepu8_epi16(tmp_src);
180 :
181 11577 : tmp_src = _mm_set_epi32(
182 11577 : *(uint32_t*)(src + 1 - src_stride),
183 11577 : *(uint32_t*)(src + 1),
184 11577 : *(uint32_t*)(src + 1 + src_stride),
185 11577 : *(uint32_t*)(src + 1 + 2 * src_stride));
186 11577 : src3 = _mm256_cvtepu8_epi16(tmp_src);
187 : }
188 :
189 15436 : get_gradient_hist_avx2_internal(&src1, &src2, &src3, dy_mask_array,
190 : quot_array, remd_array, sn_array, temp_array);
191 :
192 15436 : if ((r + 3) >= rows) {
193 46308 : for (int w = 0; w < 11; ++w) {
194 42449 : if (w == 3 || w == 7)
195 7718 : continue;
196 34731 : if (dy_mask_array[w] != 1) {
197 31739 : int index = gradient_to_angle_bin[sn_array[w]]
198 31739 : [quot_array[w]][remd_array[w]];
199 31739 : hist[index] += temp_array[w];
200 : }
201 : else {
202 2992 : hist[2] += temp_array[w];
203 : }
204 : }
205 : }
206 : else {
207 185232 : for (int w = 0; w < 15; ++w) {
208 173655 : if (w == 3 || w == 7 || w == 11)
209 34731 : continue;
210 138924 : if (dy_mask_array[w] != 1) {
211 128333 : int index = gradient_to_angle_bin[sn_array[w]]
212 128333 : [quot_array[w]][remd_array[w]];
213 128333 : hist[index] += temp_array[w];
214 : }
215 : else {
216 10591 : hist[2] += temp_array[w];
217 : }
218 : }
219 : }
220 15436 : src += 4 * src_stride;
221 : }
222 : }
223 155394 : else if (cols < 16) { //i.e. cols == 8
224 377884 : for (int r = 1; r < rows; r += 2) {
225 308013 : if ((r + 1) >= rows) {
226 139734 : tmp_src = _mm_set1_epi64x(*(uint64_t*)(src + 1));
227 69867 : src1 = _mm256_cvtepu8_epi16(tmp_src);
228 :
229 139734 : tmp_src = _mm_set1_epi64x(*(uint64_t*)(src));
230 69867 : src2 = _mm256_cvtepu8_epi16(tmp_src);
231 :
232 139734 : tmp_src = _mm_set1_epi64x(*(uint64_t*)(src + 1 - src_stride));
233 69867 : src3 = _mm256_cvtepu8_epi16(tmp_src);
234 : }
235 : else {
236 238146 : tmp_src = _mm_set_epi64x(*(uint64_t*)(src + 1 + src_stride),
237 238146 : *(uint64_t*)(src + 1));
238 238146 : src1 = _mm256_cvtepu8_epi16(tmp_src);
239 :
240 238146 : tmp_src = _mm_set_epi64x(*(uint64_t*)(src + src_stride),
241 238146 : *(uint64_t*)(src));
242 238146 : src2 = _mm256_cvtepu8_epi16(tmp_src);
243 :
244 238146 : tmp_src = _mm_set_epi64x(*(uint64_t*)(src + 1),
245 238146 : *(uint64_t*)(src + 1 - src_stride));
246 238146 : src3 = _mm256_cvtepu8_epi16(tmp_src);
247 : }
248 :
249 308013 : get_gradient_hist_avx2_internal(&src1, &src2, &src3, dy_mask_array,
250 : quot_array, remd_array, sn_array, temp_array);
251 :
252 308085 : if ((r + 1) >= rows) {
253 558939 : for (int w = 0; w < 7; ++w) {
254 489062 : if (dy_mask_array[w] != 1) {
255 421559 : int index = gradient_to_angle_bin[sn_array[w]]
256 421559 : [quot_array[w]][remd_array[w]];
257 421559 : hist[index] += temp_array[w];
258 : }
259 : else {
260 67503 : hist[2] += temp_array[w];
261 : }
262 : }
263 : }
264 : else {
265 3807580 : for (int w = 0; w < 15; ++w) {
266 3569380 : if (w == 7)
267 238266 : continue;
268 3331110 : if (dy_mask_array[w] != 1) {
269 2880700 : int index = gradient_to_angle_bin[sn_array[w]]
270 2880700 : [quot_array[w]][remd_array[w]];
271 2880700 : hist[index] += temp_array[w];
272 : }
273 : else {
274 450407 : hist[2] += temp_array[w];
275 : }
276 : }
277 : }
278 308085 : src += 2 * src_stride;
279 : }
280 : }
281 : else {
282 2031160 : for (int r = 1; r < rows; ++r) {
283 1928880 : int c = 1;
284 5786400 : for (; cols - c >= 15; c += 16) {
285 :
286 : //loads 16 pixels per iteration, although only 15 are valid on the last one
287 3840840 : src1 = _mm256_cvtepu8_epi16(
288 3840840 : _mm_loadu_si128((__m128i const*)&src[c]));
289 3840840 : src2 = _mm256_cvtepu8_epi16(
290 3840840 : _mm_loadu_si128((__m128i const*)&src[c - 1]));
291 3840840 : src3 = _mm256_cvtepu8_epi16(
292 3840840 : _mm_loadu_si128((__m128i const*)&src[c - src_stride]));
293 :
294 3840840 : get_gradient_hist_avx2_internal(&src1, &src2, &src3,
295 : dy_mask_array, quot_array, remd_array, sn_array, temp_array);
296 :
297 3857520 : int max = 16;
298 3857520 : if (c + 16 > cols) {
299 1928680 : max = 15;
300 : }
301 :
302 62670700 : for (int w = 0; w < max; ++w) {
303 :
304 58813100 : if (dy_mask_array[w] != 1) {
305 40042400 : int index = gradient_to_angle_bin[sn_array[w]]
306 40042400 : [quot_array[w]][remd_array[w]];
307 40042400 : hist[index] += temp_array[w];
308 : }
309 : else {
310 18770700 : hist[2] += temp_array[w];
311 : }
312 : }
313 : }
314 1945560 : src += src_stride;
315 : }
316 : }
317 176004 : }
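// Usage sketch (illustrative; the 8-bin histogram size is an assumption based
// on the bin values in gradient_to_angle_bin): the caller zeroes the histogram
// and passes src pointing at the top-left of the block; gradients are taken
// from row 1 / column 1 onward.
//   uint64_t hist[8] = { 0 };
//   av1_get_gradient_hist_avx2(block_ptr, block_stride, block_height, block_width, hist);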
318 :
319 : #ifndef _mm256_setr_m128i
320 : #define _mm256_setr_m128i(/* __m128i */ lo, /* __m128i */ hi) \
321 : _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
322 : #endif
323 :
324 : #define MACRO_VERTICAL_LUMA_4(A, B, C) \
325 : *(uint32_t*)prediction_ptr = _mm_cvtsi128_si32(_mm_or_si128(_mm_and_si128(A, B), C)); \
326 : A = _mm_srli_si128(A, 1); \
327 : *(uint32_t*)(prediction_ptr + pStride) = _mm_cvtsi128_si32(_mm_or_si128(_mm_and_si128(A, B), C)); \
328 : A = _mm_srli_si128(A, 1);
329 :
330 : #define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
331 : _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
332 :
333 0 : static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
334 : __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
335 :
336 0 : r0 = _mm_unpacklo_epi16(x[0], x[1]);
337 0 : r1 = _mm_unpacklo_epi16(x[2], x[3]);
338 0 : r2 = _mm_unpacklo_epi16(x[4], x[5]);
339 0 : r3 = _mm_unpacklo_epi16(x[6], x[7]);
340 :
341 0 : r4 = _mm_unpacklo_epi16(x[8], x[9]);
342 0 : r5 = _mm_unpacklo_epi16(x[10], x[11]);
343 0 : r6 = _mm_unpacklo_epi16(x[12], x[13]);
344 0 : r7 = _mm_unpacklo_epi16(x[14], x[15]);
345 :
346 0 : r8 = _mm_unpacklo_epi32(r0, r1);
347 0 : r9 = _mm_unpackhi_epi32(r0, r1);
348 0 : r10 = _mm_unpacklo_epi32(r2, r3);
349 0 : r11 = _mm_unpackhi_epi32(r2, r3);
350 :
351 0 : r12 = _mm_unpacklo_epi32(r4, r5);
352 0 : r13 = _mm_unpackhi_epi32(r4, r5);
353 0 : r14 = _mm_unpacklo_epi32(r6, r7);
354 0 : r15 = _mm_unpackhi_epi32(r6, r7);
355 :
356 0 : r0 = _mm_unpacklo_epi64(r8, r9);
357 0 : r1 = _mm_unpackhi_epi64(r8, r9);
358 0 : r2 = _mm_unpacklo_epi64(r10, r11);
359 0 : r3 = _mm_unpackhi_epi64(r10, r11);
360 :
361 0 : r4 = _mm_unpacklo_epi64(r12, r13);
362 0 : r5 = _mm_unpackhi_epi64(r12, r13);
363 0 : r6 = _mm_unpacklo_epi64(r14, r15);
364 0 : r7 = _mm_unpackhi_epi64(r14, r15);
365 :
366 0 : d[0] = _mm_unpacklo_epi64(r0, r2);
367 0 : d[1] = _mm_unpacklo_epi64(r4, r6);
368 0 : d[2] = _mm_unpacklo_epi64(r1, r3);
369 0 : d[3] = _mm_unpacklo_epi64(r5, r7);
370 :
371 0 : d[4] = _mm_unpackhi_epi64(r0, r2);
372 0 : d[5] = _mm_unpackhi_epi64(r4, r6);
373 0 : d[6] = _mm_unpackhi_epi64(r1, r3);
374 0 : d[7] = _mm_unpackhi_epi64(r5, r7);
375 0 : }
376 0 : static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
377 : __m256i w0, w1, w2, w3, ww0, ww1;
378 :
379 0 : w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
380 0 : w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
381 0 : w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53
382 0 : w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73
383 :
384 0 : ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
385 0 : ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
386 :
387 0 : d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
388 0 : d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
389 :
390 0 : ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
391 0 : ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
392 :
393 0 : d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
394 0 : d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
395 0 : }
396 :
397 0 : static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
398 : __m256i w0, w1, w2, w3, ww0, ww1;
399 :
400 0 : w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
401 0 : w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
402 0 : w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53
403 0 : w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73
404 :
405 0 : ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
406 0 : ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
407 :
408 0 : d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
409 0 : d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
410 :
411 0 : ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
412 0 : ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
413 :
414 0 : d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
415 0 : d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
416 :
417 0 : w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17
418 0 : w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37
419 0 : w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57
420 0 : w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77
421 :
422 0 : ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
423 0 : ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75
424 :
425 0 : d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74
426 0 : d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75
427 :
428 0 : ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
429 0 : ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77
430 :
431 0 : d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76
432 0 : d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
433 0 : }
434 :
435 : // TODO(luoyi) The following two functions are shared with intrapred_sse2.c.
436 : // Use a header file, intrapred_common_x86.h
437 123034 : static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
438 123034 : __m128i x = _mm_load_si128((__m128i const *)ref);
439 123034 : const __m128i zero = _mm_setzero_si128();
440 123034 : x = _mm_sad_epu8(x, zero);
441 123034 : const __m128i high = _mm_unpackhi_epi64(x, x);
442 123034 : return _mm_add_epi16(x, high);
443 : }
444 :
445 114153 : static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
446 114153 : __m128i x0 = _mm_load_si128((__m128i const *)ref);
447 228306 : __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
448 114153 : const __m128i zero = _mm_setzero_si128();
449 114153 : x0 = _mm_sad_epu8(x0, zero);
450 114153 : x1 = _mm_sad_epu8(x1, zero);
451 114153 : x0 = _mm_add_epi16(x0, x1);
452 114153 : const __m128i high = _mm_unpackhi_epi64(x0, x0);
453 114153 : return _mm_add_epi16(x0, high);
454 : }
455 :
456 270523 : static INLINE __m256i dc_sum_32(const uint8_t *ref) {
457 270523 : const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
458 270523 : const __m256i zero = _mm256_setzero_si256();
459 270523 : __m256i y = _mm256_sad_epu8(x, zero);
460 270523 : __m256i u = _mm256_permute2x128_si256(y, y, 1);
461 270523 : y = _mm256_add_epi64(u, y);
462 270523 : u = _mm256_unpackhi_epi64(y, y);
463 270523 : return _mm256_add_epi16(y, u);
464 : }
465 642432 : static INLINE void row_store_32xh(const __m256i *r, int32_t height, uint8_t *dst,
466 : ptrdiff_t stride) {
467 16513500 : for (int32_t i = 0; i < height; ++i) {
468 15871000 : _mm256_storeu_si256((__m256i *)dst, *r);
469 15871000 : dst += stride;
470 : }
471 642432 : }
472 :
473 10710 : static INLINE void row_store_64xh(const __m256i *r, int32_t height, uint8_t *dst,
474 : ptrdiff_t stride) {
475 521085 : for (int32_t i = 0; i < height; ++i) {
476 510375 : _mm256_storeu_si256((__m256i *)dst, *r);
477 510375 : _mm256_storeu_si256((__m256i *)(dst + 32), *r);
478 510375 : dst += stride;
479 : }
480 10710 : }
481 17141 : static INLINE __m256i dc_sum_64(const uint8_t *ref) {
482 17141 : const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
483 34282 : const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
484 17141 : const __m256i zero = _mm256_setzero_si256();
485 17141 : __m256i y0 = _mm256_sad_epu8(x0, zero);
486 17141 : __m256i y1 = _mm256_sad_epu8(x1, zero);
487 17141 : y0 = _mm256_add_epi64(y0, y1);
488 17141 : __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
489 17141 : y0 = _mm256_add_epi64(u0, y0);
490 17141 : u0 = _mm256_unpackhi_epi64(y0, y0);
491 17141 : return _mm256_add_epi16(y0, u0);
492 : }
493 4789 : void eb_aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
494 : const uint8_t *above, const uint8_t *left) {
495 4789 : const __m256i sum_above = dc_sum_64(above);
496 4790 : __m256i sum_left = dc_sum_64(left);
497 4792 : sum_left = _mm256_add_epi16(sum_left, sum_above);
498 4792 : uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
499 4792 : sum += 64;
500 4792 : sum /= 128;
501 4792 : const __m256i row = _mm256_set1_epi8((uint8_t)sum);
502 4792 : row_store_64xh(&row, 64, dst, stride);
503 4795 : }
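// Illustrative sketch, not part of the original file: the scalar DC value the
// 64x64 predictor above broadcasts, i.e. the rounded mean of the 64 above and
// 64 left neighbor samples ((sum + 64) / 128).
static INLINE uint8_t dc_64x64_value_sketch(const uint8_t *above, const uint8_t *left) {
    uint32_t sum = 0;
    for (int i = 0; i < 64; ++i)
        sum += above[i] + left[i];
    return (uint8_t)((sum + 64) / 128);
}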
504 :
505 1072 : void eb_aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
506 : const uint8_t *above,
507 : const uint8_t *left) {
508 1072 : __m256i sum = dc_sum_64(left);
509 : (void)above;
510 :
511 1072 : const __m256i thirtytwo = _mm256_set1_epi16(32);
512 1072 : sum = _mm256_add_epi16(sum, thirtytwo);
513 1072 : sum = _mm256_srai_epi16(sum, 6);
514 1072 : const __m256i zero = _mm256_setzero_si256();
515 1072 : __m256i row = _mm256_shuffle_epi8(sum, zero);
516 1072 : row_store_64xh(&row, 64, dst, stride);
517 1072 : }
518 530 : void eb_aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
519 : const uint8_t *above,
520 : const uint8_t *left) {
521 530 : __m256i sum = dc_sum_64(above);
522 : (void)left;
523 :
524 530 : const __m256i thirtytwo = _mm256_set1_epi16(32);
525 530 : sum = _mm256_add_epi16(sum, thirtytwo);
526 530 : sum = _mm256_srai_epi16(sum, 6);
527 530 : const __m256i zero = _mm256_setzero_si256();
528 530 : __m256i row = _mm256_shuffle_epi8(sum, zero);
529 530 : row_store_64xh(&row, 64, dst, stride);
530 530 : }
531 14623 : void eb_aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
532 : const uint8_t *above,
533 : const uint8_t *left) {
534 14623 : __m256i sum = dc_sum_32(above);
535 : (void)left;
536 :
537 14623 : const __m256i sixteen = _mm256_set1_epi16(16);
538 14623 : sum = _mm256_add_epi16(sum, sixteen);
539 14623 : sum = _mm256_srai_epi16(sum, 5);
540 14623 : const __m256i zero = _mm256_setzero_si256();
541 14623 : __m256i row = _mm256_shuffle_epi8(sum, zero);
542 14623 : row_store_32xh(&row, 32, dst, stride);
543 14623 : }
544 12499 : void eb_aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
545 : const uint8_t *above,
546 : const uint8_t *left) {
547 12499 : __m256i sum = dc_sum_32(left);
548 : (void)above;
549 :
550 12499 : const __m256i sixteen = _mm256_set1_epi16(16);
551 12499 : sum = _mm256_add_epi16(sum, sixteen);
552 12499 : sum = _mm256_srai_epi16(sum, 5);
553 12499 : const __m256i zero = _mm256_setzero_si256();
554 12499 : __m256i row = _mm256_shuffle_epi8(sum, zero);
555 12499 : row_store_32xh(&row, 32, dst, stride);
556 12499 : }
557 118 : void eb_aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
558 : const uint8_t *above,
559 : const uint8_t *left) {
560 : (void)above;
561 : (void)left;
562 118 : const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
563 118 : row_store_64xh(&row, 64, dst, stride);
564 118 : }
565 434 : void eb_aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
566 : const uint8_t *above,
567 : const uint8_t *left) {
568 : (void)above;
569 : (void)left;
570 434 : const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
571 434 : row_store_32xh(&row, 32, dst, stride);
572 434 : }
573 :
574 114153 : void eb_aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
575 : const uint8_t *above, const uint8_t *left) {
576 114153 : const __m128i top_sum = dc_sum_32_sse2(above);
577 114153 : __m128i left_sum = dc_sum_16_sse2(left);
578 114153 : left_sum = _mm_add_epi16(top_sum, left_sum);
579 114153 : uint32_t sum = _mm_cvtsi128_si32(left_sum);
580 114153 : sum += 24;
581 114153 : sum /= 48;
582 114153 : const __m256i row = _mm256_set1_epi8((uint8_t)sum);
583 114153 : row_store_32xh(&row, 16, dst, stride);
584 114153 : }
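// Illustrative sketch, not part of the original file: the rectangular DC
// predictors in this file divide the combined neighbor sum by (w + h) with
// rounding, e.g. +24 / 48 for 32x16, +48 / 96 for 32x64 and 64x32, and
// +40 / 80 for 64x16.
static INLINE uint32_t dc_rect_value_sketch(uint32_t sum_above, uint32_t sum_left,
                                            uint32_t w, uint32_t h) {
    return (sum_above + sum_left + (w + h) / 2) / (w + h);
}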
585 :
586 1729 : void eb_aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
587 : const uint8_t *above, const uint8_t *left) {
588 1729 : const __m256i sum_above = dc_sum_32(above);
589 1729 : __m256i sum_left = dc_sum_64(left);
590 1729 : sum_left = _mm256_add_epi16(sum_left, sum_above);
591 1729 : uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
592 1729 : sum += 48;
593 1729 : sum /= 96;
594 1729 : const __m256i row = _mm256_set1_epi8((uint8_t)sum);
595 1729 : row_store_32xh(&row, 64, dst, stride);
596 1729 : }
597 :
598 1222 : void eb_aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
599 : const uint8_t *above, const uint8_t *left) {
600 1222 : const __m256i sum_above = dc_sum_64(above);
601 1222 : __m256i sum_left = dc_sum_32(left);
602 1222 : sum_left = _mm256_add_epi16(sum_left, sum_above);
603 1222 : uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
604 1222 : sum += 48;
605 1222 : sum /= 96;
606 1222 : const __m256i row = _mm256_set1_epi8((uint8_t)sum);
607 1222 : row_store_64xh(&row, 32, dst, stride);
608 1222 : }
609 :
610 1938 : void eb_aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
611 : const uint8_t *above, const uint8_t *left) {
612 1938 : const __m256i sum_above = dc_sum_64(above);
613 3876 : __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
614 1938 : sum_left = _mm256_add_epi16(sum_left, sum_above);
615 1938 : uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
616 1938 : sum += 40;
617 1938 : sum /= 80;
618 1938 : const __m256i row = _mm256_set1_epi8((uint8_t)sum);
619 1938 : row_store_64xh(&row, 16, dst, stride);
620 1938 : }
621 :
622 6854 : void eb_aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
623 : const uint8_t *above,
624 : const uint8_t *left) {
625 6854 : __m128i sum = dc_sum_16_sse2(left);
626 : (void)above;
627 :
628 6854 : const __m128i eight = _mm_set1_epi16(8);
629 6854 : sum = _mm_add_epi16(sum, eight);
630 6854 : sum = _mm_srai_epi16(sum, 4);
631 6854 : const __m128i zero = _mm_setzero_si128();
632 6854 : const __m128i r = _mm_shuffle_epi8(sum, zero);
633 6854 : const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
634 6854 : row_store_32xh(&row, 16, dst, stride);
635 6854 : }
636 :
637 258 : void eb_aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
638 : const uint8_t *above,
639 : const uint8_t *left) {
640 258 : __m256i sum = dc_sum_64(left);
641 : (void)above;
642 :
643 258 : const __m256i thirtytwo = _mm256_set1_epi16(32);
644 258 : sum = _mm256_add_epi16(sum, thirtytwo);
645 258 : sum = _mm256_srai_epi16(sum, 6);
646 258 : const __m256i zero = _mm256_setzero_si256();
647 258 : __m256i row = _mm256_shuffle_epi8(sum, zero);
648 258 : row_store_32xh(&row, 64, dst, stride);
649 258 : }
650 :
651 86 : void eb_aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
652 : const uint8_t *above,
653 : const uint8_t *left) {
654 86 : __m256i sum = dc_sum_32(left);
655 : (void)above;
656 :
657 86 : const __m256i sixteen = _mm256_set1_epi16(16);
658 86 : sum = _mm256_add_epi16(sum, sixteen);
659 86 : sum = _mm256_srai_epi16(sum, 5);
660 86 : const __m256i zero = _mm256_setzero_si256();
661 86 : __m256i row = _mm256_shuffle_epi8(sum, zero);
662 86 : row_store_64xh(&row, 32, dst, stride);
663 86 : }
664 :
665 89 : void eb_aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
666 : const uint8_t *above,
667 : const uint8_t *left) {
668 89 : __m128i sum = dc_sum_16_sse2(left);
669 : (void)above;
670 :
671 89 : const __m128i eight = _mm_set1_epi16(8);
672 89 : sum = _mm_add_epi16(sum, eight);
673 89 : sum = _mm_srai_epi16(sum, 4);
674 89 : const __m128i zero = _mm_setzero_si128();
675 89 : const __m128i r = _mm_shuffle_epi8(sum, zero);
676 89 : const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
677 89 : row_store_64xh(&row, 16, dst, stride);
678 89 : }
679 :
680 19822 : void eb_aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
681 : const uint8_t *above,
682 : const uint8_t *left) {
683 19822 : __m256i sum = dc_sum_32(above);
684 : (void)left;
685 :
686 19822 : const __m256i sixteen = _mm256_set1_epi16(16);
687 19822 : sum = _mm256_add_epi16(sum, sixteen);
688 19822 : sum = _mm256_srai_epi16(sum, 5);
689 19822 : const __m256i zero = _mm256_setzero_si256();
690 19822 : __m256i row = _mm256_shuffle_epi8(sum, zero);
691 19822 : row_store_32xh(&row, 16, dst, stride);
692 19822 : }
693 :
694 138 : void eb_aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
695 : const uint8_t *above,
696 : const uint8_t *left) {
697 138 : __m256i sum = dc_sum_32(above);
698 : (void)left;
699 :
700 138 : const __m256i sixteen = _mm256_set1_epi16(16);
701 138 : sum = _mm256_add_epi16(sum, sixteen);
702 138 : sum = _mm256_srai_epi16(sum, 5);
703 138 : const __m256i zero = _mm256_setzero_si256();
704 138 : __m256i row = _mm256_shuffle_epi8(sum, zero);
705 138 : row_store_32xh(&row, 64, dst, stride);
706 138 : }
707 :
708 347 : void eb_aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
709 : const uint8_t *above,
710 : const uint8_t *left) {
711 347 : __m256i sum = dc_sum_64(above);
712 : (void)left;
713 :
714 347 : const __m256i thirtytwo = _mm256_set1_epi16(32);
715 347 : sum = _mm256_add_epi16(sum, thirtytwo);
716 347 : sum = _mm256_srai_epi16(sum, 6);
717 347 : const __m256i zero = _mm256_setzero_si256();
718 347 : __m256i row = _mm256_shuffle_epi8(sum, zero);
719 347 : row_store_64xh(&row, 32, dst, stride);
720 347 : }
721 :
722 469 : void eb_aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
723 : const uint8_t *above,
724 : const uint8_t *left) {
725 469 : __m256i sum = dc_sum_64(above);
726 : (void)left;
727 :
728 469 : const __m256i thirtytwo = _mm256_set1_epi16(32);
729 469 : sum = _mm256_add_epi16(sum, thirtytwo);
730 469 : sum = _mm256_srai_epi16(sum, 6);
731 469 : const __m256i zero = _mm256_setzero_si256();
732 469 : __m256i row = _mm256_shuffle_epi8(sum, zero);
733 469 : row_store_64xh(&row, 16, dst, stride);
734 469 : }
735 :
736 449 : void eb_aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
737 : const uint8_t *above,
738 : const uint8_t *left) {
739 : (void)above;
740 : (void)left;
741 449 : const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
742 449 : row_store_32xh(&row, 16, dst, stride);
743 449 : }
744 23 : void eb_aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
745 : const uint8_t *above,
746 : const uint8_t *left) {
747 : (void)above;
748 : (void)left;
749 23 : const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
750 23 : row_store_32xh(&row, 64, dst, stride);
751 23 : }
752 23 : void eb_aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
753 : const uint8_t *above,
754 : const uint8_t *left) {
755 : (void)above;
756 : (void)left;
757 23 : const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
758 23 : row_store_64xh(&row, 16, dst, stride);
759 23 : }
760 23 : void eb_aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
761 : const uint8_t *above,
762 : const uint8_t *left) {
763 : (void)above;
764 : (void)left;
765 23 : const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
766 23 : row_store_64xh(&row, 32, dst, stride);
767 23 : }
768 :
769 : // There are 32 rows in total. This function handles lines
770 : // 0,1,2,3 and 16,17,18,19. The next call handles
771 : // 4,5,6,7 and 20,21,22,23, so four calls cover
772 : // all 32 rows.
773 1506500 : static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
774 : ptrdiff_t stride) {
775 : __m256i t[4];
776 1506500 : __m256i m = _mm256_setzero_si256();
777 1506500 : const __m256i inc = _mm256_set1_epi8(4);
778 : int32_t i;
779 :
780 7532280 : for (i = 0; i < 4; i++) {
781 6025780 : t[i] = _mm256_shuffle_epi8(*row, m);
782 6025780 : __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
783 6025780 : __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
784 : _mm256_storeu_si256((__m256i *)dst, r0);
785 6025780 : _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
786 6025780 : dst += stride;
787 6025780 : m = _mm256_add_epi8(m, inc);
788 : }
789 1506500 : }
790 :
791 376632 : void eb_aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
792 : const uint8_t *above, const uint8_t *left) {
793 : (void)above;
794 376632 : const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
795 :
796 376632 : __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
797 :
798 376632 : __m256i v = _mm256_unpacklo_epi8(u, u);
799 376632 : h_predictor_32x8line(&v, dst, stride);
800 376633 : dst += stride << 2;
801 :
802 376633 : v = _mm256_unpackhi_epi8(u, u);
803 376633 : h_predictor_32x8line(&v, dst, stride);
804 376634 : dst += stride << 2;
805 :
806 376634 : u = _mm256_unpackhi_epi8(left_col, left_col);
807 :
808 376634 : v = _mm256_unpacklo_epi8(u, u);
809 376634 : h_predictor_32x8line(&v, dst, stride);
810 376633 : dst += stride << 2;
811 :
812 376633 : v = _mm256_unpackhi_epi8(u, u);
813 376633 : h_predictor_32x8line(&v, dst, stride);
814 376634 : }
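// Illustrative sketch, not part of the original file: the scalar behavior of
// eb_aom_h_predictor_32x32_avx2() above; each output row is filled with its
// left-neighbor sample (memset comes from <string.h>, included at the top).
static INLINE void h_predictor_32x32_scalar_sketch(uint8_t *dst, ptrdiff_t stride,
                                                   const uint8_t *left) {
    for (int r = 0; r < 32; ++r)
        memset(dst + r * stride, left[r], 32);
}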
815 4416 : static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
816 : int32_t height, uint8_t *dst,
817 : ptrdiff_t stride) {
818 186128 : for (int32_t i = 0; i < height; ++i) {
819 181712 : _mm256_storeu_si256((__m256i *)dst, *r0);
820 181712 : _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
821 181712 : dst += stride;
822 : }
823 4416 : }
824 2000 : void eb_aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
825 : const uint8_t *above, const uint8_t *left) {
826 2000 : const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
827 2000 : const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
828 : (void)left;
829 2000 : row_store_32x2xh(&row0, &row1, 64, dst, stride);
830 2000 : }
831 201131 : void eb_aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
832 : const uint8_t *above, const uint8_t *left) {
833 201131 : const __m256i row = _mm256_loadu_si256((const __m256i *)above);
834 : (void)left;
835 201131 : row_store_32xh(&row, 32, dst, stride);
836 201132 : }
837 :
838 158700 : void eb_aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
839 : const uint8_t *above, const uint8_t *left) {
840 158700 : const __m256i row = _mm256_loadu_si256((const __m256i *)above);
841 : (void)left;
842 158700 : row_store_32xh(&row, 16, dst, stride);
843 158700 : }
844 1417 : void eb_aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
845 : const uint8_t *above, const uint8_t *left) {
846 1417 : const __m256i row = _mm256_loadu_si256((const __m256i *)above);
847 : (void)left;
848 1417 : row_store_32xh(&row, 64, dst, stride);
849 1417 : }
850 1475 : void eb_aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
851 : const uint8_t *above, const uint8_t *left) {
852 1475 : const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
853 1475 : const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
854 : (void)left;
855 1475 : row_store_32x2xh(&row0, &row1, 16, dst, stride);
856 1475 : }
857 941 : void eb_aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
858 : const uint8_t *above, const uint8_t *left) {
859 941 : const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
860 941 : const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
861 : (void)left;
862 941 : row_store_32x2xh(&row0, &row1, 32, dst, stride);
863 941 : }
864 :
865 110202 : void eb_aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
866 : const uint8_t *above, const uint8_t *left) {
867 110202 : const __m256i sum_above = dc_sum_32(above);
868 110205 : __m256i sum_left = dc_sum_32(left);
869 110212 : sum_left = _mm256_add_epi16(sum_left, sum_above);
870 110212 : const __m256i thirtytwo = _mm256_set1_epi16(32);
871 110212 : sum_left = _mm256_add_epi16(sum_left, thirtytwo);
872 110212 : sum_left = _mm256_srai_epi16(sum_left, 6);
873 110212 : const __m256i zero = _mm256_setzero_si256();
874 110212 : __m256i row = _mm256_shuffle_epi8(sum_left, zero);
875 110212 : row_store_32xh(&row, 32, dst, stride);
876 110212 : }
877 :
878 : // only define these intrinsics if immintrin.h doesn't have them
879 : #if defined(_MSC_VER) && _MSC_VER < 1910
880 : static inline int32_t _mm256_extract_epi32(__m256i a, const int32_t i)
881 : {
882 : return a.m256i_i32[i & 7];
883 : }
884 :
885 : static inline __m256i _mm256_insert_epi32(__m256i a, int32_t b, const int32_t i)
886 : {
887 : __m256i c = a;
888 : c.m256i_i32[i & 7] = b;
889 : return c;
890 : }
891 : #endif
892 :
893 : #define PERM4x64(c0, c1, c2, c3) c0+(c1<<2)+(c2<<4)+(c3<<6)
894 : #define PERM2x128(c0, c1) c0+(c1<<4)
895 :
896 0 : void transpose_16bit_TX_4X4(const uint16_t *src, uint32_t srcStride, uint16_t *dst, uint32_t dstStride)
897 : {
898 0 : assert(srcStride == 4);
899 : (void)srcStride;
900 :
901 0 : if (dstStride == 4)
902 : {
903 0 : __m128i s = _mm_loadu_si128((__m128i*)src);
904 0 : __m128i r1 = _mm_srli_si128(s, 8);
905 0 : __m128i r2 = _mm_loadu_si128((__m128i*)(src + 8));
906 0 : __m128i r3 = _mm_srli_si128(r2, 8);
907 :
908 0 : __m128i r0_Lo = _mm_unpacklo_epi16(s, r1);
909 0 : __m128i r2_Lo = _mm_unpacklo_epi16(r2, r3);
910 0 : __m128i r1_Lo = _mm_unpacklo_epi32(r0_Lo, r2_Lo);
911 0 : r0_Lo = _mm_unpackhi_epi32(r0_Lo, r2_Lo);
912 :
913 : _mm_storeu_si128((__m128i*)(dst + 0 * dstStride), r1_Lo);
914 0 : _mm_storeu_si128((__m128i*)(dst + 2 * dstStride), r0_Lo);
915 : }
916 : else
917 : {
918 0 : __m128i s = _mm_loadu_si128((__m128i*)src);
919 0 : __m128i r1 = _mm_srli_si128(s, 8);
920 0 : __m128i r2 = _mm_loadu_si128((__m128i*)(src + 8));
921 0 : __m128i r3 = _mm_srli_si128(r2, 8);
922 :
923 0 : __m128i r0_Lo = _mm_unpacklo_epi16(s, r1);
924 0 : __m128i r2_Lo = _mm_unpacklo_epi16(r2, r3);
925 0 : __m128i r1_Lo = _mm_unpacklo_epi32(r0_Lo, r2_Lo);
926 0 : r0_Lo = _mm_unpackhi_epi32(r0_Lo, r2_Lo);
927 :
928 0 : _mm_storel_epi64((__m128i*)(dst + 0 * dstStride), r1_Lo);
929 0 : _mm_storel_epi64((__m128i*)(dst + 1 * dstStride), _mm_srli_si128(r1_Lo, 8));
930 0 : _mm_storel_epi64((__m128i*)(dst + 2 * dstStride), r0_Lo);
931 0 : _mm_storel_epi64((__m128i*)(dst + 3 * dstStride), _mm_srli_si128(r0_Lo, 8));
932 : }
933 0 : }
934 0 : void transpose_16bit_TX_8X8(const uint16_t *src, uint32_t srcStride, uint16_t *dst, uint32_t dstStride)
935 : {
936 : __m128i r0, r1, r2, r3, r4, r5, r6, r7, r0_Lo, r1_Lo, r2_Lo, r3_Lo, r4_Lo, r5_Lo, r6_Lo;
937 0 : r0 = _mm_loadu_si128((__m128i*)(src + 0 * srcStride)); // 07,06,05,04,03,02,01,00
938 0 : r1 = _mm_loadu_si128((__m128i*)(src + 1 * srcStride)); // 17,16,15,14,13,12,11,10
939 0 : r2 = _mm_loadu_si128((__m128i*)(src + 2 * srcStride)); // 27,26,25,24,23,22,21,20
940 0 : r3 = _mm_loadu_si128((__m128i*)(src + 3 * srcStride)); // 37,36,35,34,33,32,31,30
941 0 : r4 = _mm_loadu_si128((__m128i*)(src + 4 * srcStride)); // 47,46,45,44,43,42,41,40
942 0 : r5 = _mm_loadu_si128((__m128i*)(src + 5 * srcStride)); // 57,56,55,54,53,52,51,50
943 0 : r6 = _mm_loadu_si128((__m128i*)(src + 6 * srcStride)); // 67,66,65,64,63,62,61,60
944 0 : r7 = _mm_loadu_si128((__m128i*)(src + 7 * srcStride)); // 77,76,75,74,73,72,71,70
945 :
946 0 : r0_Lo = _mm_unpacklo_epi16(r0, r1);
947 0 : r2_Lo = _mm_unpacklo_epi16(r2, r3);
948 0 : r4_Lo = _mm_unpacklo_epi16(r4, r5);
949 0 : r6_Lo = _mm_unpacklo_epi16(r6, r7);
950 :
951 0 : r1_Lo = r0_Lo;
952 0 : r0_Lo = _mm_unpacklo_epi32(r0_Lo, r2_Lo);
953 0 : r1_Lo = _mm_unpackhi_epi32(r1_Lo, r2_Lo);
954 0 : r5_Lo = r4_Lo;
955 0 : r4_Lo = _mm_unpacklo_epi32(r4_Lo, r6_Lo);
956 0 : r5_Lo = _mm_unpackhi_epi32(r5_Lo, r6_Lo);
957 0 : r2_Lo = r0_Lo;
958 0 : r0_Lo = _mm_unpacklo_epi64(r0_Lo, r4_Lo); //64
959 0 : r2_Lo = _mm_unpackhi_epi64(r2_Lo, r4_Lo);
960 0 : r3_Lo = r1_Lo;
961 0 : r1_Lo = _mm_unpacklo_epi64(r1_Lo, r5_Lo);
962 0 : r3_Lo = _mm_unpackhi_epi64(r3_Lo, r5_Lo);
963 :
964 : _mm_storeu_si128((__m128i*)(dst + 0 * dstStride), r0_Lo);
965 0 : _mm_storeu_si128((__m128i*)(dst + 1 * dstStride), r2_Lo);
966 0 : _mm_storeu_si128((__m128i*)(dst + 2 * dstStride), r1_Lo);
967 0 : _mm_storeu_si128((__m128i*)(dst + 3 * dstStride), r3_Lo);
968 :
969 0 : r0 = _mm_unpackhi_epi16(r0, r1);
970 0 : r2 = _mm_unpackhi_epi16(r2, r3);
971 0 : r4 = _mm_unpackhi_epi16(r4, r5);
972 0 : r6 = _mm_unpackhi_epi16(r6, r7);
973 :
974 0 : r1 = r0;
975 0 : r0 = _mm_unpacklo_epi32(r0, r2);
976 0 : r1 = _mm_unpackhi_epi32(r1, r2);
977 0 : r5 = r4;
978 0 : r4 = _mm_unpacklo_epi32(r4, r6);
979 0 : r5 = _mm_unpackhi_epi32(r5, r6);
980 0 : r2 = r0;
981 0 : r0 = _mm_unpacklo_epi64(r0, r4);
982 0 : r2 = _mm_unpackhi_epi64(r2, r4);
983 0 : r3 = r1;
984 0 : r1 = _mm_unpacklo_epi64(r1, r5);
985 0 : r3 = _mm_unpackhi_epi64(r3, r5);
986 :
987 0 : _mm_storeu_si128((__m128i*)(dst + 4 * dstStride), r0);
988 0 : _mm_storeu_si128((__m128i*)(dst + 5 * dstStride), r2);
989 0 : _mm_storeu_si128((__m128i*)(dst + 6 * dstStride), r1);
990 0 : _mm_storeu_si128((__m128i*)(dst + 7 * dstStride), r3);
991 0 : }
992 0 : void transpose_16bit(const uint16_t *src, uint32_t srcStride, uint16_t *dst, uint32_t dstStride, int32_t width, int32_t height)
993 : {
994 0 : for (int32_t j = 0; j < height; j += 8)
995 0 : for (int32_t i = 0; i < width; i += 8)
996 0 : transpose_16bit_TX_8X8(src + i * srcStride + j, srcStride, dst + j * dstStride + i, dstStride);
997 0 : }
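// Illustrative sketch, not part of the original file: the scalar equivalent of
// transpose_16bit() above for width and height that are multiples of 8.
static INLINE void transpose_16bit_scalar_sketch(const uint16_t *src, uint32_t src_stride,
                                                 uint16_t *dst, uint32_t dst_stride,
                                                 int32_t width, int32_t height) {
    for (int32_t j = 0; j < height; ++j)
        for (int32_t i = 0; i < width; ++i)
            dst[j * dst_stride + i] = src[i * src_stride + j];
}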
998 :
999 : // Low bit depth functions
1000 : static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
1001 : { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1002 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1003 : { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1004 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1005 : { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1006 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1007 : { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1008 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1009 : { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1010 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1011 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1012 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1013 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1014 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1015 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1016 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1017 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
1018 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1019 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
1020 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1021 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1022 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1023 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1024 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1025 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1026 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1027 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1028 : 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1029 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1030 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1031 : 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1032 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1033 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1034 : 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
1035 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1036 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1037 : 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
1038 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1039 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1040 : 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
1041 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1042 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1043 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
1044 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1045 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1046 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
1047 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1048 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1049 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
1050 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1051 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1052 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
1053 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1054 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1055 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1056 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1057 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1058 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1059 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1060 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1061 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1062 : 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1063 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1064 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1065 : 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
1066 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1067 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1068 : 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
1069 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1070 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1071 : 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
1072 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1073 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1074 : 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
1075 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1076 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1077 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
1078 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1079 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1080 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
1081 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1082 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1083 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
1084 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1085 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1086 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
1087 : { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1088 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1089 : 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
1090 : };
1091 :
1092 : static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
1093 : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
1094 : {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
1095 : {0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
1096 : {0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
1097 : {0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
1098 : {0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
1099 : {0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
1100 : {0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8},
1101 : {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7},
1102 : {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6},
1103 : {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5},
1104 : {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4},
1105 : {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3},
1106 : {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2},
1107 : {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1},
1108 : {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
1109 : };
1110 :
1111 : static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
1112 : {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15},
1113 : {0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14},
1114 : {0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13},
1115 : {0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12},
1116 : {0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11},
1117 : {0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10},
1118 : {0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9},
1119 : {0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8}};
1120 :
1121 : static AOM_FORCE_INLINE void
1122 : dr_prediction_z1_HxW_internal_avx2(int H, int W, __m128i *dst,
1123 : const uint8_t *above, int upsample_above,
1124 : int dx) {
1125 6495780 : const int frac_bits = 6 - upsample_above;
1126 6495780 : const int max_base_x = ((W + H) - 1) << upsample_above;
1127 :
1128 0 : assert(dx > 0);
1129 : // pre-filter above pixels
1130 : // store in temp buffers:
1131 : // above[x] * 32 + 16
1132 : // above[x+1] - above[x]
1133 : // final pixels will be calculated as:
1134 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1135 : __m256i a0, a1, a32, a16;
1136 : __m256i diff, c3f;
1137 : __m128i a_mbase_x;
1138 :
1139 6495780 : a16 = _mm256_set1_epi16(16);
1140 12991600 : a_mbase_x = _mm_set1_epi8(above[max_base_x]);
1141 6495780 : c3f = _mm256_set1_epi16(0x3f);
1142 :
1143 6495780 : int x = dx;
1144 69768800 : for (int r = 0; r < W; r++) {
1145 : __m256i b, res, shift;
1146 : __m128i res1, a0_128, a1_128;
1147 :
1148 63343900 : int base = x >> frac_bits;
1149 63343900 : int base_max_diff = (max_base_x - base) >> upsample_above;
1150 63343900 : if (base_max_diff <= 0) {
1151 207779 : for (int i = r; i < W; ++i) {
1152 136862 : dst[i] = a_mbase_x; // save 4 values
1153 : }
1154 70917 : return;
1155 : }
1156 63273000 : if (base_max_diff > H)
1157 60098000 : base_max_diff = H;
1158 63273000 : a0_128 = _mm_loadu_si128((__m128i *)(above + base));
1159 63273000 : a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1160 :
1161 63273000 : if (upsample_above) {
1162 16667600 : a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
1163 16667600 : a1_128 = _mm_srli_si128(a0_128, 8);
1164 :
1165 83337800 : shift = _mm256_srli_epi16(
1166 : _mm256_and_si256(
1167 : _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
1168 : 1);
1169 : }
1170 : else {
1171 186422000 : shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1172 : }
1173 63273000 : a0 = _mm256_cvtepu8_epi16(a0_128);
1174 63273000 : a1 = _mm256_cvtepu8_epi16(a1_128);
1175 :
1176 63273000 : diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1177 63273000 : a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1178 63273000 : a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1179 :
1180 63273000 : b = _mm256_mullo_epi16(diff, shift);
1181 63273000 : res = _mm256_add_epi16(a32, b);
1182 63273000 : res = _mm256_srli_epi16(res, 5);
1183 :
1184 126546000 : res = _mm256_packus_epi16(
1185 : res,
1186 63273000 : _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); // goto 8 bit
1187 63273000 : res1 = _mm256_castsi256_si128(res); // 16 8bit values
1188 :
1189 126546000 : dst[r] =
1190 63273000 : _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
1191 63273000 : x += dx;
1192 : }
1193 : }
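// Illustrative sketch, not part of the original file: the per-pixel zone-1
// interpolation that dr_prediction_z1_HxW_internal_avx2() vectorizes, following
// the comment inside it. base = x >> frac_bits and shift = (x & 0x3f) >> 1 in
// the non-upsampled case; columns past max_base_x replicate above[max_base_x].
static INLINE uint8_t dr_z1_pixel_sketch(const uint8_t *above, int base, int shift) {
    const int val = above[base] * 32 + 16 + (above[base + 1] - above[base]) * shift;
    return (uint8_t)(val >> 5);
}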
1194 :
1195 1527060 : static void dr_prediction_z1_4xN_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
1196 : const uint8_t *above,
1197 : int32_t upsample_above, int32_t dx) {
1198 : __m128i dstvec[16];
1199 : dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
1200 :
1201 12616300 : for (int32_t i = 0; i < N; i++) {
1202 22178400 : *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
1203 : }
1204 1527060 : }
1205 :
1206 1418700 : static void dr_prediction_z1_8xN_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
1207 : const uint8_t *above, int32_t upsample_above,
1208 : int32_t dx) {
1209 : __m128i dstvec[32];
1210 :
1211 : dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
1212 15590100 : for (int32_t i = 0; i < N; i++) {
1213 14171400 : _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
1214 : }
1215 1418700 : }
1216 :
1217 955254 : static void dr_prediction_z1_16xN_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
1218 : const uint8_t *above, int32_t upsample_above,
1219 : int32_t dx) {
1220 : __m128i dstvec[64];
1221 :
1222 : dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
1223 :
1224 13415700 : for (int32_t i = 0; i < N; i++) {
1225 12460400 : _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
1226 : }
1227 955254 : }
1228 :
1229 : static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
1230 : int32_t N, __m256i *dstvec, const uint8_t *above, int32_t upsample_above, int32_t dx) {
1231 : int32_t x;
1232 : // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1233 : (void)upsample_above;
1234 672650 : const int32_t frac_bits = 6;
1235 672650 : const int32_t max_base_x = ((32 + N) - 1);
1236 :
1237 : // pre-filter above pixels
1238 : // store in temp buffers:
1239 : // above[x] * 32 + 16
1240 : // above[x+1] - above[x]
1241 : // final pixels will be calculated as:
1242 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1243 : __m256i a0, a0_1, a1, a1_1, a32, a16;
1244 : __m256i a_mbase_x, diff;
1245 :
1246 672650 : a16 = _mm256_set1_epi32(16);
1247 672650 : a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
1248 :
1249 672650 : x = dx;
1250 15670500 : for (int32_t r = 0; r < N; r++) {
1251 : __m256i b, res[2], res16[2];
1252 :
1253 14997900 : int32_t base = x >> frac_bits;
1254 14997900 : int32_t base_max_diff = (max_base_x - base);
1255 14997900 : if (base_max_diff <= 0) {
1256 0 : for (int32_t i = r; i < N; ++i) {
1257 0 : dstvec[i] = a_mbase_x; // save 32 values
1258 : }
1259 0 : return;
1260 : }
1261 14997900 : if (base_max_diff > 32) base_max_diff = 32;
1262 44993600 : __m256i shift = _mm256_srli_epi32(
1263 : _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1264 :
1265 44984000 : for (int32_t j = 0, jj = 0; j < 32; j += 16, jj++) {
1266 29986100 : int32_t mdiff = base_max_diff - j;
1267 29986100 : if (mdiff <= 0) {
1268 8350 : res16[jj] = a_mbase_x;
1269 : }
1270 : else {
1271 29977800 : a0 = _mm256_cvtepu8_epi32(
1272 29977800 : _mm_loadu_si128((__m128i *)(above + base + j)));
1273 59955500 : a1 = _mm256_cvtepu8_epi32(
1274 29977800 : _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
1275 :
1276 29977800 : diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1277 29977800 : a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1278 29977800 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1279 29977800 : b = _mm256_mullo_epi32(diff, shift);
1280 :
1281 29977800 : res[0] = _mm256_add_epi32(a32, b);
1282 29977800 : res[0] = _mm256_srli_epi32(res[0], 5);
1283 29977800 : res[0] = _mm256_packus_epi32(
1284 : res[0],
1285 29977800 : _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1286 :
1287 : // goto 8 bit
1288 29977800 : res[0] = _mm256_packus_epi16(res[0], res[0]);
1289 :
1290 29977800 : if (mdiff > 8) {
1291 29923800 : a0_1 = _mm256_cvtepu8_epi32(
1292 29923800 : _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
1293 59847500 : a1_1 = _mm256_cvtepu8_epi32(
1294 29923800 : _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
1295 :
1296 29923800 : diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1297 29923800 : a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1298 29923800 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1299 29923800 : b = _mm256_mullo_epi32(diff, shift);
1300 :
1301 29923800 : res[1] = _mm256_add_epi32(a32, b);
1302 29923800 : res[1] = _mm256_srli_epi32(res[1], 5);
1303 29923800 : res[1] = _mm256_packus_epi32(
1304 : res[1],
1305 29923800 : _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1306 59847500 : res[1] = _mm256_packus_epi16(res[1], res[1]);
1307 : // goto 8 bit
1308 : }
1309 : else {
1310 54005 : res[1] = a_mbase_x;
1311 : }
1312 59955500 : res16[jj] = _mm256_unpacklo_epi64(res[0], res[1]); // 16 8bit values
1313 : }
1314 : }
1315 14997900 : res16[1] =
1316 14997900 : _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
1317 : 1); // 32 8bit values
1318 :
1319 29995700 : dstvec[r] = _mm256_blendv_epi8(
1320 : a_mbase_x, res16[1],
1321 14997900 : *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values
1322 14997900 : x += dx;
1323 : }
1324 : }
1325 :
1326 382076 : static void dr_prediction_z1_32xN_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
1327 : const uint8_t *above, int32_t upsample_above,
1328 : int32_t dx) {
1329 : __m256i dstvec[64];
1330 : dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
1331 9355190 : for (int32_t i = 0; i < N; i++) {
1332 8973110 : _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1333 : }
1334 382076 : }
1335 :
1336 139904 : static void dr_prediction_z1_64xN_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
1337 : const uint8_t *above, int32_t upsample_above,
1338 : int32_t dx) {
1339 : int32_t x;
1340 :
1341 : // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1342 : (void)upsample_above;
1343 139904 : const int32_t frac_bits = 6;
1344 139904 : const int32_t max_base_x = ((64 + N) - 1);
1345 :
1346 : // pre-filter above pixels
1347 : // store in temp buffers:
1348 : // above[x] * 32 + 16
1349 : // above[x+1] - above[x]
1350 : // final pixels will be calculated as:
1351 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1352 : __m256i a0, a0_1, a1, a1_1, a32, a16;
1353 : __m256i a_mbase_x, diff;
1354 : __m128i max_base_x128, base_inc128, mask128;
1355 :
1356 139904 : a16 = _mm256_set1_epi32(16);
1357 139904 : a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
1358 139904 : max_base_x128 = _mm_set1_epi8(max_base_x);
1359 :
1360 139904 : x = dx;
1361 5701190 : for (int32_t r = 0; r < N; r++, dst += stride) {
1362 : __m256i b, res[2];
1363 : __m128i res1;
1364 :
1365 5561290 : int32_t base = x >> frac_bits;
1366 5561290 : if (base >= max_base_x) {
1367 0 : for (int32_t i = r; i < N; ++i) {
1368 : _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
1369 0 : _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1370 0 : dst += stride;
1371 : }
1372 0 : return;
1373 : }
1374 :
1375 16683900 : __m256i shift = _mm256_srli_epi32(
1376 : _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1377 :
1378 : __m128i a0_128, a0_1_128, a1_128, a1_1_128;
1379 27779000 : for (int32_t j = 0; j < 64; j += 16) {
1380 22217700 : int32_t mdif = max_base_x - (base + j);
1381 22217700 : if (mdif <= 0) {
1382 15879 : _mm_storeu_si128((__m128i *)(dst + j),
1383 : _mm256_castsi256_si128(a_mbase_x));
1384 : }
1385 : else {
1386 22201800 : a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
1387 44403600 : a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
1388 22201800 : a0 = _mm256_cvtepu8_epi32(a0_128);
1389 22201800 : a1 = _mm256_cvtepu8_epi32(a1_128);
1390 :
1391 22201800 : diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1392 22201800 : a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1393 22201800 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1394 22201800 : b = _mm256_mullo_epi32(diff, shift);
1395 :
1396 22201800 : res[0] = _mm256_add_epi32(a32, b);
1397 22201800 : res[0] = _mm256_srli_epi32(res[0], 5);
1398 22201800 : res[0] = _mm256_packus_epi32(
1399 : res[0],
1400 22201800 : _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1401 : // goto 8 bit
1402 22201800 : res[0] = _mm256_packus_epi16(res[0], res[0]);
1403 :
1404 22201800 : if (mdif > 8) {
1405 22165400 : a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
1406 44330700 : a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
1407 22165400 : a0_1 = _mm256_cvtepu8_epi32(a0_1_128);
1408 22165400 : a1_1 = _mm256_cvtepu8_epi32(a1_1_128);
1409 :
1410 22165400 : diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1411 22165400 : a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1412 22165400 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1413 22165400 : b = _mm256_mullo_epi32(diff, shift);
1414 :
1415 22165400 : res[1] = _mm256_add_epi32(a32, b);
1416 22165400 : res[1] = _mm256_srli_epi32(res[1], 5);
1417 22165400 : res[1] = _mm256_packus_epi32(
1418 : res[1],
1419 22165400 : _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1420 44330700 : res[1] = _mm256_packus_epi16(res[1], res[1]);
1421 :
1422 : }
1423 : else {
1424 36437 : res[1] = a_mbase_x;
1425 : }
1426 66605400 : res1 = _mm_unpacklo_epi64(
1427 : _mm256_castsi256_si128(res[0]),
1428 : _mm256_castsi256_si128(res[1])); // 16 8bit values
1429 :
1430 22201800 : base_inc128 = _mm_setr_epi8(
1431 22201800 : base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1432 22201800 : base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1433 22201800 : base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1434 22201800 : base + j + 13, base + j + 14, base + j + 15);
1435 :
1436 66605400 : mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
1437 : _mm_setzero_si128());
1438 : res1 =
1439 22201800 : _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), res1, mask128);
1440 22201800 : _mm_storeu_si128((__m128i *)(dst + j), res1);
1441 : }
1442 : }
1443 5561290 : x += dx;
1444 : }
1445 : }
1446 :
1447 : // Directional prediction, zone 1: 0 < angle < 90
1448 4343370 : void eb_av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int32_t bw, int32_t bh,
1449 : const uint8_t *above, const uint8_t *left,
1450 : int32_t upsample_above, int32_t dx, int32_t dy) {
1451 : (void)left;
1452 : (void)dy;
1453 4343370 : switch (bw) {
1454 1527050 : case 4:
1455 1527050 : dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
1456 1527030 : break;
1457 1418690 : case 8:
1458 1418690 : dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
1459 1418640 : break;
1460 955247 : case 16:
1461 955247 : dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
1462 955227 : break;
1463 371409 : case 32:
1464 371409 : dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
1465 371402 : break;
1466 71499 : case 64:
1467 71499 : dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
1468 71499 : break;
1469 0 : default: break;
1470 : }
1471 4343280 : return;
1472 : }
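For orientation, the interpolation that all of the zone 1 kernels above vectorize can be written as a short scalar sketch. The helper below is illustrative only (its name and standalone form are not part of the library); it assumes upsample_above == 0, so frac_bits == 6, and standard <stdint.h>/<stddef.h> types.

static void dr_z1_scalar_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                const uint8_t *above, int dx) {
    const int max_base_x = bw + bh - 1;      // usable length of the above edge
    int x = dx;                              // projected position, 1/64-pel units
    for (int r = 0; r < bh; r++, dst += stride, x += dx) {
        const int base  = x >> 6;            // integer part
        const int shift = (x & 0x3f) >> 1;   // fractional part mapped to 0..31
        for (int c = 0; c < bw; c++) {
            const int b = base + c;
            if (b >= max_base_x)             // past the edge: repeat the last pixel
                dst[c] = above[max_base_x];
            else                             // above[b]*32 + 16 + (above[b+1]-above[b])*shift, then >> 5
                dst[c] = (uint8_t)((above[b] * 32 + 16 +
                                    (above[b + 1] - above[b]) * shift) >> 5);
        }
    }
}

The kernels above compute exactly this row by row: one shift value per row, 8/16/32 pixels per vector, with the BaseMask blend against a_mbase_x handling the tail beyond max_base_x.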
1473 :
1474 : static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
1475 : int32_t N, __m128i *dst, const uint16_t *above, int32_t upsample_above, int32_t dx) {
1476 0 : const int32_t frac_bits = 6 - upsample_above;
1477 0 : const int32_t max_base_x = ((N + 4) - 1) << upsample_above;
1478 : int32_t x;
1479 : // assert(dx > 0);
1480 : // pre-filter above pixels
1481 : // store in temp buffers:
1482 : // above[x] * 32 + 16
1483 : // above[x+1] - above[x]
1484 : // final pixels will be calculated as:
1485 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1486 : __m256i a0, a1, a32, a16;
1487 : __m256i diff;
1488 : __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1489 :
1490 0 : a16 = _mm256_set1_epi32(16);
1491 0 : a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1492 0 : max_base_x128 = _mm_set1_epi32(max_base_x);
1493 :
1494 0 : x = dx;
1495 0 : for (int32_t r = 0; r < N; r++) {
1496 : __m256i b, res, shift;
1497 : __m128i res1;
1498 :
1499 0 : int32_t base = x >> frac_bits;
1500 0 : if (base >= max_base_x) {
1501 0 : for (int32_t i = r; i < N; ++i) {
1502 0 : dst[i] = a_mbase_x; // save 4 values
1503 : }
1504 0 : return;
1505 : }
1506 :
1507 0 : a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1508 0 : a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1509 :
1510 0 : if (upsample_above) {
1511 0 : a0 = _mm256_permutevar8x32_epi32(
1512 : a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1513 0 : a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1514 0 : base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
1515 0 : shift = _mm256_srli_epi32(
1516 : _mm256_and_si256(
1517 : _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1518 : _mm256_set1_epi32(0x3f)),
1519 : 1);
1520 : }
1521 : else {
1522 0 : base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
1523 0 : shift = _mm256_srli_epi32(
1524 : _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1525 : }
1526 :
1527 0 : diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1528 0 : a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1529 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1530 :
1531 0 : b = _mm256_mullo_epi32(diff, shift);
1532 0 : res = _mm256_add_epi32(a32, b);
1533 0 : res = _mm256_srli_epi32(res, 5);
1534 :
1535 0 : res1 = _mm256_castsi256_si128(res);
1536 0 : res1 = _mm_packus_epi32(res1, res1);
1537 :
1538 0 : mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
1539 0 : mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit
1540 0 : dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1541 0 : x += dx;
1542 : }
1543 : }
1544 :
1545 0 : static void highbd_dr_prediction_z1_4xN_avx2(int32_t N, uint16_t *dst,
1546 : ptrdiff_t stride,
1547 : const uint16_t *above,
1548 : int32_t upsample_above, int32_t dx) {
1549 : __m128i dstvec[16];
1550 :
1551 : highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
1552 : dx);
1553 0 : for (int32_t i = 0; i < N; i++) {
1554 0 : _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
1555 : }
1556 0 : }
1557 :
1558 : static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
1559 : int32_t N, __m128i *dst, const uint16_t *above, int32_t upsample_above, int32_t dx) {
1560 0 : const int32_t frac_bits = 6 - upsample_above;
1561 0 : const int32_t max_base_x = ((8 + N) - 1) << upsample_above;
1562 :
1563 : int32_t x;
1564 : // assert(dx > 0);
1565 : // pre-filter above pixels
1566 : // store in temp buffers:
1567 : // above[x] * 32 + 16
1568 : // above[x+1] - above[x]
1569 : // final pixels will be calculated as:
1570 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1571 : __m256i a0, a1, a0_1, a1_1, a32, a16;
1572 : __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1573 :
1574 0 : a16 = _mm256_set1_epi32(16);
1575 0 : a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1576 0 : max_base_x256 = _mm256_set1_epi32(max_base_x);
1577 :
1578 0 : x = dx;
1579 0 : for (int32_t r = 0; r < N; r++) {
1580 : __m256i b, res, res1, shift;
1581 :
1582 0 : int32_t base = x >> frac_bits;
1583 0 : if (base >= max_base_x) {
1584 0 : for (int32_t i = r; i < N; ++i) {
1585 0 : dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
1586 : }
1587 0 : return;
1588 : }
1589 :
1590 0 : a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1591 0 : a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1592 :
1593 0 : if (upsample_above) {
1594 0 : a0 = _mm256_permutevar8x32_epi32(
1595 : a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1596 0 : a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1597 :
1598 : a0_1 =
1599 0 : _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1600 0 : a0_1 = _mm256_permutevar8x32_epi32(
1601 : a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1602 0 : a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
1603 :
1604 0 : a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
1605 0 : a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
1606 : base_inc256 =
1607 0 : _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
1608 : base + 10, base + 12, base + 14);
1609 0 : shift = _mm256_srli_epi32(
1610 : _mm256_and_si256(
1611 : _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1612 : _mm256_set1_epi32(0x3f)),
1613 : 1);
1614 : }
1615 : else {
1616 0 : base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
1617 : base + 4, base + 5, base + 6, base + 7);
1618 0 : shift = _mm256_srli_epi32(
1619 : _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1620 : }
1621 :
1622 0 : diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1623 0 : a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1624 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1625 :
1626 0 : b = _mm256_mullo_epi32(diff, shift);
1627 0 : res = _mm256_add_epi32(a32, b);
1628 0 : res = _mm256_srli_epi32(res, 5);
1629 :
1630 0 : res1 = _mm256_packus_epi32(
1631 0 : res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
1632 :
1633 0 : mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
1634 0 : mask256 = _mm256_packs_epi32(
1635 : mask256, _mm256_castsi128_si256(
1636 0 : _mm256_extracti128_si256(mask256, 1))); // goto 16 bit
1637 0 : res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1638 0 : dst[r] = _mm256_castsi256_si128(res1);
1639 0 : x += dx;
1640 : }
1641 : }
1642 :
1643 0 : static void highbd_dr_prediction_z1_8xN_avx2(int32_t N, uint16_t *dst,
1644 : ptrdiff_t stride,
1645 : const uint16_t *above,
1646 : int32_t upsample_above, int32_t dx) {
1647 : __m128i dstvec[32];
1648 :
1649 : highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
1650 : dx);
1651 0 : for (int32_t i = 0; i < N; i++) {
1652 0 : _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
1653 : }
1654 0 : }
1655 :
1656 : static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
1657 : int32_t N, __m256i *dstvec, const uint16_t *above, int32_t upsample_above, int32_t dx) {
1658 : int32_t x;
1659 : // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1660 : (void)upsample_above;
1661 0 : const int32_t frac_bits = 6;
1662 0 : const int32_t max_base_x = ((16 + N) - 1);
1663 :
1664 : // pre-filter above pixels
1665 : // store in temp buffers:
1666 : // above[x] * 32 + 16
1667 : // above[x+1] - above[x]
1668 : // final pixels will be calculated as:
1669 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1670 : __m256i a0, a0_1, a1, a1_1, a32, a16;
1671 : __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1672 :
1673 0 : a16 = _mm256_set1_epi32(16);
1674 0 : a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1675 0 : max_base_x256 = _mm256_set1_epi16(max_base_x);
1676 :
1677 0 : x = dx;
1678 0 : for (int32_t r = 0; r < N; r++) {
1679 : __m256i b, res[2], res1;
1680 :
1681 0 : int32_t base = x >> frac_bits;
1682 0 : if (base >= max_base_x) {
1683 0 : for (int32_t i = r; i < N; ++i) {
1684 0 : dstvec[i] = a_mbase_x; // save 16 values
1685 : }
1686 0 : return;
1687 : }
1688 0 : __m256i shift = _mm256_srli_epi32(
1689 : _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1690 :
1691 0 : a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1692 0 : a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1693 :
1694 0 : diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1695 0 : a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1696 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1697 0 : b = _mm256_mullo_epi32(diff, shift);
1698 :
1699 0 : res[0] = _mm256_add_epi32(a32, b);
1700 0 : res[0] = _mm256_srli_epi32(res[0], 5);
1701 0 : res[0] = _mm256_packus_epi32(
1702 0 : res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1703 :
1704 0 : int32_t mdif = max_base_x - base;
1705 0 : if (mdif > 8) {
1706 : a0_1 =
1707 0 : _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1708 : a1_1 =
1709 0 : _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
1710 :
1711 0 : diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1712 0 : a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1713 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1714 0 : b = _mm256_mullo_epi32(diff, shift);
1715 :
1716 0 : res[1] = _mm256_add_epi32(a32, b);
1717 0 : res[1] = _mm256_srli_epi32(res[1], 5);
1718 0 : res[1] = _mm256_packus_epi32(
1719 0 : res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1720 : }
1721 : else {
1722 0 : res[1] = a_mbase_x;
1723 : }
1724 0 : res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1725 : 1); // 16 16bit values
1726 :
1727 0 : base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1728 0 : base + 4, base + 5, base + 6, base + 7,
1729 0 : base + 8, base + 9, base + 10, base + 11,
1730 0 : base + 12, base + 13, base + 14, base + 15);
1731 0 : mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1732 0 : dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1733 0 : x += dx;
1734 : }
1735 : }
1736 :
1737 0 : static void highbd_dr_prediction_z1_16xN_avx2(int32_t N, uint16_t *dst,
1738 : ptrdiff_t stride,
1739 : const uint16_t *above,
1740 : int32_t upsample_above, int32_t dx) {
1741 : __m256i dstvec[64];
1742 : highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
1743 : dx);
1744 0 : for (int32_t i = 0; i < N; i++) {
1745 0 : _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1746 : }
1747 0 : }
1748 :
1749 : static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
1750 : int32_t N, __m256i *dstvec, const uint16_t *above, int32_t upsample_above, int32_t dx) {
1751 : int32_t x;
1752 : // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1753 : (void)upsample_above;
1754 0 : const int32_t frac_bits = 6;
1755 0 : const int32_t max_base_x = ((32 + N) - 1);
1756 :
1757 : // pre-filter above pixels
1758 : // store in temp buffers:
1759 : // above[x] * 32 + 16
1760 : // above[x+1] - above[x]
1761 : // final pixels will be calculated as:
1762 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1763 : __m256i a0, a0_1, a1, a1_1, a32, a16;
1764 : __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1765 :
1766 0 : a16 = _mm256_set1_epi32(16);
1767 0 : a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1768 0 : max_base_x256 = _mm256_set1_epi16(max_base_x);
1769 :
1770 0 : x = dx;
1771 0 : for (int32_t r = 0; r < N; r++) {
1772 : __m256i b, res[2], res1;
1773 :
1774 0 : int32_t base = x >> frac_bits;
1775 0 : if (base >= max_base_x) {
1776 0 : for (int32_t i = r; i < N; ++i) {
1777 0 : dstvec[i] = a_mbase_x; // save 32 values
1778 0 : dstvec[i + N] = a_mbase_x;
1779 : }
1780 0 : return;
1781 : }
1782 :
1783 0 : __m256i shift = _mm256_srli_epi32(
1784 : _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1785 :
1786 0 : for (int32_t j = 0; j < 32; j += 16) {
1787 0 : int32_t mdif = max_base_x - (base + j);
1788 0 : if (mdif <= 0) {
1789 0 : res1 = a_mbase_x;
1790 : }
1791 : else {
1792 0 : a0 = _mm256_cvtepu16_epi32(
1793 0 : _mm_loadu_si128((__m128i *)(above + base + j)));
1794 0 : a1 = _mm256_cvtepu16_epi32(
1795 0 : _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
1796 :
1797 0 : diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1798 0 : a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1799 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1800 0 : b = _mm256_mullo_epi32(diff, shift);
1801 :
1802 0 : res[0] = _mm256_add_epi32(a32, b);
1803 0 : res[0] = _mm256_srli_epi32(res[0], 5);
1804 0 : res[0] = _mm256_packus_epi32(
1805 : res[0],
1806 0 : _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1807 0 : if (mdif > 8) {
1808 0 : a0_1 = _mm256_cvtepu16_epi32(
1809 0 : _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
1810 0 : a1_1 = _mm256_cvtepu16_epi32(
1811 0 : _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
1812 :
1813 0 : diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1814 0 : a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1815 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1816 0 : b = _mm256_mullo_epi32(diff, shift);
1817 :
1818 0 : res[1] = _mm256_add_epi32(a32, b);
1819 0 : res[1] = _mm256_srli_epi32(res[1], 5);
1820 0 : res[1] = _mm256_packus_epi32(
1821 : res[1],
1822 0 : _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1823 : }
1824 : else {
1825 0 : res[1] = a_mbase_x;
1826 : }
1827 0 : res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1828 : 1); // 16 16bit values
1829 0 : base_inc256 = _mm256_setr_epi16(
1830 0 : base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1831 0 : base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1832 0 : base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1833 0 : base + j + 13, base + j + 14, base + j + 15);
1834 :
1835 0 : mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1836 0 : res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1837 : }
1838 0 : if (!j)
1839 0 : dstvec[r] = res1;
1840 : else
1841 0 : dstvec[r + N] = res1;
1842 : }
1843 0 : x += dx;
1844 : }
1845 : }
1846 :
1847 0 : static void highbd_dr_prediction_z1_32xN_avx2(int32_t N, uint16_t *dst,
1848 : ptrdiff_t stride,
1849 : const uint16_t *above,
1850 : int32_t upsample_above, int32_t dx) {
1851 : __m256i dstvec[128];
1852 :
1853 : highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
1854 : dx);
1855 0 : for (int32_t i = 0; i < N; i++) {
1856 0 : _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1857 0 : _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
1858 : }
1859 0 : }
1860 :
1861 0 : static void highbd_dr_prediction_z1_64xN_avx2(int32_t N, uint16_t *dst,
1862 : ptrdiff_t stride,
1863 : const uint16_t *above,
1864 : int32_t upsample_above, int32_t dx) {
1865 : int32_t x;
1866 :
1867 : // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1868 : (void)upsample_above;
1869 0 : const int32_t frac_bits = 6;
1870 0 : const int32_t max_base_x = ((64 + N) - 1);
1871 :
1872 : // pre-filter above pixels
1873 : // store in temp buffers:
1874 : // above[x] * 32 + 16
1875 : // above[x+1] - above[x]
1876 : // final pixels will be calculated as:
1877 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1878 : __m256i a0, a0_1, a1, a1_1, a32, a16;
1879 : __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1880 :
1881 0 : a16 = _mm256_set1_epi32(16);
1882 0 : a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1883 0 : max_base_x256 = _mm256_set1_epi16(max_base_x);
1884 :
1885 0 : x = dx;
1886 0 : for (int32_t r = 0; r < N; r++, dst += stride) {
1887 : __m256i b, res[2], res1;
1888 :
1889 0 : int32_t base = x >> frac_bits;
1890 0 : if (base >= max_base_x) {
1891 0 : for (int32_t i = r; i < N; ++i) {
1892 : _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
1893 0 : _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1894 0 : _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1895 0 : _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1896 0 : dst += stride;
1897 : }
1898 0 : return;
1899 : }
1900 :
1901 0 : __m256i shift = _mm256_srli_epi32(
1902 : _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1903 :
1904 : __m128i a0_128, a0_1_128, a1_128, a1_1_128;
1905 0 : for (int32_t j = 0; j < 64; j += 16) {
1906 0 : int32_t mdif = max_base_x - (base + j);
1907 0 : if (mdif <= 0) {
1908 0 : _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1909 : }
1910 : else {
1911 0 : a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
1912 0 : a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
1913 0 : a0 = _mm256_cvtepu16_epi32(a0_128);
1914 0 : a1 = _mm256_cvtepu16_epi32(a1_128);
1915 :
1916 0 : diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1917 0 : a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1918 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1919 0 : b = _mm256_mullo_epi32(diff, shift);
1920 :
1921 0 : res[0] = _mm256_add_epi32(a32, b);
1922 0 : res[0] = _mm256_srli_epi32(res[0], 5);
1923 0 : res[0] = _mm256_packus_epi32(
1924 : res[0],
1925 0 : _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1926 0 : if (mdif > 8) {
1927 0 : a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
1928 0 : a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
1929 0 : a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
1930 0 : a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
1931 :
1932 0 : diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1933 0 : a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1934 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1935 0 : b = _mm256_mullo_epi32(diff, shift);
1936 :
1937 0 : res[1] = _mm256_add_epi32(a32, b);
1938 0 : res[1] = _mm256_srli_epi32(res[1], 5);
1939 0 : res[1] = _mm256_packus_epi32(
1940 : res[1],
1941 0 : _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1942 : }
1943 : else {
1944 0 : res[1] = a_mbase_x;
1945 : }
1946 0 : res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1947 : 1); // 16 16bit values
1948 0 : base_inc256 = _mm256_setr_epi16(
1949 0 : base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1950 0 : base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1951 0 : base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1952 0 : base + j + 13, base + j + 14, base + j + 15);
1953 :
1954 0 : mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1955 0 : res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1956 0 : _mm256_storeu_si256((__m256i *)(dst + j), res1);
1957 : }
1958 : }
1959 0 : x += dx;
1960 : }
1961 : }
1962 :
1963 : // Directional prediction, zone 1: 0 < angle < 90
1964 0 : void eb_av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int32_t bw,
1965 : int32_t bh, const uint16_t *above,
1966 : const uint16_t *left, int32_t upsample_above,
1967 : int32_t dx, int32_t dy, int32_t bd) {
1968 : (void)left;
1969 : (void)dy;
1970 : (void)bd;
1971 :
1972 0 : switch (bw) {
1973 0 : case 4:
1974 0 : highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
1975 : dx);
1976 0 : break;
1977 0 : case 8:
1978 0 : highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
1979 : dx);
1980 0 : break;
1981 0 : case 16:
1982 0 : highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
1983 : dx);
1984 0 : break;
1985 0 : case 32:
1986 0 : highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
1987 : dx);
1988 0 : break;
1989 0 : case 64:
1990 0 : highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above,
1991 : dx);
1992 0 : break;
1993 0 : default: break;
1994 : }
1995 0 : return;
1996 : }
1997 :
1998 2524300 : static void dr_prediction_z2_Nx4_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
1999 : const uint8_t *above, const uint8_t *left,
2000 : int32_t upsample_above, int32_t upsample_left,
2001 : int32_t dx, int32_t dy) {
2002 2524300 : const int32_t min_base_x = -(1 << upsample_above);
2003 2524300 : const int32_t min_base_y = -(1 << upsample_left);
2004 2524300 : const int32_t frac_bits_x = 6 - upsample_above;
2005 2524300 : const int32_t frac_bits_y = 6 - upsample_left;
2006 :
2007 2524300 : assert(dx > 0);
2008 : // pre-filter above pixels
2009 : // store in temp buffers:
2010 : // above[x] * 32 + 16
2011 : // above[x+1] - above[x]
2012 : // final pixels will be calculated as:
2013 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2014 : __m128i a0_x, a1_x, a32, a16, diff;
2015 : __m128i c3f, min_base_y128, c1234, dy128;
2016 :
2017 2524300 : a16 = _mm_set1_epi16(16);
2018 2524300 : c3f = _mm_set1_epi16(0x3f);
2019 5048600 : min_base_y128 = _mm_set1_epi16(min_base_y);
2020 2524300 : c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
2021 2524300 : dy128 = _mm_set1_epi16(dy);
2022 :
2023 21333700 : for (int r = 0; r < N; r++) {
2024 : __m128i b, res, shift, r6, ydx;
2025 : __m128i resx, resy, resxy;
2026 : __m128i a0_x128, a1_x128;
2027 18809400 : int y = r + 1;
2028 18809400 : int base_x = (-y * dx) >> frac_bits_x;
2029 18809400 : int base_shift = 0;
2030 18809400 : if (base_x < (min_base_x - 1)) {
2031 13511300 : base_shift = (min_base_x - base_x - 1) >> upsample_above;
2032 : }
2033 18809400 : int base_min_diff =
2034 18809400 : (min_base_x - base_x + upsample_above) >> upsample_above;
2035 18809400 : if (base_min_diff > 4) {
2036 8472380 : base_min_diff = 4;
2037 : } else {
2038 10337000 : if (base_min_diff < 0)
2039 0 : base_min_diff = 0;
2040 : }
2041 :
2042 18809400 : if (base_shift > 3) {
2043 8472300 : a0_x = _mm_setzero_si128();
2044 8472300 : a1_x = _mm_setzero_si128();
2045 8472300 : shift = _mm_setzero_si128();
2046 : } else {
2047 10337100 : a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2048 20674200 : ydx = _mm_set1_epi16(y * dx);
2049 10337100 : r6 = _mm_slli_epi16(c1234, 6);
2050 :
2051 10337100 : if (upsample_above) {
2052 : a0_x128 =
2053 4306800 : _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
2054 4306800 : a1_x128 = _mm_srli_si128(a0_x128, 8);
2055 :
2056 17227200 : shift = _mm_srli_epi16(
2057 : _mm_and_si128(
2058 : _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
2059 : 1);
2060 : } else {
2061 6030320 : a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
2062 6030320 : a1_x128 = _mm_srli_si128(a0_x128, 1);
2063 :
2064 18091000 : shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
2065 : }
2066 10337100 : a0_x = _mm_cvtepu8_epi16(a0_x128);
2067 10337100 : a1_x = _mm_cvtepu8_epi16(a1_x128);
2068 : }
2069 : // y calc
2070 : __m128i a0_y, a1_y, shifty;
2071 18809400 : if (base_x < min_base_x) {
2072 : DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2073 : __m128i y_c128, base_y_c128, mask128, c1234_;
2074 15534400 : c1234_ = _mm_srli_si128(c1234, 2);
2075 31068900 : r6 = _mm_set1_epi16(r << 6);
2076 31068900 : y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
2077 15534400 : base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2078 15534400 : mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2079 15534400 : base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2080 : _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2081 :
2082 15534400 : a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2083 15534400 : left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
2084 31068900 : base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
2085 : _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2086 31068900 : a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2087 15534400 : left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
2088 :
2089 15534400 : if (upsample_left) {
2090 13769000 : shifty = _mm_srli_epi16(
2091 : _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
2092 : } else {
2093 21889500 : shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2094 : }
2095 15534400 : a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
2096 15534400 : a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
2097 15534400 : shift = _mm_unpacklo_epi64(shift, shifty);
2098 : }
2099 :
2100 18809400 : diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
2101 18809400 : a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
2102 18809400 : a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
2103 :
2104 18809400 : b = _mm_mullo_epi16(diff, shift);
2105 18809400 : res = _mm_add_epi16(a32, b);
2106 18809400 : res = _mm_srli_epi16(res, 5);
2107 :
2108 18809400 : resx = _mm_packus_epi16(res, res);
2109 18809400 : resy = _mm_srli_si128(resx, 4);
2110 :
2111 37618800 : resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
2112 18809400 : *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy);
2113 18809400 : dst += stride;
2114 : }
2115 2524300 : }
2116 :
2117 2423490 : static void dr_prediction_z2_Nx8_avx2(int32_t N, uint8_t *dst, ptrdiff_t stride,
2118 : const uint8_t *above, const uint8_t *left,
2119 : int32_t upsample_above, int32_t upsample_left,
2120 : int32_t dx, int32_t dy) {
2121 2423490 : const int32_t min_base_x = -(1 << upsample_above);
2122 2423490 : const int32_t min_base_y = -(1 << upsample_left);
2123 2423490 : const int32_t frac_bits_x = 6 - upsample_above;
2124 2423490 : const int32_t frac_bits_y = 6 - upsample_left;
2125 :
2126 : // pre-filter above pixels
2127 : // store in temp buffers:
2128 : // above[x] * 32 + 16
2129 : // above[x+1] - above[x]
2130 : // final pixels will be calculated as:
2131 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2132 : __m256i diff, a32, a16;
2133 : __m256i a0_x, a1_x;
2134 : __m128i a0_x128, a1_x128, min_base_y128, c3f;
2135 : __m128i c1234, dy128;
2136 :
2137 2423490 : a16 = _mm256_set1_epi16(16);
2138 2423490 : c3f = _mm_set1_epi16(0x3f);
2139 2423490 : min_base_y128 = _mm_set1_epi16(min_base_y);
2140 4846970 : dy128 = _mm_set1_epi16(dy);
2141 2423490 : c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
2142 :
2143 27487200 : for (int r = 0; r < N; r++) {
2144 : __m256i b, res, shift;
2145 : __m128i resx, resy, resxy, r6, ydx;
2146 :
2147 25063700 : int32_t y = r + 1;
2148 25063700 : int32_t base_x = (-y * dx) >> frac_bits_x;
2149 25063700 : int32_t base_shift = 0;
2150 25063700 : if (base_x < (min_base_x - 1)) {
2151 19586900 : base_shift = (min_base_x - base_x - 1) >> upsample_above;
2152 : }
2153 25063700 : int32_t base_min_diff =
2154 25063700 : (min_base_x - base_x + upsample_above) >> upsample_above;
2155 25063700 : if (base_min_diff > 8) {
2156 10476500 : base_min_diff = 8;
2157 : } else {
2158 14587200 : if (base_min_diff < 0)
2159 0 : base_min_diff = 0;
2160 : }
2161 :
2162 25063700 : if (base_shift > 7) {
2163 10476400 : a0_x = _mm256_setzero_si256();
2164 10476400 : a1_x = _mm256_setzero_si256();
2165 10476400 : shift = _mm256_setzero_si256();
2166 : } else {
2167 14587200 : a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2168 14587200 : ydx = _mm_set1_epi16(y * dx);
2169 14587200 : r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
2170 14587200 : if (upsample_above) {
2171 : a0_x128 =
2172 4215520 : _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
2173 4215520 : a1_x128 = _mm_srli_si128(a0_x128, 8);
2174 :
2175 21077600 : shift = _mm256_castsi128_si256(_mm_srli_epi16(
2176 : _mm_and_si128(
2177 : _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
2178 : 1));
2179 : } else {
2180 10371700 : a1_x128 = _mm_srli_si128(a0_x128, 1);
2181 10371700 : a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
2182 20743400 : a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
2183 :
2184 41486900 : shift = _mm256_castsi128_si256(
2185 : _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
2186 : }
2187 29174500 : a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
2188 29174500 : a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
2189 : }
2190 :
2191 : // y calc
2192 : __m128i a0_y, a1_y, shifty;
2193 25063700 : if (base_x < min_base_x) {
2194 : DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
2195 : __m128i y_c128, base_y_c128, mask128;
2196 43509300 : r6 = _mm_set1_epi16(r << 6);
2197 43509300 : y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2198 21754600 : base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2199 21754600 : mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2200 21754600 : base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2201 : _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2202 :
2203 21754600 : a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2204 21754600 : left[base_y_c[2]], left[base_y_c[3]],
2205 21754600 : left[base_y_c[4]], left[base_y_c[5]],
2206 21754600 : left[base_y_c[6]], left[base_y_c[7]]);
2207 65263900 : base_y_c128 = _mm_add_epi16(
2208 : base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
2209 : _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2210 :
2211 43509300 : a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2212 21754600 : left[base_y_c[2]], left[base_y_c[3]],
2213 21754600 : left[base_y_c[4]], left[base_y_c[5]],
2214 21754600 : left[base_y_c[6]], left[base_y_c[7]]);
2215 :
2216 21754600 : if (upsample_left) {
2217 15280900 : shifty = _mm_srli_epi16(
2218 : _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
2219 : } else {
2220 33322000 : shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2221 : }
2222 :
2223 21754600 : a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2224 21754600 : a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2225 21754600 : shift = _mm256_inserti128_si256(shift, shifty, 1);
2226 : }
2227 :
2228 25063700 : diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
2229 25063700 : a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
2230 25063700 : a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2231 :
2232 25063700 : b = _mm256_mullo_epi16(diff, shift);
2233 25063700 : res = _mm256_add_epi16(a32, b);
2234 25063700 : res = _mm256_srli_epi16(res, 5);
2235 :
2236 50127300 : resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
2237 : _mm256_castsi256_si128(res));
2238 25063700 : resy = _mm256_extracti128_si256(res, 1);
2239 25063700 : resy = _mm_packus_epi16(resy, resy);
2240 :
2241 50127300 : resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
2242 25063700 : _mm_storel_epi64((__m128i *)(dst), resxy);
2243 25063700 : dst += stride;
2244 : }
2245 2423490 : }
2246 :
2247 2437990 : static void dr_prediction_z2_HxW_avx2(int32_t H, int32_t W, uint8_t *dst,
2248 : ptrdiff_t stride, const uint8_t *above,
2249 : const uint8_t *left, int32_t upsample_above,
2250 : int32_t upsample_left, int32_t dx, int32_t dy) {
2251 : // here upsample_above and upsample_left are 0 by design of
2252 : // av1_use_intra_edge_upsample
2253 2437990 : const int32_t min_base_x = -1;
2254 2437990 : const int32_t min_base_y = -1;
2255 : (void)upsample_above;
2256 : (void)upsample_left;
2257 2437990 : const int32_t frac_bits_x = 6;
2258 2437990 : const int32_t frac_bits_y = 6;
2259 :
2260 : __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
2261 : __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
2262 : __m128i a0_x128, a1_x128;
2263 :
2264 : DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
2265 2437990 : a16 = _mm256_set1_epi16(16);
2266 2437990 : c1 = _mm256_srli_epi16(a16, 4);
2267 4875990 : min_base_y256 = _mm256_set1_epi16(min_base_y);
2268 2437990 : c3f = _mm256_set1_epi16(0x3f);
2269 4875990 : dy256 = _mm256_set1_epi16(dy);
2270 : c0123 =
2271 2437990 : _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2272 2437990 : c1234 = _mm256_add_epi16(c0123, c1);
2273 :
2274 44162600 : for (int r = 0; r < H; r++) {
2275 : __m256i b, res, shift, j256, r6, ydx;
2276 : __m128i resx, resy;
2277 : __m128i resxy;
2278 41724600 : int y = r + 1;
2279 41724600 : ydx = _mm256_set1_epi16(y * dx);
2280 :
2281 41724600 : int base_x = (-y * dx) >> frac_bits_x;
2282 111368000 : for (int j = 0; j < W; j += 16) {
2283 69643300 : j256 = _mm256_set1_epi16(j);
2284 69643300 : int base_shift = 0;
2285 69643300 : if ((base_x + j) < (min_base_x - 1)) {
2286 46967300 : base_shift = (min_base_x - (base_x + j) - 1);
2287 : }
2288 69643300 : int base_min_diff = (min_base_x - base_x - j);
2289 69643300 : if (base_min_diff > 16) {
2290 25858400 : base_min_diff = 16;
2291 : } else {
2292 43784900 : if (base_min_diff < 0)
2293 15478600 : base_min_diff = 0;
2294 : }
2295 :
2296 69643300 : if (base_shift < 16) {
2297 43865600 : a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
2298 : a1_x128 =
2299 43865600 : _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
2300 43865600 : a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
2301 87731300 : a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
2302 :
2303 43865600 : a0_x = _mm256_cvtepu8_epi16(a0_x128);
2304 43865600 : a1_x = _mm256_cvtepu8_epi16(a1_x128);
2305 :
2306 87731300 : r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
2307 131597000 : shift = _mm256_srli_epi16(
2308 : _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
2309 :
2310 43865600 : diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
2311 43865600 : a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
2312 43865600 : a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2313 :
2314 43865600 : b = _mm256_mullo_epi16(diff, shift);
2315 43865600 : res = _mm256_add_epi16(a32, b);
2316 43865600 : res = _mm256_srli_epi16(res, 5); // 16 16-bit values
2317 131597000 : resx = _mm256_castsi256_si128(_mm256_packus_epi16(
2318 43865600 : res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2319 : } else {
2320 25777700 : resx = _mm_setzero_si128();
2321 : }
2322 :
2323 : // y calc
2324 69643300 : if (base_x < min_base_x) {
2325 : __m256i c256, y_c256, base_y_c256, mask256, mul16;
2326 128689000 : r6 = _mm256_set1_epi16(r << 6);
2327 64344300 : c256 = _mm256_add_epi16(j256, c1234);
2328 193033000 : mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
2329 : _mm256_srli_epi16(min_base_y256, 1));
2330 64344300 : y_c256 = _mm256_sub_epi16(r6, mul16);
2331 :
2332 64344300 : base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
2333 64344300 : mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
2334 64344300 : base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2335 : _mm256_store_si256((__m256i *)base_y_c, base_y_c256); /**/
2336 :
2337 64344300 : a0_y = _mm256_setr_epi16(
2338 64344300 : left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2339 64344300 : left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2340 64344300 : left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2341 64344300 : left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2342 64344300 : left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2343 64344300 : left[base_y_c[15]]);
2344 64344300 : base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
2345 : _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2346 :
2347 64344300 : a1_y = _mm256_setr_epi16(
2348 64344300 : left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2349 64344300 : left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2350 64344300 : left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2351 64344300 : left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2352 64344300 : left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2353 64344300 : left[base_y_c[15]]);
2354 :
2355 128689000 : shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
2356 :
2357 64344300 : diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
2358 64344300 : a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
2359 64344300 : a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2360 :
2361 64344300 : b = _mm256_mullo_epi16(diff, shifty);
2362 64344300 : res = _mm256_add_epi16(a32, b);
2363 64344300 : res = _mm256_srli_epi16(res, 5); // 16 16-bit values
2364 193033000 : resy = _mm256_castsi256_si128(_mm256_packus_epi16(
2365 64344300 : res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2366 : } else {
2367 5299010 : resy = _mm_setzero_si128();
2368 : }
2369 69643300 : resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
2370 69643300 : _mm_storeu_si128((__m128i *)(dst + j), resxy);
2371 : } // for j
2372 41724600 : dst += stride;
2373 : }
2374 2437990 : }
2375 :
2376 : // Directional prediction, zone 2: 90 < angle < 180
2377 7384800 : void eb_av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int32_t bw, int32_t bh,
2378 : const uint8_t *above, const uint8_t *left,
2379 : int32_t upsample_above, int32_t upsample_left, int32_t dx,
2380 : int32_t dy) {
2381 7384800 : assert(dx > 0);
2382 7384800 : assert(dy > 0);
2383 7384800 : switch (bw) {
2384 2524300 : case 4:
2385 2524300 : dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
2386 : upsample_left, dx, dy);
2387 2524330 : break;
2388 2423460 : case 8:
2389 2423460 : dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
2390 : upsample_left, dx, dy);
2391 :
2392 2423480 : break;
2393 2437030 : default:
2394 2437030 : dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2395 : upsample_above, upsample_left, dx, dy);
2396 2437980 : break;
2397 : }
2398 7385800 : return;
2399 : }
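Per pixel, the rule that the zone 2 kernels above evaluate vector-wise (computing both the above-row and left-column candidates and blending them with BaseMask) reduces to the scalar sketch below. The helper name is illustrative; it assumes upsample_above == upsample_left == 0, and that above[-1]/left[-1] hold the top-left corner sample as in the AV1 reference-edge layout.

static void dr_z2_scalar_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                const uint8_t *above, const uint8_t *left,
                                int dx, int dy) {
    const int min_base_x = -1;                       // with upsample_above == 0
    for (int r = 0; r < bh; r++, dst += stride) {
        for (int c = 0; c < bw; c++) {
            int val;
            const int x = (c << 6) - (r + 1) * dx;   // projection onto the above row
            const int base_x = x >> 6;
            if (base_x >= min_base_x) {
                const int shift = (x & 0x3f) >> 1;
                val = (above[base_x] * 32 + 16 +
                       (above[base_x + 1] - above[base_x]) * shift) >> 5;
            } else {                                 // fell off the left end: use the left column
                const int y = (r << 6) - (c + 1) * dy;
                const int base_y = y >> 6;
                const int shift = (y & 0x3f) >> 1;
                val = (left[base_y] * 32 + 16 +
                       (left[base_y + 1] - left[base_y]) * shift) >> 5;
            }
            dst[c] = (uint8_t)val;
        }
    }
}

base_shift and base_min_diff in the kernels above are this per-pixel test hoisted out of the loop: base_min_diff counts how many leading pixels of the 4/8/16-wide run take the left-column branch, and the BaseMask blend stitches the two partial results together.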
2400 :
2401 : // z3 functions
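Zone 3 (180 < angle < 270) reads only the left edge and advances down that edge for each output column, which is why the helpers below reuse the zone 1 kernels with `left` as the reference and then transpose into dst instead of gathering down strided columns. A scalar sketch of the same output, illustrative only and with upsample_left == 0 assumed:

static void dr_z3_scalar_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                const uint8_t *left, int dy) {
    const int max_base_y = bw + bh - 1;      // usable length of the left edge
    for (int c = 0; c < bw; c++) {
        const int y = (c + 1) * dy;          // projected position, 1/64-pel units
        const int shift = (y & 0x3f) >> 1;
        for (int r = 0; r < bh; r++) {
            const int base = (y >> 6) + r;
            dst[r * stride + c] = (base < max_base_y)
                ? (uint8_t)((left[base] * 32 + 16 +
                             (left[base + 1] - left[base]) * shift) >> 5)
                : left[max_base_y];
        }
    }
}

Swapping r and c turns this into the zone 1 loop over `left`, so the small blocks below transpose in registers while the large ones (e.g. dr_prediction_z3_64x64_avx2) predict into a temporary buffer and call transpose().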
2402 185813 : static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
2403 : __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
2404 185813 : w0 = _mm_unpacklo_epi8(x[0], x[1]);
2405 185813 : w1 = _mm_unpacklo_epi8(x[2], x[3]);
2406 185813 : w2 = _mm_unpackhi_epi8(x[0], x[1]);
2407 371626 : w3 = _mm_unpackhi_epi8(x[2], x[3]);
2408 :
2409 185813 : ww0 = _mm_unpacklo_epi16(w0, w1);
2410 185813 : ww1 = _mm_unpacklo_epi16(w2, w3);
2411 185813 : ww2 = _mm_unpackhi_epi16(w0, w1);
2412 185813 : ww3 = _mm_unpackhi_epi16(w2, w3);
2413 :
2414 185813 : w0 = _mm_unpacklo_epi32(ww0, ww1);
2415 185813 : w2 = _mm_unpacklo_epi32(ww2, ww3);
2416 185813 : w1 = _mm_unpackhi_epi32(ww0, ww1);
2417 185813 : w3 = _mm_unpackhi_epi32(ww2, ww3);
2418 :
2419 185813 : d[0] = _mm_unpacklo_epi64(w0, w2);
2420 185813 : d[1] = _mm_unpackhi_epi64(w0, w2);
2421 185813 : d[2] = _mm_unpacklo_epi64(w1, w3);
2422 185813 : d[3] = _mm_unpackhi_epi64(w1, w3);
2423 :
2424 185813 : d[4] = _mm_srli_si128(d[0], 8);
2425 185813 : d[5] = _mm_srli_si128(d[1], 8);
2426 185813 : d[6] = _mm_srli_si128(d[2], 8);
2427 185813 : d[7] = _mm_srli_si128(d[3], 8);
2428 :
2429 185813 : d[8] = _mm_srli_si128(d[0], 4);
2430 185813 : d[9] = _mm_srli_si128(d[1], 4);
2431 185813 : d[10] = _mm_srli_si128(d[2], 4);
2432 185813 : d[11] = _mm_srli_si128(d[3], 4);
2433 :
2434 185813 : d[12] = _mm_srli_si128(d[0], 12);
2435 185813 : d[13] = _mm_srli_si128(d[1], 12);
2436 185813 : d[14] = _mm_srli_si128(d[2], 12);
2437 185813 : d[15] = _mm_srli_si128(d[3], 12);
2438 185813 : }
2439 :
2440 424346 : static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
2441 : __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
2442 : __m256i w10, w11, w12, w13, w14, w15;
2443 :
2444 424346 : w0 = _mm256_unpacklo_epi8(x[0], x[1]);
2445 424346 : w1 = _mm256_unpacklo_epi8(x[2], x[3]);
2446 424346 : w2 = _mm256_unpacklo_epi8(x[4], x[5]);
2447 424346 : w3 = _mm256_unpacklo_epi8(x[6], x[7]);
2448 :
2449 424346 : w8 = _mm256_unpacklo_epi8(x[8], x[9]);
2450 424346 : w9 = _mm256_unpacklo_epi8(x[10], x[11]);
2451 424346 : w10 = _mm256_unpacklo_epi8(x[12], x[13]);
2452 848692 : w11 = _mm256_unpacklo_epi8(x[14], x[15]);
2453 :
2454 424346 : w4 = _mm256_unpacklo_epi16(w0, w1);
2455 424346 : w5 = _mm256_unpacklo_epi16(w2, w3);
2456 424346 : w12 = _mm256_unpacklo_epi16(w8, w9);
2457 424346 : w13 = _mm256_unpacklo_epi16(w10, w11);
2458 :
2459 424346 : w6 = _mm256_unpacklo_epi32(w4, w5);
2460 424346 : w7 = _mm256_unpackhi_epi32(w4, w5);
2461 424346 : w14 = _mm256_unpacklo_epi32(w12, w13);
2462 424346 : w15 = _mm256_unpackhi_epi32(w12, w13);
2463 :
2464 : // Store first 4-line result
2465 424346 : d[0] = _mm256_unpacklo_epi64(w6, w14);
2466 424346 : d[1] = _mm256_unpackhi_epi64(w6, w14);
2467 424346 : d[2] = _mm256_unpacklo_epi64(w7, w15);
2468 848692 : d[3] = _mm256_unpackhi_epi64(w7, w15);
2469 :
2470 424346 : w4 = _mm256_unpackhi_epi16(w0, w1);
2471 424346 : w5 = _mm256_unpackhi_epi16(w2, w3);
2472 424346 : w12 = _mm256_unpackhi_epi16(w8, w9);
2473 424346 : w13 = _mm256_unpackhi_epi16(w10, w11);
2474 :
2475 424346 : w6 = _mm256_unpacklo_epi32(w4, w5);
2476 424346 : w7 = _mm256_unpackhi_epi32(w4, w5);
2477 424346 : w14 = _mm256_unpacklo_epi32(w12, w13);
2478 424346 : w15 = _mm256_unpackhi_epi32(w12, w13);
2479 :
2480 : // Store second 4-line result
2481 424346 : d[4] = _mm256_unpacklo_epi64(w6, w14);
2482 424346 : d[5] = _mm256_unpackhi_epi64(w6, w14);
2483 424346 : d[6] = _mm256_unpacklo_epi64(w7, w15);
2484 424346 : d[7] = _mm256_unpackhi_epi64(w7, w15);
2485 :
2486 : // upper half
2487 424346 : w0 = _mm256_unpackhi_epi8(x[0], x[1]);
2488 424346 : w1 = _mm256_unpackhi_epi8(x[2], x[3]);
2489 424346 : w2 = _mm256_unpackhi_epi8(x[4], x[5]);
2490 424346 : w3 = _mm256_unpackhi_epi8(x[6], x[7]);
2491 :
2492 424346 : w8 = _mm256_unpackhi_epi8(x[8], x[9]);
2493 424346 : w9 = _mm256_unpackhi_epi8(x[10], x[11]);
2494 424346 : w10 = _mm256_unpackhi_epi8(x[12], x[13]);
2495 848692 : w11 = _mm256_unpackhi_epi8(x[14], x[15]);
2496 :
2497 424346 : w4 = _mm256_unpacklo_epi16(w0, w1);
2498 424346 : w5 = _mm256_unpacklo_epi16(w2, w3);
2499 424346 : w12 = _mm256_unpacklo_epi16(w8, w9);
2500 424346 : w13 = _mm256_unpacklo_epi16(w10, w11);
2501 :
2502 424346 : w6 = _mm256_unpacklo_epi32(w4, w5);
2503 424346 : w7 = _mm256_unpackhi_epi32(w4, w5);
2504 424346 : w14 = _mm256_unpacklo_epi32(w12, w13);
2505 424346 : w15 = _mm256_unpackhi_epi32(w12, w13);
2506 :
2507 : // Store first 4-line result
2508 424346 : d[8] = _mm256_unpacklo_epi64(w6, w14);
2509 424346 : d[9] = _mm256_unpackhi_epi64(w6, w14);
2510 424346 : d[10] = _mm256_unpacklo_epi64(w7, w15);
2511 848692 : d[11] = _mm256_unpackhi_epi64(w7, w15);
2512 :
2513 424346 : w4 = _mm256_unpackhi_epi16(w0, w1);
2514 424346 : w5 = _mm256_unpackhi_epi16(w2, w3);
2515 424346 : w12 = _mm256_unpackhi_epi16(w8, w9);
2516 424346 : w13 = _mm256_unpackhi_epi16(w10, w11);
2517 :
2518 424346 : w6 = _mm256_unpacklo_epi32(w4, w5);
2519 424346 : w7 = _mm256_unpackhi_epi32(w4, w5);
2520 424346 : w14 = _mm256_unpacklo_epi32(w12, w13);
2521 424346 : w15 = _mm256_unpackhi_epi32(w12, w13);
2522 :
2523 : // Store second 4-line result
2524 424346 : d[12] = _mm256_unpacklo_epi64(w6, w14);
2525 424346 : d[13] = _mm256_unpackhi_epi64(w6, w14);
2526 424346 : d[14] = _mm256_unpacklo_epi64(w7, w15);
2527 424346 : d[15] = _mm256_unpackhi_epi64(w7, w15);
2528 424346 : }
2529 :
2530 751669 : static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
2531 : uint8_t *dst, ptrdiff_t pitchDst) {
2532 : __m128i r[16];
2533 : __m128i d[16];
2534 12777600 : for (int j = 0; j < 16; j++) {
2535 24051800 : r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
2536 : }
2537 751669 : transpose16x16_sse2(r, d);
2538 12777900 : for (int j = 0; j < 16; j++) {
2539 12026200 : _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
2540 : }
2541 751714 : }
2542 :
2543 79074 : static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
2544 : ptrdiff_t pitchDst, int32_t width, int32_t height) {
2545 374032 : for (int j = 0; j < height; j += 16)
2546 1046630 : for (int i = 0; i < width; i += 16)
2547 751669 : transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
2548 751669 : dst + j * pitchDst + i, pitchDst);
2549 79075 : }
2550 :
2551 508543 : static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
2552 : const uint8_t *left, int32_t upsample_left,
2553 : int32_t dy) {
2554 : __m128i dstvec[4], d[4];
2555 :
2556 : dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
2557 508543 : transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2558 : &d[0], &d[1], &d[2], &d[3]);
2559 :
2560 508537 : *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
2561 508537 : *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
2562 508537 : *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
2563 508537 : *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
2564 508537 : return;
2565 : }
2566 :
2567 503028 : static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
2568 : const uint8_t *left, int32_t upsample_left,
2569 : int32_t dy) {
2570 : __m128i dstvec[8], d[8];
2571 :
2572 : dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
2573 503028 : transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
2574 : &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
2575 : &d[3]);
2576 :
2577 503026 : _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
2578 503026 : _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
2579 503026 : _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
2580 503026 : _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
2581 503026 : _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
2582 503026 : _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
2583 503026 : _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
2584 503026 : _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
2585 503026 : }
2586 :
2587 262039 : static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
2588 : const uint8_t *left, int32_t upsample_left,
2589 : int32_t dy) {
2590 : __m128i dstvec[4], d[8];
2591 :
2592 : dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
2593 :
2594 262039 : transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
2595 : &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
2596 2358360 : for (int32_t i = 0; i < 8; i++) {
2597 4192630 : *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
2598 : }
2599 262040 : }
2600 :
2601 235052 : static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
2602 : const uint8_t *left, int32_t upsample_left,
2603 : int32_t dy) {
2604 : __m128i dstvec[8], d[4];
2605 :
2606 : dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
2607 235052 : transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2608 : &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
2609 : &d[1], &d[2], &d[3]);
2610 235052 : _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
2611 235052 : _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
2612 235052 : _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
2613 235052 : _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
2614 235052 : }
2615 :
2616 143056 : static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
2617 : const uint8_t *left, int32_t upsample_left,
2618 : int32_t dy) {
2619 : __m128i dstvec[8], d[8];
2620 :
2621 : dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
2622 :
2623 143056 : transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
2624 : dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
2625 : d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
2626 1287500 : for (int32_t i = 0; i < 8; i++) {
2627 1144440 : _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
2628 1144440 : _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
2629 1144440 : _mm_srli_si128(d[i], 8));
2630 : }
2631 143056 : }
2632 :
2633 136872 : static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
2634 : const uint8_t *left, int32_t upsample_left,
2635 : int32_t dy) {
2636 : __m128i dstvec[16], d[16];
2637 :
2638 : dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
2639 :
2640 136872 : transpose16x8_8x16_sse2(
2641 : &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
2642 : &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
2643 : &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
2644 : &d[3], &d[4], &d[5], &d[6], &d[7]);
2645 :
2646 1231830 : for (int32_t i = 0; i < 8; i++) {
2647 1094960 : _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2648 : }
2649 136871 : }
2650 :
2651 185811 : static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
2652 : const uint8_t *left, int32_t upsample_left,
2653 : int32_t dy) {
2654 : __m128i dstvec[4], d[16];
2655 :
2656 : dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
2657 :
2658 185811 : transpose4x16_sse2(dstvec, d);
2659 3158810 : for (int32_t i = 0; i < 16; i++) {
2660 5945990 : *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
2661 : }
2662 185813 : }
2663 :
2664 181559 : static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
2665 : const uint8_t *left, int32_t upsample_left,
2666 : int32_t dy) {
2667 : __m128i dstvec[16], d[8];
2668 : dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
2669 :
2670 907799 : for (int32_t i = 4; i < 8; i++) {
2671 726240 : d[i] = _mm_setzero_si128();
2672 : }
2673 181559 : transpose16x8_8x16_sse2(
2674 : &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
2675 : &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
2676 : &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
2677 : &d[3], &d[4], &d[5], &d[6], &d[7]);
2678 :
2679 907800 : for (int32_t i = 0; i < 4; i++) {
2680 726240 : _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2681 : }
2682 181560 : }
2683 :
2684 95468 : static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
2685 : const uint8_t *left, int32_t upsample_left,
2686 : int32_t dy) {
2687 : __m256i dstvec[16], d[16];
2688 :
2689 : dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
2690 859209 : for (int32_t i = 8; i < 16; i++) {
2691 763741 : dstvec[i] = _mm256_setzero_si256();
2692 : }
2693 95468 : transpose16x32_avx2(dstvec, d);
2694 :
2695 1622920 : for (int32_t i = 0; i < 16; i++) {
2696 3054900 : _mm_storel_epi64((__m128i *)(dst + i * stride),
2697 : _mm256_castsi256_si128(d[i]));
2698 : }
2699 1622910 : for (int32_t i = 0; i < 16; i++) {
2700 1527440 : _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
2701 1527440 : _mm256_extracti128_si256(d[i], 1));
2702 : }
2703 95468 : }
2704 :
2705 69623 : static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
2706 : const uint8_t *left, int32_t upsample_left,
2707 : int32_t dy) {
2708 : __m128i dstvec[32], d[16];
2709 :
2710 : dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
2711 :
2712 69623 : transpose16x8_8x16_sse2(
2713 : &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
2714 : &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
2715 : &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
2716 : &d[3], &d[4], &d[5], &d[6], &d[7]);
2717 69623 : transpose16x8_8x16_sse2(
2718 : &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
2719 : &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
2720 : &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
2721 : &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
2722 : &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
2723 : &d[6 + 8], &d[7 + 8]);
2724 :
2725 626607 : for (int32_t i = 0; i < 8; i++) {
2726 556984 : _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2727 556984 : _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
2728 : }
2729 69623 : }
2730 :
2731 303524 : static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
2732 : const uint8_t *left, int32_t upsample_left,
2733 : int32_t dy) {
2734 : __m128i dstvec[16], d[16];
2735 :
2736 : dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
2737 303524 : transpose16x16_sse2(dstvec, d);
2738 :
2739 5159860 : for (int32_t i = 0; i < 16; i++) {
2740 4856340 : _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2741 : }
2742 303524 : }
2743 :
2744 133774 : static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
2745 : const uint8_t *left, int32_t upsample_left,
2746 : int32_t dy) {
2747 : __m256i dstvec[32], d[32];
2748 :
2749 : dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
2750 133774 : transpose16x32_avx2(dstvec, d);
2751 133774 : transpose16x32_avx2(dstvec + 16, d + 16);
2752 2274130 : for (int32_t j = 0; j < 16; j++) {
2753 4280710 : _mm_storeu_si128((__m128i *)(dst + j * stride),
2754 : _mm256_castsi256_si128(d[j]));
2755 2140360 : _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
2756 2140360 : _mm256_castsi256_si128(d[j + 16]));
2757 : }
2758 2274120 : for (int32_t j = 0; j < 16; j++) {
2759 2140340 : _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
2760 2140340 : _mm256_extracti128_si256(d[j], 1));
2761 2140340 : _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
2762 2140340 : _mm256_extracti128_si256(d[j + 16], 1));
2763 : }
2764 133774 : }
2765 :
2766 27692 : static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
2767 : const uint8_t *left, int32_t upsample_left,
2768 : int32_t dy) {
2769 : DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
2770 27692 : dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
2771 27693 : transpose(dstT, 64, dst, stride, 64, 64);
2772 27693 : }
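
For this and the other large shapes below (32x64, 64x32, 16x64), the zone-1 result is first rendered into a stack temporary and then copied out through the generic transpose() helper instead of being transposed in registers like the smaller sizes above; that costs one extra pass over the block but avoids keeping 64-wide rows live in ymm registers. The element mapping of the call above, assuming transpose() follows the same (src, pitchSrc, dst, pitchDst, width, height) convention as highbd_transpose later in this file, is:

    /* for 0 <= r, c < 64 */
    dst[r * stride + c] = dstT[c * 64 + r];
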
2773 :
2774 61332 : static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
2775 : const uint8_t *left, int32_t upsample_left,
2776 : int32_t dy) {
2777 : __m256i dstvec[16], d[16];
2778 :
2779 : dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
2780 61332 : transpose16x32_avx2(dstvec, d);
2781 : // store
2782 1042660 : for (int32_t j = 0; j < 16; j++) {
2783 1962650 : _mm_storeu_si128((__m128i *)(dst + j * stride),
2784 : _mm256_castsi256_si128(d[j]));
2785 981325 : _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
2786 981325 : _mm256_extracti128_si256(d[j], 1));
2787 : }
2788 61333 : }
2789 :
2790 48602 : static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
2791 : const uint8_t *left, int32_t upsample_left,
2792 : int32_t dy) {
2793 : __m128i dstvec[32], d[16];
2794 :
2795 : dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
2796 :
2797 145808 : for (int32_t i = 0; i < 32; i += 16) {
2798 97205 : transpose16x16_sse2((dstvec + i), d);
2799 1652460 : for (int32_t j = 0; j < 16; j++) {
2800 1555250 : _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
2801 : }
2802 : }
2803 48603 : }
2804 :
2805 15113 : static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
2806 : const uint8_t *left, int32_t upsample_left,
2807 : int32_t dy) {
2808 : EB_ALIGN(32) uint8_t dstT[64 * 32];
2809 15113 : dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
2810 15113 : transpose(dstT, 64, dst, stride, 32, 64);
2811 15113 : }
2812 :
2813 10667 : static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
2814 : const uint8_t *left, int32_t upsample_left,
2815 : int32_t dy) {
2816 : EB_ALIGN(32) uint8_t dstT[32 * 64];
2817 10667 : dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
2818 10667 : transpose(dstT, 32, dst, stride, 64, 32);
2819 10667 : return;
2820 : }
2821 :
2822 25601 : static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
2823 : const uint8_t *left, int32_t upsample_left,
2824 : int32_t dy) {
2825 : EB_ALIGN(32) uint8_t dstT[64 * 16];
2826 25601 : dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
2827 25601 : transpose(dstT, 64, dst, stride, 16, 64);
2828 25601 : }
2829 :
2830 17044 : static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
2831 : const uint8_t *left, int32_t upsample_left,
2832 : int32_t dy) {
2833 : __m128i dstvec[64], d[16];
2834 :
2835 : dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
2836 :
2837 85220 : for (int32_t i = 0; i < 64; i += 16) {
2838 68176 : transpose16x16_sse2((dstvec + i), d);
2839 1158990 : for (int32_t j = 0; j < 16; j++) {
2840 1090820 : _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
2841 : }
2842 : }
2843 17044 : }
2844 :
2845 2964200 : void eb_av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int32_t bw, int32_t bh,
2846 : const uint8_t *above, const uint8_t *left,
2847 : int32_t upsample_left, int32_t dx, int32_t dy) {
2848 : (void)above;
2849 : (void)dx;
2850 2964200 : assert(dx == 1);
2851 2964200 : assert(dy > 0);
2852 :
2853 2964200 : if (bw == bh) {
2854 1476520 : switch (bw) {
2855 508542 : case 4:
2856 508542 : dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
2857 508535 : break;
2858 503027 : case 8:
2859 503027 : dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
2860 503026 : break;
2861 303525 : case 16:
2862 303525 : dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
2863 303524 : break;
2864 133774 : case 32:
2865 133774 : dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
2866 133773 : break;
2867 27693 : case 64:
2868 27693 : dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
2869 27693 : break;
2870 : }
2871 1476510 : }
2872 : else {
2873 1487680 : if (bw < bh) {
2874 788407 : if (bw + bw == bh) {
2875 481537 : switch (bw) {
2876 262039 : case 4:
2877 262039 : dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
2878 262040 : break;
2879 143056 : case 8:
2880 143056 : dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
2881 143056 : break;
2882 61332 : case 16:
2883 61332 : dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
2884 61333 : break;
2885 15113 : case 32:
2886 15113 : dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
2887 15113 : break;
2888 : }
2889 481539 : }
2890 : else {
2891 306870 : switch (bw) {
2892 185811 : case 4:
2893 185811 : dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
2894 185813 : break;
2895 95468 : case 8:
2896 95468 : dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
2897 95467 : break;
2898 25601 : case 16:
2899 25601 : dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
2900 25601 : break;
2901 : }
2902 788410 : }
2903 : }
2904 : else {
2905 699273 : if (bh + bh == bw) {
2906 431194 : switch (bh) {
2907 235054 : case 4:
2908 235054 : dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
2909 235052 : break;
2910 136872 : case 8:
2911 136872 : dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
2912 136869 : break;
2913 48603 : case 16:
2914 48603 : dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
2915 48603 : break;
2916 10667 : case 32:
2917 10667 : dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
2918 10667 : break;
2919 : }
2920 431189 : }
2921 : else {
2922 268079 : switch (bh) {
2923 181559 : case 4:
2924 181559 : dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
2925 181560 : break;
2926 69623 : case 8:
2927 69623 : dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
2928 69623 : break;
2929 17044 : case 16:
2930 17044 : dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
2931 17044 : break;
2932 : }
2933 2964190 : }
2934 : }
2935 : }
2936 2964190 : return;
2937 : }
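
Zone-3 prediction (directional angles between 180 and 270 degrees) reads only the left edge, so every case above reuses a zone-1 (top-edge) kernel with `left` as the reference array and transposes the result into dst (dx == 1 is asserted; dy is the per-row step). A scalar sketch of the two-tap interpolation those z1 helpers vectorize, simplified to ignore edge upsampling and the max-base clamp they also handle (names are illustrative, not the actual helper API):

    /* row r of the pre-transpose block, built from the left edge */
    int y     = (r + 1) * dy;      /* distance along the edge          */
    int base  = y >> 6;            /* integer part: reference index    */
    int shift = (y & 0x3f) >> 1;   /* fractional part mapped to 0..31  */
    for (int c = 0; c < w; ++c)
        out[c] = (left[base + c] * (32 - shift) +
                  left[base + c + 1] * shift + 16) >> 5;
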
2938 :
2939 : static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
2940 : { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
2941 : { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
2942 : { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
2943 : { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
2944 : { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
2945 : { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
2946 : { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
2947 : { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
2948 : };
2949 :
2950 : static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
2951 : {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
2952 : {0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13},
2953 : {0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11},
2954 : {0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9}};
2955 :
2956 : static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
2957 : {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
2958 : 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31},
2959 : {0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
2960 : 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29},
2961 : {0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25,
2962 : 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27},
2963 : {0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
2964 : 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25},
2965 : {0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21,
2966 : 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23},
2967 : {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
2968 : 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21},
2969 : {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
2970 : 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19},
2971 : {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
2972 : 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17}};
2973 :
2974 : static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
2975 : { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2976 : { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2977 : { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2978 : { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2979 : { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2980 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2981 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2982 : 0 },
2983 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
2984 : 0, 0 },
2985 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
2986 : 0, 0, 0, 0 },
2987 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
2988 : 0, 0, 0, 0, 0, 0 },
2989 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
2990 : 0xffff, 0, 0, 0, 0, 0, 0 },
2991 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
2992 : 0xffff, 0xffff, 0, 0, 0, 0, 0 },
2993 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
2994 : 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
2995 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
2996 : 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
2997 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
2998 : 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
2999 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
3000 : 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
3001 : { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
3002 : 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
3003 : };
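
These three tables drive the high-bitdepth zone-2 kernels that follow. HighbdLoadMaskx[k] is a 16-bit-lane shuffle that, applied after an unaligned load from above + base_x + k, moves the loaded samples up by k lanes (replicating the first valid sample into the leading lanes) so that lane i again corresponds to above[base_x + i]; HighbdEvenOddMaskx4 and HighbdEvenOddMaskx deinterleave even and odd samples when the edge has been upsampled; HighbdBaseMask[k] sets all ones in the first k lanes and feeds the final blendv. Per output lane i the blend amounts to (a sketch, not literal code from this file):

    dst[i] = (i < base_min_diff) ? result_from_left_edge[i]
                                 : result_from_above_edge[i];
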
3004 :
3005 0 : static void highbd_dr_prediction_z2_Nx4_avx2(
3006 : int32_t N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
3007 : const uint16_t *left, int32_t upsample_above, int32_t upsample_left, int32_t dx,
3008 : int32_t dy) {
3009 0 : const int32_t min_base_x = -(1 << upsample_above);
3010 0 : const int32_t min_base_y = -(1 << upsample_left);
3011 0 : const int32_t frac_bits_x = 6 - upsample_above;
3012 0 : const int32_t frac_bits_y = 6 - upsample_left;
3013 :
3014 0 : assert(dx > 0);
3015 : // pre-filter above pixels
3016 : // store in temp buffers:
3017 : // above[x] * 32 + 16
3018 : // above[x+1] - above[x]
3019 : // final pixels will be calculated as:
3020 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
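  // Scalar sketch of the above-edge path in this loop (illustrative indices
  // only; the upsample_above == 1 branch additionally deinterleaves samples):
  //   shift  = (((c << 6) - (r + 1) * dx) & 0x3f) >> 1;
  //   dst[c] = (above[base_x + c] * 32 + 16 +
  //             (above[base_x + c + 1] - above[base_x + c]) * shift) >> 5;
  // Lanes with base_x + c < min_base_x are overwritten further down by the
  // matching left-edge computation, selected via HighbdBaseMask[base_min_diff].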
3021 : __m256i a0_x, a1_x, a32, a16;
3022 : __m256i diff;
3023 : __m128i c3f, min_base_y128;
3024 :
3025 0 : a16 = _mm256_set1_epi16(16);
3026 0 : c3f = _mm_set1_epi16(0x3f);
3027 0 : min_base_y128 = _mm_set1_epi16(min_base_y);
3028 :
3029 0 : for (int32_t r = 0; r < N; r++) {
3030 : __m256i b, res, shift;
3031 : __m128i resx, resy, resxy;
3032 : __m128i a0_x128, a1_x128;
3033 0 : int32_t y = r + 1;
3034 0 : int32_t base_x = (-y * dx) >> frac_bits_x;
3035 0 : int32_t base_shift = 0;
3036 0 : if (base_x < (min_base_x - 1)) {
3037 0 : base_shift = (min_base_x - base_x - 1) >> upsample_above;
3038 : }
3039 0 : int32_t base_min_diff =
3040 0 : (min_base_x - base_x + upsample_above) >> upsample_above;
3041 0 : if (base_min_diff > 4) {
3042 0 : base_min_diff = 4;
3043 : } else {
3044 0 : if (base_min_diff < 0)
3045 0 : base_min_diff = 0;
3046 : }
3047 :
3048 0 : if (base_shift > 3) {
3049 0 : a0_x = _mm256_setzero_si256();
3050 0 : a1_x = _mm256_setzero_si256();
3051 0 : shift = _mm256_setzero_si256();
3052 : } else {
3053 0 : a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3054 0 : if (upsample_above) {
3055 0 : a0_x128 = _mm_shuffle_epi8(a0_x128,
3056 0 : *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
3057 0 : a1_x128 = _mm_srli_si128(a0_x128, 8);
3058 :
3059 0 : shift = _mm256_castsi128_si256(_mm_srli_epi16(
3060 : _mm_and_si128(
3061 0 : _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
3062 0 : (2 << 6) - y * dx,
3063 0 : (3 << 6) - y * dx, 0, 0, 0, 0),
3064 : upsample_above),
3065 : c3f),
3066 : 1));
3067 : } else {
3068 : a0_x128 =
3069 0 : _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
3070 0 : a1_x128 = _mm_srli_si128(a0_x128, 2);
3071 :
3072 0 : shift = _mm256_castsi128_si256(_mm_srli_epi16(
3073 0 : _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
3074 0 : (2 << 6) - y * dx, (3 << 6) - y * dx,
3075 : 0, 0, 0, 0),
3076 : c3f),
3077 : 1));
3078 : }
3079 0 : a0_x = _mm256_castsi128_si256(a0_x128);
3080 0 : a1_x = _mm256_castsi128_si256(a1_x128);
3081 : }
3082 : // y calc
3083 : __m128i a0_y, a1_y, shifty;
3084 0 : if (base_x < min_base_x) {
3085 : __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
3086 : DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
3087 0 : r6 = _mm_set1_epi16(r << 6);
3088 0 : dy128 = _mm_set1_epi16(dy);
3089 0 : c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
3090 0 : y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
3091 0 : base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
3092 0 : mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
3093 0 : base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
3094 : _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3095 :
3096 0 : a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3097 0 : left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3098 0 : a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
3099 0 : left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
3100 : 0, 0);
3101 :
3102 0 : if (upsample_left) {
3103 0 : shifty = _mm_srli_epi16(
3104 : _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
3105 : } else {
3106 0 : shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
3107 : }
3108 0 : a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
3109 0 : a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
3110 0 : shift = _mm256_inserti128_si256(shift, shifty, 1);
3111 : }
3112 :
3113 0 : diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
3114 0 : a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
3115 0 : a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
3116 :
3117 0 : b = _mm256_mullo_epi16(diff, shift);
3118 0 : res = _mm256_add_epi16(a32, b);
3119 0 : res = _mm256_srli_epi16(res, 5);
3120 :
3121 0 : resx = _mm256_castsi256_si128(res);
3122 0 : resy = _mm256_extracti128_si256(res, 1);
3123 : resxy =
3124 0 : _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
3125 0 : _mm_storel_epi64((__m128i *)(dst), resxy);
3126 0 : dst += stride;
3127 : }
3128 0 : }
3129 :
3130 0 : static void highbd_dr_prediction_z2_Nx8_avx2(int32_t N, uint16_t *dst, ptrdiff_t stride,
3131 : const uint16_t *above, const uint16_t *left,
3132 : int32_t upsample_above, int32_t upsample_left,
3133 : int32_t dx, int32_t dy) {
3134 0 : const int min_base_x = -(1 << upsample_above);
3135 0 : const int min_base_y = -(1 << upsample_left);
3136 0 : const int frac_bits_x = 6 - upsample_above;
3137 0 : const int frac_bits_y = 6 - upsample_left;
3138 :
3139 : // pre-filter above pixels
3140 : // store in temp buffers:
3141 : // above[x] * 32 + 16
3142 : // above[x+1] - above[x]
3143 : // final pixels will be calculated as:
3144 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3145 : __m128i c3f, min_base_y128;
3146 : __m256i a0_x, a1_x, diff, a32, a16;
3147 : __m128i a0_x128, a1_x128;
3148 :
3149 0 : a16 = _mm256_set1_epi16(16);
3150 0 : c3f = _mm_set1_epi16(0x3f);
3151 0 : min_base_y128 = _mm_set1_epi16(min_base_y);
3152 :
3153 0 : for (int r = 0; r < N; r++) {
3154 : __m256i b, res, shift;
3155 : __m128i resx, resy, resxy;
3156 0 : int y = r + 1;
3157 0 : int base_x = (-y * dx) >> frac_bits_x;
3158 0 : int base_shift = 0;
3159 0 : if (base_x < (min_base_x - 1)) {
3160 0 : base_shift = (min_base_x - base_x - 1) >> upsample_above;
3161 : }
3162 0 : int base_min_diff =
3163 0 : (min_base_x - base_x + upsample_above) >> upsample_above;
3164 0 : if (base_min_diff > 8) {
3165 0 : base_min_diff = 8;
3166 : } else {
3167 0 : if (base_min_diff < 0)
3168 0 : base_min_diff = 0;
3169 : }
3170 :
3171 0 : if (base_shift > 7) {
3172 0 : a0_x = _mm256_setzero_si256();
3173 0 : a1_x = _mm256_setzero_si256();
3174 0 : shift = _mm256_setzero_si256();
3175 : } else {
3176 0 : a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3177 0 : if (upsample_above) {
3178 : __m128i mask, atmp0, atmp1, atmp2, atmp3;
3179 0 : a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
3180 0 : atmp0 = _mm_shuffle_epi8(a0_x128,
3181 0 : *(__m128i *)HighbdEvenOddMaskx[base_shift]);
3182 0 : atmp1 = _mm_shuffle_epi8(a1_x128,
3183 0 : *(__m128i *)HighbdEvenOddMaskx[base_shift]);
3184 0 : atmp2 = _mm_shuffle_epi8(
3185 0 : a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
3186 0 : atmp3 = _mm_shuffle_epi8(
3187 0 : a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
3188 0 : mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
3189 : _mm_set1_epi8(15));
3190 0 : a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
3191 0 : mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
3192 : _mm_set1_epi8(15));
3193 0 : a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
3194 :
3195 0 : shift = _mm256_castsi128_si256(_mm_srli_epi16(
3196 : _mm_and_si128(
3197 : _mm_slli_epi16(
3198 0 : _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
3199 0 : (2 << 6) - y * dx, (3 << 6) - y * dx,
3200 0 : (4 << 6) - y * dx, (5 << 6) - y * dx,
3201 0 : (6 << 6) - y * dx, (7 << 6) - y * dx),
3202 : upsample_above),
3203 : c3f),
3204 : 1));
3205 : } else {
3206 0 : a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
3207 : a0_x128 =
3208 0 : _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
3209 : a1_x128 =
3210 0 : _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
3211 :
3212 0 : shift = _mm256_castsi128_si256(_mm_srli_epi16(
3213 0 : _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
3214 0 : (2 << 6) - y * dx, (3 << 6) - y * dx,
3215 0 : (4 << 6) - y * dx, (5 << 6) - y * dx,
3216 0 : (6 << 6) - y * dx, (7 << 6) - y * dx),
3217 : c3f),
3218 : 1));
3219 : }
3220 0 : a0_x = _mm256_castsi128_si256(a0_x128);
3221 0 : a1_x = _mm256_castsi128_si256(a1_x128);
3222 : }
3223 :
3224 : // y calc
3225 : __m128i a0_y, a1_y, shifty;
3226 0 : if (base_x < min_base_x) {
3227 : DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
3228 : __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
3229 0 : r6 = _mm_set1_epi16(r << 6);
3230 0 : dy128 = _mm_set1_epi16(dy);
3231 0 : c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3232 0 : y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
3233 0 : base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
3234 0 : mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
3235 0 : base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
3236 : _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3237 :
3238 0 : a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3239 0 : left[base_y_c[2]], left[base_y_c[3]],
3240 0 : left[base_y_c[4]], left[base_y_c[5]],
3241 0 : left[base_y_c[6]], left[base_y_c[7]]);
3242 0 : a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
3243 0 : left[base_y_c[2] + 1], left[base_y_c[3] + 1],
3244 0 : left[base_y_c[4] + 1], left[base_y_c[5] + 1],
3245 0 : left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
3246 :
3247 0 : if (upsample_left) {
3248 0 : shifty = _mm_srli_epi16(
3249 : _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
3250 : } else {
3251 0 : shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
3252 : }
3253 0 : a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
3254 0 : a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
3255 0 : shift = _mm256_inserti128_si256(shift, shifty, 1);
3256 : }
3257 :
3258 0 : diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
3259 0 : a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
3260 0 : a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
3261 :
3262 0 : b = _mm256_mullo_epi16(diff, shift);
3263 0 : res = _mm256_add_epi16(a32, b);
3264 0 : res = _mm256_srli_epi16(res, 5);
3265 :
3266 0 : resx = _mm256_castsi256_si128(res);
3267 0 : resy = _mm256_extracti128_si256(res, 1);
3268 :
3269 : resxy =
3270 0 : _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
3271 : _mm_storeu_si128((__m128i *)(dst), resxy);
3272 0 : dst += stride;
3273 : }
3274 0 : }
3275 :
3276 0 : static void highbd_dr_prediction_z2_Nx8_32bit_avx2(
3277 : int32_t N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
3278 : const uint16_t *left, int32_t upsample_above, int32_t upsample_left, int32_t dx,
3279 : int32_t dy) {
3280 0 : const int32_t min_base_x = -(1 << upsample_above);
3281 0 : const int32_t min_base_y = -(1 << upsample_left);
3282 0 : const int32_t frac_bits_x = 6 - upsample_above;
3283 0 : const int32_t frac_bits_y = 6 - upsample_left;
3284 :
3285 : // pre-filter above pixels
3286 : // store in temp buffers:
3287 : // above[x] * 32 + 16
3288 : // above[x+1] - above[x]
3289 : // final pixels will be calculated as:
3290 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3291 : __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
3292 : __m256i diff;
3293 : __m128i a0_x128, a1_x128;
3294 :
3295 0 : a16 = _mm256_set1_epi32(16);
3296 0 : c3f = _mm256_set1_epi32(0x3f);
3297 0 : min_base_y256 = _mm256_set1_epi32(min_base_y);
3298 :
3299 0 : for (int32_t r = 0; r < N; r++) {
3300 : __m256i b, res, shift;
3301 : __m128i resx, resy, resxy;
3302 0 : int32_t y = r + 1;
3303 0 : int32_t base_x = (-y * dx) >> frac_bits_x;
3304 0 : int32_t base_shift = 0;
3305 0 : if (base_x < (min_base_x - 1)) {
3306 0 : base_shift = (min_base_x - base_x - 1) >> upsample_above;
3307 : }
3308 0 : int32_t base_min_diff =
3309 0 : (min_base_x - base_x + upsample_above) >> upsample_above;
3310 0 : if (base_min_diff > 8) {
3311 0 : base_min_diff = 8;
3312 : }
3313 : else {
3314 0 : if (base_min_diff < 0) base_min_diff = 0;
3315 : }
3316 :
3317 0 : if (base_shift > 7) {
3318 0 : resx = _mm_setzero_si128();
3319 : } else {
3320 0 : a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3321 0 : if (upsample_above) {
3322 : __m128i mask, atmp0, atmp1, atmp2, atmp3;
3323 0 : a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
3324 0 : atmp0 = _mm_shuffle_epi8(a0_x128,
3325 0 : *(__m128i *)HighbdEvenOddMaskx[base_shift]);
3326 0 : atmp1 = _mm_shuffle_epi8(a1_x128,
3327 0 : *(__m128i *)HighbdEvenOddMaskx[base_shift]);
3328 0 : atmp2 = _mm_shuffle_epi8(
3329 0 : a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
3330 0 : atmp3 = _mm_shuffle_epi8(
3331 0 : a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
3332 0 : mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
3333 : _mm_set1_epi8(15));
3334 0 : a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
3335 0 : mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
3336 : _mm_set1_epi8(15));
3337 0 : a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
3338 0 : shift = _mm256_srli_epi32(
3339 : _mm256_and_si256(
3340 : _mm256_slli_epi32(
3341 0 : _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
3342 0 : (2 << 6) - y * dx, (3 << 6) - y * dx,
3343 0 : (4 << 6) - y * dx, (5 << 6) - y * dx,
3344 0 : (6 << 6) - y * dx, (7 << 6) - y * dx),
3345 : upsample_above),
3346 : c3f),
3347 : 1);
3348 : } else {
3349 0 : a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
3350 : a0_x128 =
3351 0 : _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
3352 : a1_x128 =
3353 0 : _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
3354 :
3355 0 : shift = _mm256_srli_epi32(
3356 : _mm256_and_si256(
3357 0 : _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
3358 0 : (3 << 6) - y * dx, (4 << 6) - y * dx,
3359 0 : (5 << 6) - y * dx, (6 << 6) - y * dx,
3360 0 : (7 << 6) - y * dx),
3361 : c3f),
3362 : 1);
3363 : }
3364 :
3365 0 : a0_x = _mm256_cvtepu16_epi32(a0_x128);
3366 0 : a1_x = _mm256_cvtepu16_epi32(a1_x128);
3367 :
3368 0 : diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
3369 0 : a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
3370 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
3371 :
3372 0 : b = _mm256_mullo_epi32(diff, shift);
3373 0 : res = _mm256_add_epi32(a32, b);
3374 0 : res = _mm256_srli_epi32(res, 5);
3375 :
3376 0 : resx = _mm256_castsi256_si128(_mm256_packus_epi32(
3377 0 : res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
3378 : }
3379 : // y calc
3380 0 : if (base_x < min_base_x) {
3381 : DECLARE_ALIGNED(32, int32_t, base_y_c[8]);
3382 : __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
3383 0 : r6 = _mm256_set1_epi32(r << 6);
3384 0 : dy256 = _mm256_set1_epi32(dy);
3385 0 : c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3386 0 : y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
3387 0 : base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
3388 0 : mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
3389 0 : base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
3390 : _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
3391 :
3392 0 : a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
3393 0 : left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
3394 0 : left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
3395 0 : left[base_y_c[6]], left[base_y_c[7]]));
3396 0 : a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
3397 0 : left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
3398 0 : left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
3399 0 : left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
3400 :
3401 0 : if (upsample_left) {
3402 0 : shift = _mm256_srli_epi32(
3403 : _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
3404 : 1);
3405 : }
3406 : else {
3407 0 : shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
3408 : }
3409 0 : diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
3410 0 : a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
3411 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
3412 :
3413 0 : b = _mm256_mullo_epi32(diff, shift);
3414 0 : res = _mm256_add_epi32(a32, b);
3415 0 : res = _mm256_srli_epi32(res, 5);
3416 :
3417 0 : resy = _mm256_castsi256_si128(_mm256_packus_epi32(
3418 0 : res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
3419 : }
3420 : else {
3421 0 : resy = resx;
3422 : }
3423 : resxy =
3424 0 : _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
3425 : _mm_storeu_si128((__m128i *)(dst), resxy);
3426 0 : dst += stride;
3427 : }
3428 0 : }
3429 :
3430 0 : static void highbd_dr_prediction_z2_HxW_avx2(
3431 : int32_t H, int32_t W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
3432 : const uint16_t *left, int32_t upsample_above, int32_t upsample_left, int32_t dx,
3433 : int32_t dy) {
3434 : // here upsample_above and upsample_left are 0 by design of
3435 : // av1_use_intra_edge_upsample
3436 0 : const int min_base_x = -1;
3437 0 : const int min_base_y = -1;
3438 : (void)upsample_above;
3439 : (void)upsample_left;
3440 0 : const int frac_bits_x = 6;
3441 0 : const int frac_bits_y = 6;
3442 :
3443 : // pre-filter above pixels
3444 : // store in temp buffers:
3445 : // above[x] * 32 + 16
3446 : // above[x+1] - above[x]
3447 : // final pixels will be calculated as:
3448 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3449 : __m256i a0_x, a1_x, a32, a16, c3f, c1;
3450 : __m256i diff, min_base_y256, dy256, c1234, c0123;
3451 : DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
3452 :
3453 0 : a16 = _mm256_set1_epi16(16);
3454 0 : c1 = _mm256_srli_epi16(a16, 4);
3455 0 : min_base_y256 = _mm256_set1_epi16(min_base_y);
3456 0 : c3f = _mm256_set1_epi16(0x3f);
3457 0 : dy256 = _mm256_set1_epi16(dy);
3458 : c0123 =
3459 0 : _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3460 0 : c1234 = _mm256_add_epi16(c0123, c1);
3461 :
3462 0 : for (int r = 0; r < H; r++) {
3463 : __m256i b, res, shift;
3464 : __m256i resx, resy, ydx;
3465 : __m256i resxy, j256, r6;
3466 : __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
3467 0 : int y = r + 1;
3468 0 : ydx = _mm256_set1_epi16(y * dx);
3469 :
3470 0 : for (int j = 0; j < W; j += 16) {
3471 0 : j256 = _mm256_set1_epi16(j);
3472 0 : int base_x = (-y * dx) >> frac_bits_x;
3473 0 : int base_shift = 0;
3474 0 : if ((base_x + j) < (min_base_x - 1)) {
3475 0 : base_shift = (min_base_x - (base_x + j) - 1);
3476 : }
3477 0 : int base_min_diff = (min_base_x - base_x - j);
3478 0 : if (base_min_diff > 16) {
3479 0 : base_min_diff = 16;
3480 : } else {
3481 0 : if (base_min_diff < 0)
3482 0 : base_min_diff = 0;
3483 : }
3484 :
3485 0 : if (base_shift < 8) {
3486 0 : a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
3487 : a1_x128 =
3488 0 : _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
3489 : a0_x128 =
3490 0 : _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
3491 : a1_x128 =
3492 0 : _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
3493 :
3494 0 : a0_x = _mm256_castsi128_si256(a0_x128);
3495 0 : a1_x = _mm256_castsi128_si256(a1_x128);
3496 : } else {
3497 0 : a0_x = _mm256_setzero_si256();
3498 0 : a1_x = _mm256_setzero_si256();
3499 : }
3500 :
3501 0 : int base_shift1 = 0;
3502 0 : if (base_shift > 8) {
3503 0 : base_shift1 = base_shift - 8;
3504 : }
3505 0 : if (base_shift1 < 8) {
3506 : a0_1_x128 =
3507 0 : _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8 + j));
3508 : a1_1_x128 =
3509 0 : _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9 + j));
3510 0 : a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
3511 0 : *(__m128i *)HighbdLoadMaskx[base_shift1]);
3512 0 : a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
3513 0 : *(__m128i *)HighbdLoadMaskx[base_shift1]);
3514 :
3515 0 : a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
3516 0 : a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
3517 : }
3518 0 : r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
3519 0 : shift = _mm256_srli_epi16(
3520 : _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
3521 :
3522 0 : diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
3523 0 : a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
3524 0 : a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
3525 :
3526 0 : b = _mm256_mullo_epi16(diff, shift);
3527 0 : res = _mm256_add_epi16(a32, b);
3528 0 : resx = _mm256_srli_epi16(res, 5); // 16 16-bit values
3529 :
3530 : // y calc
3531 0 : resy = _mm256_setzero_si256();
3532 : __m256i a0_y, a1_y, shifty;
3533 0 : if ((base_x < min_base_x)) {
3534 : __m256i c256, y_c256, base_y_c256, mask256, mul16;
3535 0 : r6 = _mm256_set1_epi16(r << 6);
3536 0 : c256 = _mm256_add_epi16(j256, c1234);
3537 0 : mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
3538 : _mm256_srli_epi16(min_base_y256, 1));
3539 0 : y_c256 = _mm256_sub_epi16(r6, mul16);
3540 0 : base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
3541 0 : mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
3542 0 : base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
3543 : _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
3544 :
3545 0 : a0_y = _mm256_setr_epi16(
3546 0 : left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
3547 0 : left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
3548 0 : left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
3549 0 : left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
3550 0 : left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
3551 0 : left[base_y_c[15]]);
3552 0 : base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
3553 : _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
3554 :
3555 0 : a1_y = _mm256_setr_epi16(
3556 0 : left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
3557 0 : left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
3558 0 : left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
3559 0 : left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
3560 0 : left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
3561 0 : left[base_y_c[15]]);
3562 :
3563 0 : shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
3564 :
3565 0 : diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
3566 0 : a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
3567 0 : a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
3568 :
3569 0 : b = _mm256_mullo_epi16(diff, shifty);
3570 0 : res = _mm256_add_epi16(a32, b);
3571 0 : resy = _mm256_srli_epi16(res, 5);
3572 : }
3573 :
3574 0 : resxy = _mm256_blendv_epi8(resx, resy,
3575 0 : *(__m256i *)HighbdBaseMask[base_min_diff]);
3576 0 : _mm256_storeu_si256((__m256i *)(dst + j), resxy);
3577 : } // for j
3578 0 : dst += stride;
3579 : }
3580 0 : }
3581 :
3582 0 : static void highbd_dr_prediction_z2_HxW_32bit_avx2(
3583 : int32_t H, int32_t W, uint16_t *dst, ptrdiff_t stride,
3584 : const uint16_t *above, const uint16_t *left, int32_t upsample_above,
3585 : int32_t upsample_left, int32_t dx, int32_t dy) {
3586 : // here upsample_above and upsample_left are 0 by design of
3587 : // av1_use_intra_edge_upsample
3588 0 : const int32_t min_base_x = -1;
3589 0 : const int32_t min_base_y = -1;
3590 : (void)upsample_above;
3591 : (void)upsample_left;
3592 0 : const int32_t frac_bits_x = 6;
3593 0 : const int32_t frac_bits_y = 6;
3594 :
3595 : // pre-filter above pixels
3596 : // store in temp buffers:
3597 : // above[x] * 32 + 16
3598 : // above[x+1] - above[x]
3599 : // final pixels will be calculated as:
3600 : // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3601 : __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16;
3602 : __m256i diff, min_base_y256, c3f;
3603 : __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
3604 :
3605 0 : a16 = _mm256_set1_epi32(16);
3606 0 : min_base_y256 = _mm256_set1_epi32(min_base_y); // compared with epi32 ops below
3607 0 : c3f = _mm256_set1_epi32(0x3f);
3608 0 : for (int32_t r = 0; r < H; r++) {
3609 : __m256i b, res, shift;
3610 : __m256i resx[2], resy[2];
3611 : __m256i resxy;
3612 0 : for (int32_t j = 0; j < W; j += 16) {
3613 0 : int32_t y = r + 1;
3614 0 : int32_t base_x = (-y * dx) >> frac_bits_x;
3615 0 : int32_t base_shift = 0;
3616 0 : if ((base_x + j) < (min_base_x - 1)) {
3617 0 : base_shift = (min_base_x - (base_x + j) - 1);
3618 : }
3619 0 : int32_t base_min_diff = (min_base_x - base_x - j);
3620 0 : if (base_min_diff > 16) {
3621 0 : base_min_diff = 16;
3622 : }
3623 : else {
3624 0 : if (base_min_diff < 0) base_min_diff = 0;
3625 : }
3626 :
3627 0 : if (base_shift > 7) {
3628 0 : resx[0] = _mm256_setzero_si256();
3629 : }
3630 : else {
3631 0 : a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
3632 : a1_x128 =
3633 0 : _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
3634 : a0_x128 =
3635 0 : _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
3636 : a1_x128 =
3637 0 : _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
3638 :
3639 0 : a0_x = _mm256_cvtepu16_epi32(a0_x128);
3640 0 : a1_x = _mm256_cvtepu16_epi32(a1_x128);
3641 :
3642 0 : shift = _mm256_srli_epi32(
3643 : _mm256_and_si256(
3644 : _mm256_setr_epi32(
3645 0 : ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
3646 0 : ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
3647 0 : ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
3648 0 : ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
3649 : _mm256_set1_epi32(0x3f)),
3650 : 1);
3651 :
3652 0 : diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
3653 0 : a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
3654 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
3655 :
3656 0 : b = _mm256_mullo_epi32(diff, shift);
3657 0 : res = _mm256_add_epi32(a32, b);
3658 0 : res = _mm256_srli_epi32(res, 5);
3659 :
3660 0 : resx[0] = _mm256_packus_epi32(
3661 0 : res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
3662 : }
3663 0 : base_shift = 0;
3664 0 : if ((base_x + j + 8) < (min_base_x - 1)) {
3665 0 : base_shift = (min_base_x - (base_x + j + 8) - 1);
3666 : }
3667 0 : if (base_shift > 7) {
3668 0 : resx[1] = _mm256_setzero_si256();
3669 : }
3670 : else {
3671 : a0_1_x128 =
3672 0 : _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
3673 : a1_1_x128 =
3674 0 : _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
3675 0 : a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
3676 0 : *(__m128i *)HighbdLoadMaskx[base_shift]);
3677 0 : a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
3678 0 : *(__m128i *)HighbdLoadMaskx[base_shift]);
3679 :
3680 0 : a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
3681 0 : a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
3682 :
3683 0 : shift = _mm256_srli_epi32(
3684 : _mm256_and_si256(
3685 : _mm256_setr_epi32(
3686 0 : ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
3687 0 : ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
3688 0 : ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
3689 0 : ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
3690 : _mm256_set1_epi32(0x3f)),
3691 : 1);
3692 :
3693 0 : diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x]
3694 0 : a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32
3695 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
3696 0 : b = _mm256_mullo_epi32(diff, shift);
3697 :
3698 0 : resx[1] = _mm256_add_epi32(a32, b);
3699 0 : resx[1] = _mm256_srli_epi32(resx[1], 5);
3700 0 : resx[1] = _mm256_packus_epi32(
3701 : resx[1],
3702 0 : _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
3703 : }
3704 0 : resx[0] =
3705 0 : _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
3706 : 1); // 16 16bit values
3707 :
3708 : // y calc
3709 0 : if ((base_x < min_base_x)) {
3710 : DECLARE_ALIGNED(32, int32_t, base_y_c[16]);
3711 : __m256i r6, c256, dy256, y_c256, y_c_1_256, base_y_c256, mask256;
3712 0 : r6 = _mm256_set1_epi32(r << 6);
3713 0 : dy256 = _mm256_set1_epi32(dy);
3714 0 : c256 = _mm256_setr_epi32(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
3715 : 7 + j, 8 + j);
3716 0 : y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
3717 0 : base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
3718 0 : mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
3719 0 : base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
3720 : _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
3721 0 : c256 = _mm256_setr_epi32(9 + j, 10 + j, 11 + j, 12 + j, 13 + j, 14 + j,
3722 : 15 + j, 16 + j);
3723 0 : y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
3724 0 : base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
3725 0 : mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
3726 0 : base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
3727 0 : _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
3728 :
3729 0 : a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
3730 0 : left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
3731 0 : left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
3732 0 : left[base_y_c[6]], left[base_y_c[7]]));
3733 0 : a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
3734 0 : left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
3735 0 : left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
3736 0 : left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
3737 :
3738 0 : shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
3739 :
3740 0 : diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
3741 0 : a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
3742 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
3743 :
3744 0 : b = _mm256_mullo_epi32(diff, shift);
3745 0 : res = _mm256_add_epi32(a32, b);
3746 0 : res = _mm256_srli_epi32(res, 5);
3747 :
3748 0 : resy[0] = _mm256_packus_epi32(
3749 0 : res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
3750 :
3751 0 : a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
3752 0 : left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
3753 0 : left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
3754 0 : left[base_y_c[14]], left[base_y_c[15]]));
3755 0 : a1_y = _mm256_cvtepu16_epi32(
3756 0 : _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
3757 0 : left[base_y_c[10] + 1], left[base_y_c[11] + 1],
3758 0 : left[base_y_c[12] + 1], left[base_y_c[13] + 1],
3759 0 : left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
3760 0 : shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
3761 :
3762 0 : diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
3763 0 : a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
3764 0 : a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
3765 :
3766 0 : b = _mm256_mullo_epi32(diff, shift);
3767 0 : res = _mm256_add_epi32(a32, b);
3768 0 : res = _mm256_srli_epi32(res, 5);
3769 :
3770 0 : resy[1] = _mm256_packus_epi32(
3771 0 : res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
3772 :
3773 0 : resy[0] =
3774 0 : _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
3775 : 1); // 16 16bit values
3776 : }
3777 : else {
3778 0 : resy[0] = resx[0];
3779 : }
3780 0 : resxy = _mm256_blendv_epi8(resx[0], resy[0],
3781 0 : *(__m256i *)HighbdBaseMask[base_min_diff]);
3782 0 : _mm256_storeu_si256((__m256i *)(dst + j), resxy);
3783 : } // for j
3784 0 : dst += stride;
3785 : }
3786 0 : }
3787 :
3788 : // Directional prediction, zone 2: 90 < angle < 180
3789 0 : void eb_av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int32_t bw,
3790 : int32_t bh, const uint16_t *above, const uint16_t *left, int32_t upsample_above,
3791 : int32_t upsample_left, int32_t dx, int32_t dy, int32_t bd) {
3792 : (void)bd;
3793 0 : assert(dx > 0);
3794 0 : assert(dy > 0);
3795 0 : switch (bw) {
3796 0 : case 4:
3797 0 : highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
3798 : upsample_above, upsample_left, dx, dy);
3799 0 : break;
3800 0 : case 8:
3801 0 : if (bd < 12) {
3802 0 : highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
3803 : upsample_above, upsample_left, dx, dy);
3804 : } else {
3805 0 : highbd_dr_prediction_z2_Nx8_32bit_avx2(
3806 : bh, dst, stride, above, left, upsample_above, upsample_left, dx, dy);
3807 : }
3808 0 : break;
3809 0 : default:
3810 0 : if (bd < 12) {
3811 0 : highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
3812 : upsample_above, upsample_left, dx, dy);
3813 : } else {
3814 0 : highbd_dr_prediction_z2_HxW_32bit_avx2(bh, bw, dst, stride, above, left,
3815 : upsample_above, upsample_left, dx,
3816 : dy);
3817 : }
3818 0 : break;
3819 : }
3820 0 : return;
3821 : }
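
The dispatch above mirrors the arithmetic headroom of the kernels: the plain Nx8/HxW variants keep the whole interpolation in 16-bit lanes, which is only safe while the largest intermediate fits in a lane, so 12-bit content is routed to the *_32bit variants that widen to 32-bit lanes and pack back with _mm256_packus_epi32. A rough bound, since the interpolation equals a0 * (32 - shift) + a1 * shift + 16:

    max intermediate = (2^bd - 1) * 32 + 16
                     =  32,752 for bd = 10   /* fits a 16-bit lane */
                     = 131,056 for bd = 12   /* needs 32-bit lanes */
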
3822 :
3823 0 : static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
3824 : uint16_t *dst, ptrdiff_t pitchDst) {
3825 : __m256i r[16];
3826 : __m256i d[16];
3827 0 : for (int j = 0; j < 16; j++) {
3828 0 : r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
3829 : }
3830 0 : transpose_16bit_16x16_avx2(r, d);
3831 0 : for (int j = 0; j < 16; j++) {
3832 0 : _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
3833 : }
3834 0 : }
3835 :
3836 0 : static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
3837 : uint16_t *dst, ptrdiff_t pitchDst, int32_t width,
3838 : int32_t height) {
3839 0 : for (int j = 0; j < height; j += 16)
3840 0 : for (int i = 0; i < width; i += 16)
3841 0 : highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
3842 0 : dst + j * pitchDst + i, pitchDst);
3843 0 : }
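
highbd_transpose walks the block in 16x16 tiles, transposing each tile in registers with transpose_16bit_16x16_avx2 and writing it to the mirrored tile position, so the zone-3 callers below only hand it dimensions that are multiples of 16. The overall element mapping is:

    /* for 0 <= y < height, 0 <= x < width */
    dst[y * pitchDst + x] = src[x * pitchSrc + y];
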
3844 :
3845 0 : static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
3846 : const uint16_t *left,
3847 : int32_t upsample_left, int32_t dy) {
3848 : __m128i dstvec[4], d[4];
3849 :
3850 : highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy);
3851 0 : highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
3852 : &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
3853 0 : _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
3854 0 : _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
3855 0 : _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
3856 0 : _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
3857 0 : return;
3858 : }
3859 :
3860 0 : static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
3861 : const uint16_t *left,
3862 : int32_t upsample_left, int32_t dy) {
3863 : __m128i dstvec[8], d[8];
3864 :
3865 : highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy);
3866 0 : highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
3867 : &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
3868 : &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
3869 : &d[7]);
3870 0 : for (int32_t i = 0; i < 8; i++) {
3871 0 : _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3872 : }
3873 0 : }
3874 :
3875 0 : static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
3876 : const uint16_t *left,
3877 : int32_t upsample_left, int32_t dy) {
3878 : __m128i dstvec[4], d[8];
3879 :
3880 : highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy);
3881 0 : highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
3882 : &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
3883 : &d[7]);
3884 0 : for (int32_t i = 0; i < 8; i++) {
3885 0 : _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
3886 : }
3887 0 : }
3888 :
3889 0 : static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
3890 : const uint16_t *left,
3891 : int32_t upsample_left, int32_t dy) {
3892 : __m128i dstvec[8], d[4];
3893 :
3894 : highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy);
3895 0 : highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
3896 : &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
3897 : &d[0], &d[1], &d[2], &d[3]);
3898 0 : _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
3899 0 : _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
3900 0 : _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
3901 0 : _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
3902 0 : }
3903 :
3904 0 : static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
3905 : const uint16_t *left,
3906 : int32_t upsample_left, int32_t dy) {
3907 : __m256i dstvec[8], d[16];
3908 :
3909 : highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
3910 : dy);
3911 0 : highbd_transpose8x16_16x8_avx2(dstvec, d);
3912 0 : for (int32_t i = 0; i < 8; i++) {
3913 0 : _mm_storeu_si128((__m128i *)(dst + i * stride),
3914 : _mm256_castsi256_si128(d[i]));
3915 : }
3916 0 : for (int32_t i = 8; i < 16; i++) {
3917 0 : _mm_storeu_si128((__m128i *)(dst + i * stride),
3918 0 : _mm256_extracti128_si256(d[i - 8], 1));
3919 : }
3920 0 : }
3921 :
3922 0 : static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
3923 : const uint16_t *left,
3924 : int32_t upsample_left, int32_t dy) {
3925 : __m128i dstvec[16], d[16];
3926 :
3927 : highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
3928 : dy);
3929 0 : for (int32_t i = 0; i < 16; i += 8) {
3930 0 : highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3931 0 : &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3932 0 : &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3933 0 : &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3934 0 : &d[5 + i], &d[6 + i], &d[7 + i]);
3935 : }
3936 0 : for (int32_t i = 0; i < 8; i++) {
3937 0 : _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3938 0 : _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3939 : }
3940 0 : }
3941 :
3942 0 : static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
3943 : const uint16_t *left,
3944 : int32_t upsample_left, int32_t dy) {
3945 : __m256i dstvec[4], d[4], d1;
3946 :
3947 : highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
3948 : dy);
3949 0 : highbd_transpose4x16_avx2(dstvec, d);
3950 0 : for (int32_t i = 0; i < 4; i++) {
3951 0 : _mm_storel_epi64((__m128i *)(dst + i * stride),
3952 : _mm256_castsi256_si128(d[i]));
3953 0 : d1 = _mm256_srli_si256(d[i], 8);
3954 0 : _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
3955 : _mm256_castsi256_si128(d1));
3956 0 : _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
3957 0 : _mm256_extracti128_si256(d[i], 1));
3958 0 : _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
3959 0 : _mm256_extracti128_si256(d1, 1));
3960 : }
3961 0 : }
3962 :
3963 0 : static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
3964 : const uint16_t *left,
3965 : int32_t upsample_left, int32_t dy) {
3966 : __m128i dstvec[16], d[8];
3967 :
3968 : highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
3969 : dy);
3970 0 : highbd_transpose16x4_8x8_sse2(dstvec, d);
3971 :
3972 0 : _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
3973 0 : _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
3974 0 : _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
3975 0 : _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
3976 0 : _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
3977 0 : _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
3978 0 : _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
3979 0 : _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
3980 0 : }
3981 :
3982 0 : static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
3983 : const uint16_t *left,
3984 : int32_t upsample_left, int32_t dy) {
3985 : __m256i dstvec[16], d[16];
3986 :
3987 : highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
3988 : dy);
3989 0 : for (int32_t i = 0; i < 16; i += 8) {
3990 0 : highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3991 : }
3992 :
3993 0 : for (int32_t i = 0; i < 8; i++) {
3994 0 : _mm_storeu_si128((__m128i *)(dst + i * stride),
3995 : _mm256_castsi256_si128(d[i]));
3996 : }
3997 0 : for (int32_t i = 0; i < 8; i++) {
3998 0 : _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3999 0 : _mm256_extracti128_si256(d[i], 1));
4000 : }
4001 0 : for (int32_t i = 8; i < 16; i++) {
4002 0 : _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
4003 : _mm256_castsi256_si128(d[i]));
4004 : }
4005 0 : for (int32_t i = 8; i < 16; i++) {
4006 0 : _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
4007 0 : _mm256_extracti128_si256(d[i], 1));
4008 : }
4009 0 : }
4010 :
4011 0 : static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
4012 : const uint16_t *left,
4013 : int32_t upsample_left, int32_t dy) {
4014 : __m128i dstvec[32], d[32];
4015 :
4016 : highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
4017 : dy);
4018 0 : for (int32_t i = 0; i < 32; i += 8) {
4019 0 : highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
4020 0 : &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
4021 0 : &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
4022 0 : &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
4023 0 : &d[5 + i], &d[6 + i], &d[7 + i]);
4024 : }
4025 0 : for (int32_t i = 0; i < 8; i++) {
4026 0 : _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4027 0 : _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
4028 0 : _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
4029 0 : _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
4030 : }
4031 0 : }
4032 :
4033 0 : static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
4034 : const uint16_t *left,
4035 : int32_t upsample_left, int32_t dy) {
4036 : __m256i dstvec[16], d[16];
4037 :
4038 : highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
4039 : dy);
4040 0 : transpose_16bit_16x16_avx2(dstvec, d);
4041 :
4042 0 : for (int32_t i = 0; i < 16; i++) {
4043 0 : _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
4044 : }
4045 0 : }
4046 :
4047 0 : static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
4048 : const uint16_t *left,
4049 : int32_t upsample_left, int32_t dy) {
4050 : __m256i dstvec[64], d[16];
4051 :
4052 : highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
4053 : dy);
4054 :
4055 0 : transpose_16bit_16x16_avx2(dstvec, d);
4056 0 : for (int32_t j = 0; j < 16; j++) {
4057 0 : _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
4058 : }
4059 0 : transpose_16bit_16x16_avx2(dstvec + 16, d);
4060 0 : for (int32_t j = 0; j < 16; j++) {
4061 0 : _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
4062 : }
4063 0 : transpose_16bit_16x16_avx2(dstvec + 32, d);
4064 0 : for (int32_t j = 0; j < 16; j++) {
4065 0 : _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
4066 : }
4067 0 : transpose_16bit_16x16_avx2(dstvec + 48, d);
4068 0 : for (int32_t j = 0; j < 16; j++) {
4069 0 : _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
4070 : }
4071 0 : }
4072 :
4073 0 : static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
4074 : const uint16_t *left,
4075 : int32_t upsample_left, int32_t dy) {
4076 : DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
4077 0 : highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
4078 0 : highbd_transpose(dstT, 64, dst, stride, 64, 64);
4079 0 : }
4080 :
4081 0 : static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
4082 : const uint16_t *left,
4083 : int32_t upsample_left, int32_t dy) {
4084 : __m256i dstvec[32], d[32];
4085 :
4086 : highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
4087 : dy);
4088 0 : for (int32_t i = 0; i < 32; i += 8) {
4089 0 : highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
4090 : }
4091 : // store
4092 0 : for (int32_t j = 0; j < 32; j += 16) {
4093 0 : for (int32_t i = 0; i < 8; i++) {
4094 0 : _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
4095 0 : _mm256_castsi256_si128(d[(i + j)]));
4096 : }
4097 0 : for (int32_t i = 0; i < 8; i++) {
4098 0 : _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
4099 0 : _mm256_castsi256_si128(d[(i + j) + 8]));
4100 : }
4101 0 : for (int32_t i = 8; i < 16; i++) {
4102 : _mm256_storeu_si256(
4103 0 : (__m256i *)(dst + (i + j) * stride),
4104 0 : _mm256_inserti128_si256(
4105 : d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
4106 : }
4107 : }
4108 0 : }
4109 :
4110 0 : static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
4111 : const uint16_t *left,
4112 : int32_t upsample_left, int32_t dy) {
4113 : __m256i dstvec[32], d[16];
4114 :
4115 : highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
4116 : dy);
4117 0 : for (int32_t i = 0; i < 32; i += 16) {
4118 0 : transpose_16bit_16x16_avx2((dstvec + i), d);
4119 0 : for (int32_t j = 0; j < 16; j++) {
4120 0 : _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
4121 : }
4122 : }
4123 0 : }
4124 :
4125 0 : static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
4126 : const uint16_t *left,
4127 : int32_t upsample_left, int32_t dy) {
4128 : uint16_t dstT[64 * 32];
4129 0 : highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
4130 0 : highbd_transpose(dstT, 64, dst, stride, 32, 64);
4131 0 : }
4132 :
4133 0 : static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
4134 : const uint16_t *left,
4135 : int32_t upsample_left, int32_t dy) {
4136 : DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
4137 0 : highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
4138 0 : highbd_transpose(dstT, 32, dst, stride, 64, 32);
4139 0 : return;
4140 : }
4141 :
4142 0 : static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
4143 : const uint16_t *left,
4144 : int32_t upsample_left, int32_t dy) {
4145 : DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
4146 0 : highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
4147 0 : highbd_transpose(dstT, 64, dst, stride, 16, 64);
4148 0 : }
4149 :
4150 0 : static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
4151 : const uint16_t *left,
4152 : int32_t upsample_left, int32_t dy) {
4153 : __m256i dstvec[64], d[16];
4154 :
4155 : highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
4156 : dy);
4157 0 : for (int32_t i = 0; i < 64; i += 16) {
4158 0 : transpose_16bit_16x16_avx2((dstvec + i), d);
4159 0 : for (int32_t j = 0; j < 16; j++) {
4160 0 : _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
4161 : }
4162 : }
4163 0 : }
4164 :
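/*
 * Dispatcher for the high-bitdepth z3 directional predictor. The z3 kernels
 * above all follow the same pattern: run the matching z1 kernel along the
 * left edge (i.e. predict the transposed block) and then transpose the
 * result into dst, either register-by-register or via highbd_transpose().
 * This function only selects the kernel for the requested bw x bh.
 */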
4165 0 : void eb_av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int32_t bw,
4166 : int32_t bh, const uint16_t *above,
4167 : const uint16_t *left, int32_t upsample_left,
4168 : int32_t dx, int32_t dy, int32_t bd) {
4169 : (void)above;
4170 : (void)dx;
4171 : (void)bd;
4172 0 : assert(dx == 1);
4173 0 : assert(dy > 0);
4174 0 : if (bw == bh) {
4175 0 : switch (bw) {
4176 0 : case 4:
4177 0 : highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
4178 0 : break;
4179 0 : case 8:
4180 0 : highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
4181 0 : break;
4182 0 : case 16:
4183 0 : highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left,
4184 : dy);
4185 0 : break;
4186 0 : case 32:
4187 0 : highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left,
4188 : dy);
4189 0 : break;
4190 0 : case 64:
4191 0 : highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left,
4192 : dy);
4193 0 : break;
4194 : }
4195 0 : }
4196 : else {
4197 0 : if (bw < bh) {
4198 0 : if (bw + bw == bh) {
4199 0 : switch (bw) {
4200 0 : case 4:
4201 0 : highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
4202 : dy);
4203 0 : break;
4204 0 : case 8:
4205 0 : highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
4206 : dy);
4207 0 : break;
4208 0 : case 16:
4209 0 : highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
4210 : dy);
4211 0 : break;
4212 0 : case 32:
4213 0 : highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
4214 : dy);
4215 0 : break;
4216 : }
4217 0 : }
4218 : else {
4219 0 : switch (bw) {
4220 0 : case 4:
4221 0 : highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
4222 : dy);
4223 0 : break;
4224 0 : case 8:
4225 0 : highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
4226 : dy);
4227 0 : break;
4228 0 : case 16:
4229 0 : highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
4230 : dy);
4231 0 : break;
4232 : }
4233 0 : }
4234 : }
4235 : else {
4236 0 : if (bh + bh == bw) {
4237 0 : switch (bh) {
4238 0 : case 4:
4239 0 : highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
4240 : dy);
4241 0 : break;
4242 0 : case 8:
4243 0 : highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
4244 : dy);
4245 0 : break;
4246 0 : case 16:
4247 0 : highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
4248 : dy);
4249 0 : break;
4250 0 : case 32:
4251 0 : highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
4252 : dy);
4253 0 : break;
4254 : }
4255 0 : }
4256 : else {
4257 0 : switch (bh) {
4258 0 : case 4:
4259 0 : highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
4260 : dy);
4261 0 : break;
4262 0 : case 8:
4263 0 : highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
4264 : dy);
4265 0 : break;
4266 0 : case 16:
4267 0 : highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
4268 : dy);
4269 0 : break;
4270 : }
4271 0 : }
4272 : }
4273 : }
4274 0 : return;
4275 : }
4276 :
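/*
 * Scalar reference for the Paeth predictor that the SIMD mask logic below
 * implements. This helper is an illustrative sketch added for documentation
 * only; it is not part of the original library source and is never called.
 */
static INLINE uint16_t paeth_pred_scalar_sketch(uint16_t left, uint16_t top,
                                                uint16_t topleft) {
    int pl = (int)top - (int)topleft;   // cost of predicting from 'left'
    int pt = (int)left - (int)topleft;  // cost of predicting from 'top'
    int ptl = pl + pt;                  // cost of predicting from 'topleft'
    if (pl < 0) pl = -pl;
    if (pt < 0) pt = -pt;
    if (ptl < 0) ptl = -ptl;
    if (pl <= pt && pl <= ptl) return left;  // closest to left + top - topleft
    if (pt <= ptl) return top;
    return topleft;
}
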
4277 4223800 : static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
4278 : const __m256i *topleft) {
4279 4223800 : __m256i pl = _mm256_sub_epi16(*top, *topleft);
4280 8447600 : __m256i pt = _mm256_sub_epi16(*left, *topleft);
4281 8447600 : __m256i ptl = _mm256_abs_epi16(_mm256_add_epi16(pl, pt));
4282 4223800 : pl = _mm256_abs_epi16(pl);
4283 4223800 : pt = _mm256_abs_epi16(pt);
4284 :
4285 4223800 : __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
4286 8447600 : mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
4287 4223800 : __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
4288 :
4289 4223800 : pl = _mm256_andnot_si256(mask1, *left);
4290 :
4291 4223800 : ptl = _mm256_and_si256(mask2, *topleft);
4292 8447600 : pt = _mm256_andnot_si256(mask2, *top);
4293 4223800 : pt = _mm256_or_si256(pt, ptl);
4294 4223800 : pt = _mm256_and_si256(mask1, pt);
4295 :
4296 4223800 : return _mm256_or_si256(pt, pl);
4297 : }
4298 :
4299 : // Return 16 8-bit pixels in one row (__m128i)
4300 3904150 : static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
4301 : const __m256i *topleft) {
4302 3904150 : const __m256i p0 = paeth_pred(left, top, topleft);
4303 3904180 : const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
4304 3904180 : const __m256i p = _mm256_packus_epi16(p0, p1);
4305 3904180 : return _mm256_castsi256_si128(p);
4306 : }
4307 :
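// Load 16 8-bit pixels from 'above' and zero-extend them to 16 bits: the low
// 128-bit lane holds pixels 0-7, the high lane pixels 8-15.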
4308 153841 : static INLINE __m256i get_top_vector(const uint8_t *above) {
4309 153841 : const __m128i x = _mm_load_si128((const __m128i *)above);
4310 153841 : const __m128i zero = _mm_setzero_si128();
4311 153841 : const __m128i t0 = _mm_unpacklo_epi8(x, zero);
4312 153841 : const __m128i t1 = _mm_unpackhi_epi8(x, zero);
4313 153841 : return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
4314 : }
4315 :
4316 16758 : void eb_aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
4317 : const uint8_t *above, const uint8_t *left) {
4318 16758 : __m128i x = _mm_loadl_epi64((const __m128i *)left);
4319 16758 : const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
4320 33516 : const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
4321 16758 : __m256i rep = _mm256_set1_epi16(0x8000);
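    // Broadcast trick: every 16-bit lane of rep is 0x8000, i.e. the control
    // bytes (0x00, 0x80). _mm256_shuffle_epi8 copies byte 0 of each 128-bit
    // lane of l into the low result byte and writes 0 where the control byte
    // has its top bit set, so each loop iteration below broadcasts one left
    // pixel, zero-extended to 16 bits, into every lane; adding 1 to rep
    // advances the selector to the next left pixel.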
4322 16758 : const __m256i one = _mm256_set1_epi16(1);
4323 16758 : const __m256i top = get_top_vector(above);
4324 :
4325 : int i;
4326 150820 : for (i = 0; i < 8; ++i) {
4327 134062 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4328 134062 : const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
4329 :
4330 : _mm_storeu_si128((__m128i *)dst, row);
4331 134062 : dst += stride;
4332 134062 : rep = _mm256_add_epi16(rep, one);
4333 : }
4334 16758 : }
4335 :
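// Load 16 pixels from 'left' and duplicate them into both 128-bit lanes so
// that the per-lane byte shuffle used by the predictors selects the same
// pixel from either half of the register.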
4336 139790 : static INLINE __m256i get_left_vector(const uint8_t *left) {
4337 139790 : const __m128i x = _mm_load_si128((const __m128i *)left);
4338 139790 : return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
4339 : }
4340 :
4341 32909 : void eb_aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
4342 : const uint8_t *above, const uint8_t *left) {
4343 32909 : const __m256i l = get_left_vector(left);
4344 65818 : const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
4345 32909 : __m256i rep = _mm256_set1_epi16(0x8000);
4346 32909 : const __m256i one = _mm256_set1_epi16(1);
4347 32909 : const __m256i top = get_top_vector(above);
4348 :
4349 : int i;
4350 559437 : for (i = 0; i < 16; ++i) {
4351 526525 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4352 526525 : const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
4353 :
4354 : _mm_storeu_si128((__m128i *)dst, row);
4355 526529 : dst += stride;
4356 526529 : rep = _mm256_add_epi16(rep, one);
4357 : }
4358 32912 : }
4359 :
4360 9193 : void eb_aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
4361 : const uint8_t *above, const uint8_t *left) {
4362 9193 : __m256i l = get_left_vector(left);
4363 18386 : const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
4364 9193 : __m256i rep = _mm256_set1_epi16(0x8000);
4365 9193 : const __m256i one = _mm256_set1_epi16(1);
4366 9193 : const __m256i top = get_top_vector(above);
4367 :
4368 : int i;
4369 156281 : for (i = 0; i < 16; ++i) {
4370 147088 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4371 147088 : const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
4372 :
4373 : _mm_storeu_si128((__m128i *)dst, row);
4374 147088 : dst += stride;
4375 147088 : rep = _mm256_add_epi16(rep, one);
4376 : }
4377 :
4378 9193 : l = get_left_vector(left + 16);
4379 9193 : rep = _mm256_set1_epi16(0x8000);
4380 156281 : for (i = 0; i < 16; ++i) {
4381 147088 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4382 147088 : const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
4383 :
4384 : _mm_storeu_si128((__m128i *)dst, row);
4385 147088 : dst += stride;
4386 147088 : rep = _mm256_add_epi16(rep, one);
4387 : }
4388 9193 : }
4389 :
4390 3222 : void eb_aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
4391 : const uint8_t *above, const uint8_t *left) {
4392 6444 : const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
4393 3222 : const __m256i one = _mm256_set1_epi16(1);
4394 3222 : const __m256i top = get_top_vector(above);
4395 :
4396 16110 : for (int j = 0; j < 4; ++j) {
4397 12888 : const __m256i l = get_left_vector(left + j * 16);
4398 12888 : __m256i rep = _mm256_set1_epi16(0x8000);
4399 219096 : for (int i = 0; i < 16; ++i) {
4400 206208 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4401 206208 : const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
4402 :
4403 : _mm_storeu_si128((__m128i *)dst, row);
4404 206208 : dst += stride;
4405 206208 : rep = _mm256_add_epi16(rep, one);
4406 : }
4407 : }
4408 3222 : }
4409 :
4410 : // Return 32 8-bit pixels in one row (__m256i)
4411 159904 : static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
4412 : const __m256i *top1,
4413 : const __m256i *topleft) {
4414 159904 : __m256i p0 = paeth_pred(left, top0, topleft);
4415 159904 : __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
4416 159904 : const __m256i x0 = _mm256_packus_epi16(p0, p1);
4417 :
4418 159904 : p0 = paeth_pred(left, top1, topleft);
4419 159904 : p1 = _mm256_permute4x64_epi64(p0, 0xe);
4420 159904 : const __m256i x1 = _mm256_packus_epi16(p0, p1);
4421 :
4422 159904 : return _mm256_permute2x128_si256(x0, x1, 0x20);
4423 : }
4424 :
4425 9994 : void eb_aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
4426 : const uint8_t *above, const uint8_t *left) {
4427 9994 : const __m256i l = get_left_vector(left);
4428 9994 : const __m256i t0 = get_top_vector(above);
4429 9994 : const __m256i t1 = get_top_vector(above + 16);
4430 19988 : const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
4431 9994 : __m256i rep = _mm256_set1_epi16(0x8000);
4432 9994 : const __m256i one = _mm256_set1_epi16(1);
4433 :
4434 : int i;
4435 169898 : for (i = 0; i < 16; ++i) {
4436 159904 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4437 :
4438 159904 : const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
4439 :
4440 : _mm256_storeu_si256((__m256i *)dst, r);
4441 :
4442 159904 : dst += stride;
4443 159904 : rep = _mm256_add_epi16(rep, one);
4444 : }
4445 9994 : }
4446 :
4447 18773 : void eb_aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
4448 : const uint8_t *above, const uint8_t *left) {
4449 18773 : __m256i l = get_left_vector(left);
4450 18773 : const __m256i t0 = get_top_vector(above);
4451 18773 : const __m256i t1 = get_top_vector(above + 16);
4452 37546 : const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
4453 18773 : __m256i rep = _mm256_set1_epi16(0x8000);
4454 18773 : const __m256i one = _mm256_set1_epi16(1);
4455 :
4456 : int i;
4457 319139 : for (i = 0; i < 16; ++i) {
4458 300366 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4459 :
4460 300366 : const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
4461 300367 : const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
4462 :
4463 : _mm_storeu_si128((__m128i *)dst, r0);
4464 300366 : _mm_storeu_si128((__m128i *)(dst + 16), r1);
4465 :
4466 300366 : dst += stride;
4467 300366 : rep = _mm256_add_epi16(rep, one);
4468 : }
4469 :
4470 18773 : l = get_left_vector(left + 16);
4471 18773 : rep = _mm256_set1_epi16(0x8000);
4472 319139 : for (i = 0; i < 16; ++i) {
4473 300367 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4474 :
4475 300367 : const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
4476 300367 : const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
4477 :
4478 : _mm_storeu_si128((__m128i *)dst, r0);
4479 300366 : _mm_storeu_si128((__m128i *)(dst + 16), r1);
4480 :
4481 300366 : dst += stride;
4482 300366 : rep = _mm256_add_epi16(rep, one);
4483 : }
4484 18772 : }
4485 :
4486 1983 : void eb_aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
4487 : const uint8_t *above, const uint8_t *left) {
4488 1983 : const __m256i t0 = get_top_vector(above);
4489 1983 : const __m256i t1 = get_top_vector(above + 16);
4490 3966 : const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
4491 1983 : const __m256i one = _mm256_set1_epi16(1);
4492 :
4493 : int i, j;
4494 9915 : for (j = 0; j < 4; ++j) {
4495 7932 : const __m256i l = get_left_vector(left + j * 16);
4496 7932 : __m256i rep = _mm256_set1_epi16(0x8000);
4497 134844 : for (i = 0; i < 16; ++i) {
4498 126912 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4499 :
4500 126912 : const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
4501 126912 : const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
4502 :
4503 : _mm_storeu_si128((__m128i *)dst, r0);
4504 126912 : _mm_storeu_si128((__m128i *)(dst + 16), r1);
4505 :
4506 126912 : dst += stride;
4507 126912 : rep = _mm256_add_epi16(rep, one);
4508 : }
4509 : }
4510 1983 : }
4511 :
4512 1513 : void eb_aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
4513 : const uint8_t *above, const uint8_t *left) {
4514 1513 : const __m256i t0 = get_top_vector(above);
4515 1513 : const __m256i t1 = get_top_vector(above + 16);
4516 1513 : const __m256i t2 = get_top_vector(above + 32);
4517 1513 : const __m256i t3 = get_top_vector(above + 48);
4518 3026 : const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
4519 1513 : const __m256i one = _mm256_set1_epi16(1);
4520 :
4521 : int i, j;
4522 4539 : for (j = 0; j < 2; ++j) {
4523 3026 : const __m256i l = get_left_vector(left + j * 16);
4524 3026 : __m256i rep = _mm256_set1_epi16(0x8000);
4525 51442 : for (i = 0; i < 16; ++i) {
4526 48416 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4527 :
4528 48416 : const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
4529 48416 : const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
4530 48416 : const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
4531 48416 : const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
4532 :
4533 : _mm_storeu_si128((__m128i *)dst, r0);
4534 48416 : _mm_storeu_si128((__m128i *)(dst + 16), r1);
4535 48416 : _mm_storeu_si128((__m128i *)(dst + 32), r2);
4536 48416 : _mm_storeu_si128((__m128i *)(dst + 48), r3);
4537 :
4538 48416 : dst += stride;
4539 48416 : rep = _mm256_add_epi16(rep, one);
4540 : }
4541 : }
4542 1513 : }
4543 :
4544 3686 : void eb_aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
4545 : const uint8_t *above, const uint8_t *left) {
4546 3686 : const __m256i t0 = get_top_vector(above);
4547 3686 : const __m256i t1 = get_top_vector(above + 16);
4548 3686 : const __m256i t2 = get_top_vector(above + 32);
4549 3686 : const __m256i t3 = get_top_vector(above + 48);
4550 7372 : const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
4551 3686 : const __m256i one = _mm256_set1_epi16(1);
4552 :
4553 : int i, j;
4554 18427 : for (j = 0; j < 4; ++j) {
4555 14744 : const __m256i l = get_left_vector(left + j * 16);
4556 14744 : __m256i rep = _mm256_set1_epi16(0x8000);
4557 250623 : for (i = 0; i < 16; ++i) {
4558 235882 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4559 :
4560 235882 : const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
4561 235893 : const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
4562 235883 : const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
4563 235881 : const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
4564 :
4565 : _mm_storeu_si128((__m128i *)dst, r0);
4566 235879 : _mm_storeu_si128((__m128i *)(dst + 16), r1);
4567 235879 : _mm_storeu_si128((__m128i *)(dst + 32), r2);
4568 235879 : _mm_storeu_si128((__m128i *)(dst + 48), r3);
4569 :
4570 235879 : dst += stride;
4571 235879 : rep = _mm256_add_epi16(rep, one);
4572 : }
4573 : }
4574 3683 : }
4575 :
4576 2366 : void eb_aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
4577 : const uint8_t *above, const uint8_t *left) {
4578 2366 : const __m256i t0 = get_top_vector(above);
4579 2366 : const __m256i t1 = get_top_vector(above + 16);
4580 2366 : const __m256i t2 = get_top_vector(above + 32);
4581 2366 : const __m256i t3 = get_top_vector(above + 48);
4582 4732 : const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
4583 2366 : const __m256i one = _mm256_set1_epi16(1);
4584 :
4585 : int i;
4586 2366 : const __m256i l = get_left_vector(left);
4587 2366 : __m256i rep = _mm256_set1_epi16(0x8000);
4588 40222 : for (i = 0; i < 16; ++i) {
4589 37856 : const __m256i l16 = _mm256_shuffle_epi8(l, rep);
4590 :
4591 37856 : const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
4592 37856 : const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
4593 37856 : const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
4594 37856 : const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
4595 :
4596 : _mm_storeu_si128((__m128i *)dst, r0);
4597 37856 : _mm_storeu_si128((__m128i *)(dst + 16), r1);
4598 37856 : _mm_storeu_si128((__m128i *)(dst + 32), r2);
4599 37856 : _mm_storeu_si128((__m128i *)(dst + 48), r3);
4600 :
4601 37856 : dst += stride;
4602 37856 : rep = _mm256_add_epi16(rep, one);
4603 : }
4604 2366 : }
4605 :
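/*
 * High-bitdepth Paeth predictors. The pixels are already 16-bit, so each row
 * comes straight from paeth_pred() and is stored without the pack-to-8-bit
 * step used in the 8-bit kernels above. The bd argument is unused because the
 * predictor only ever returns one of the (already in-range) neighbours.
 */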
4606 0 : void eb_aom_highbd_paeth_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
4607 : const uint16_t *above, const uint16_t *left, int bd) {
4608 : (void) bd;
4609 0 : const __m256i tl16 = _mm256_set1_epi16(above[-1]);
4610 0 : const __m256i top = _mm256_loadu_si256((const __m256i *)above);
4611 : __m256i l16, row;
4612 : int i;
4613 :
4614 0 : for (i = 0; i < 4; ++i) {
4615 0 : l16 = _mm256_set1_epi16(left[i]);
4616 0 : row = paeth_pred(&l16, &top, &tl16);
4617 : _mm256_storeu_si256((__m256i *)dst, row);
4618 0 : dst += stride;
4619 : }
4620 0 : }
4621 :
4622 0 : void eb_aom_highbd_paeth_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
4623 : const uint16_t *above, const uint16_t *left, int bd) {
4624 0 : const __m256i tl16 = _mm256_set1_epi16(above[-1]);
4625 0 : const __m256i top = _mm256_loadu_si256((const __m256i *)above);
4626 : __m256i l16, row;
4627 : int i;
4628 : (void) bd;
4629 :
4630 0 : for (i = 0; i < 8; ++i) {
4631 0 : l16 = _mm256_set1_epi16(left[i]);
4632 0 : row = paeth_pred(&l16, &top, &tl16);
4633 : _mm256_storeu_si256((__m256i *)dst, row);
4634 0 : dst += stride;
4635 : }
4636 0 : }
4637 :
4638 0 : void eb_aom_highbd_paeth_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
4639 : const uint16_t *above, const uint16_t *left, int bd) {
4640 0 : const __m256i tl16 = _mm256_set1_epi16(above[-1]);
4641 0 : const __m256i top = _mm256_loadu_si256((const __m256i *)above);
4642 : __m256i l16, row;
4643 : int i;
4644 : (void) bd;
4645 :
4646 0 : for (i = 0; i < 16; ++i) {
4647 0 : l16 = _mm256_set1_epi16(left[i]);
4648 0 : row = paeth_pred(&l16, &top, &tl16);
4649 : _mm256_storeu_si256((__m256i *)dst, row);
4650 0 : dst += stride;
4651 : }
4652 0 : }
4653 :
4654 0 : void eb_aom_highbd_paeth_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
4655 : const uint16_t *above, const uint16_t *left, int bd) {
4656 0 : const __m256i tl16 = _mm256_set1_epi16(above[-1]);
4657 0 : const __m256i top = _mm256_loadu_si256((const __m256i *)above);
4658 : __m256i l16, row;
4659 : int i;
4660 : (void) bd;
4661 :
4662 0 : for (i = 0; i < 32; ++i) {
4663 0 : l16 = _mm256_set1_epi16(left[i]);
4664 0 : row = paeth_pred(&l16, &top, &tl16);
4665 : _mm256_storeu_si256((__m256i *)dst, row);
4666 0 : dst += stride;
4667 : }
4668 0 : }
4669 :
4670 0 : void eb_aom_highbd_paeth_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
4671 : const uint16_t *above, const uint16_t *left, int bd) {
4672 0 : const __m256i tl16 = _mm256_set1_epi16(above[-1]);
4673 0 : const __m256i top = _mm256_loadu_si256((const __m256i *)above);
4674 : __m256i l16, row;
4675 : int i;
4676 : (void) bd;
4677 :
4678 0 : for (i = 0; i < 64; ++i) {
4679 0 : l16 = _mm256_set1_epi16(left[i]);
4680 0 : row = paeth_pred(&l16, &top, &tl16);
4681 : _mm256_storeu_si256((__m256i *)dst, row);
4682 0 : dst += stride;
4683 : }
4684 0 : }
4685 :
4686 0 : void eb_aom_highbd_paeth_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
4687 : const uint16_t *above, const uint16_t *left, int bd) {
4688 0 : const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
4689 0 : const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
4690 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4691 : __m256i l16, row;
4692 : int i;
4693 : (void) bd;
4694 :
4695 0 : for (i = 0; i < 8; ++i) {
4696 0 : l16 = _mm256_set1_epi16(left[i]);
4697 :
4698 0 : row = paeth_pred(&l16, &t0, &tl);
4699 : _mm256_storeu_si256((__m256i *)dst, row);
4700 :
4701 0 : row = paeth_pred(&l16, &t1, &tl);
4702 0 : _mm256_storeu_si256((__m256i *)(dst + 16), row);
4703 :
4704 0 : dst += stride;
4705 : }
4706 0 : }
4707 :
4708 0 : void eb_aom_highbd_paeth_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
4709 : const uint16_t *above, const uint16_t *left, int bd) {
4710 0 : const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
4711 0 : const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
4712 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4713 : __m256i l16, row;
4714 : int i;
4715 : (void) bd;
4716 :
4717 0 : for (i = 0; i < 16; ++i) {
4718 0 : l16 = _mm256_set1_epi16(left[i]);
4719 :
4720 0 : row = paeth_pred(&l16, &t0, &tl);
4721 : _mm256_storeu_si256((__m256i *)dst, row);
4722 :
4723 0 : row = paeth_pred(&l16, &t1, &tl);
4724 0 : _mm256_storeu_si256((__m256i *)(dst + 16), row);
4725 :
4726 0 : dst += stride;
4727 : }
4728 0 : }
4729 :
4730 0 : void eb_aom_highbd_paeth_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
4731 : const uint16_t *above, const uint16_t *left, int bd) {
4732 0 : const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
4733 0 : const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
4734 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4735 : __m256i l16, row;
4736 : int i;
4737 : (void) bd;
4738 :
4739 0 : for (i = 0; i < 32; ++i) {
4740 0 : l16 = _mm256_set1_epi16(left[i]);
4741 :
4742 0 : row = paeth_pred(&l16, &t0, &tl);
4743 : _mm256_storeu_si256((__m256i *)dst, row);
4744 :
4745 0 : row = paeth_pred(&l16, &t1, &tl);
4746 0 : _mm256_storeu_si256((__m256i *)(dst + 16), row);
4747 :
4748 0 : dst += stride;
4749 : }
4750 0 : }
4751 :
4752 0 : void eb_aom_highbd_paeth_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
4753 : const uint16_t *above, const uint16_t *left, int bd) {
4754 0 : const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
4755 0 : const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
4756 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4757 : __m256i l16, row;
4758 : int i;
4759 : (void) bd;
4760 :
4761 0 : for (i = 0; i < 64; ++i) {
4762 0 : l16 = _mm256_set1_epi16(left[i]);
4763 :
4764 0 : row = paeth_pred(&l16, &t0, &tl);
4765 : _mm256_storeu_si256((__m256i *)dst, row);
4766 :
4767 0 : row = paeth_pred(&l16, &t1, &tl);
4768 0 : _mm256_storeu_si256((__m256i *)(dst + 16), row);
4769 :
4770 0 : dst += stride;
4771 : }
4772 0 : }
4773 :
4774 0 : void eb_aom_highbd_paeth_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
4775 : const uint16_t *above, const uint16_t *left, int bd) {
4776 0 : const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
4777 0 : const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
4778 0 : const __m256i t2 = _mm256_loadu_si256((const __m256i *)(above + 32));
4779 0 : const __m256i t3 = _mm256_loadu_si256((const __m256i *)(above + 48));
4780 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4781 : __m256i l16, row;
4782 : int i;
4783 : (void) bd;
4784 :
4785 0 : for (i = 0; i < 16; ++i) {
4786 0 : l16 = _mm256_set1_epi16(left[i]);
4787 :
4788 0 : row = paeth_pred(&l16, &t0, &tl);
4789 : _mm256_storeu_si256((__m256i *)dst, row);
4790 :
4791 0 : row = paeth_pred(&l16, &t1, &tl);
4792 0 : _mm256_storeu_si256((__m256i *)(dst + 16), row);
4793 :
4794 0 : row = paeth_pred(&l16, &t2, &tl);
4795 0 : _mm256_storeu_si256((__m256i *)(dst + 32), row);
4796 :
4797 0 : row = paeth_pred(&l16, &t3, &tl);
4798 0 : _mm256_storeu_si256((__m256i *)(dst + 48), row);
4799 :
4800 0 : dst += stride;
4801 : }
4802 0 : }
4803 :
4804 0 : void eb_aom_highbd_paeth_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
4805 : const uint16_t *above, const uint16_t *left, int bd) {
4806 0 : const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
4807 0 : const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
4808 0 : const __m256i t2 = _mm256_loadu_si256((const __m256i *)(above + 32));
4809 0 : const __m256i t3 = _mm256_loadu_si256((const __m256i *)(above + 48));
4810 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4811 : __m256i l16, row;
4812 : int i;
4813 : (void) bd;
4814 :
4815 0 : for (i = 0; i < 32; ++i) {
4816 0 : l16 = _mm256_set1_epi16(left[i]);
4817 :
4818 0 : row = paeth_pred(&l16, &t0, &tl);
4819 : _mm256_storeu_si256((__m256i *)dst, row);
4820 :
4821 0 : row = paeth_pred(&l16, &t1, &tl);
4822 0 : _mm256_storeu_si256((__m256i *)(dst + 16), row);
4823 :
4824 0 : row = paeth_pred(&l16, &t2, &tl);
4825 0 : _mm256_storeu_si256((__m256i *)(dst + 32), row);
4826 :
4827 0 : row = paeth_pred(&l16, &t3, &tl);
4828 0 : _mm256_storeu_si256((__m256i *)(dst + 48), row);
4829 :
4830 0 : dst += stride;
4831 : }
4832 0 : }
4833 :
4834 0 : void eb_aom_highbd_paeth_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
4835 : const uint16_t *above, const uint16_t *left, int bd) {
4836 0 : const __m256i t0 = _mm256_loadu_si256((const __m256i *)above);
4837 0 : const __m256i t1 = _mm256_loadu_si256((const __m256i *)(above + 16));
4838 0 : const __m256i t2 = _mm256_loadu_si256((const __m256i *)(above + 32));
4839 0 : const __m256i t3 = _mm256_loadu_si256((const __m256i *)(above + 48));
4840 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4841 : __m256i l16, row;
4842 : int i;
4843 : (void) bd;
4844 :
4845 0 : for (i = 0; i < 64; ++i) {
4846 0 : l16 = _mm256_set1_epi16(left[i]);
4847 :
4848 0 : row = paeth_pred(&l16, &t0, &tl);
4849 : _mm256_storeu_si256((__m256i *)dst, row);
4850 :
4851 0 : row = paeth_pred(&l16, &t1, &tl);
4852 0 : _mm256_storeu_si256((__m256i *)(dst + 16), row);
4853 :
4854 0 : row = paeth_pred(&l16, &t2, &tl);
4855 0 : _mm256_storeu_si256((__m256i *)(dst + 32), row);
4856 :
4857 0 : row = paeth_pred(&l16, &t3, &tl);
4858 0 : _mm256_storeu_si256((__m256i *)(dst + 48), row);
4859 :
4860 0 : dst += stride;
4861 : }
4862 0 : }
4863 :
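/*
 * 8-wide high-bitdepth kernels: two output rows share one __m256i, one row
 * per 128-bit lane, so the loops below advance two rows per iteration and
 * store each lane separately.
 */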
4864 0 : void eb_aom_highbd_paeth_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
4865 : const uint16_t *above, const uint16_t *left, int bd) {
4866 0 : const __m128i t = _mm_loadu_si128((const __m128i *)above);
4867 0 : const __m256i t0 = _mm256_setr_m128i(t, t);
4868 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4869 : __m256i l16, row;
4870 : int i;
4871 : (void) bd;
4872 :
4873 0 : for (i = 0; i < 4; i += 2) {
4874 0 : l16 = _mm256_setr_m128i(_mm_set1_epi16(left[i]),
4875 : _mm_set1_epi16(left[i + 1]));
4876 :
4877 0 : row = paeth_pred(&l16, &t0, &tl);
4878 0 : _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 0));
4879 0 : dst += stride;
4880 0 : _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 1));
4881 0 : dst += stride;
4882 : }
4883 0 : }
4884 :
4885 0 : void eb_aom_highbd_paeth_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
4886 : const uint16_t *above, const uint16_t *left, int bd) {
4887 0 : const __m128i t = _mm_loadu_si128((const __m128i *)above);
4888 0 : const __m256i t0 = _mm256_setr_m128i(t, t);
4889 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4890 : __m256i l16, row;
4891 : int i;
4892 : (void) bd;
4893 :
4894 0 : for (i = 0; i < 8; i += 2) {
4895 0 : l16 = _mm256_setr_m128i(_mm_set1_epi16(left[i]),
4896 : _mm_set1_epi16(left[i + 1]));
4897 :
4898 0 : row = paeth_pred(&l16, &t0, &tl);
4899 0 : _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 0));
4900 0 : dst += stride;
4901 0 : _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 1));
4902 0 : dst += stride;
4903 : }
4904 0 : }
4905 :
4906 0 : void eb_aom_highbd_paeth_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
4907 : const uint16_t *above, const uint16_t *left, int bd) {
4908 0 : const __m128i t = _mm_loadu_si128((const __m128i *)above);
4909 0 : const __m256i t0 = _mm256_setr_m128i(t, t);
4910 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4911 : __m256i l16, row;
4912 : int i;
4913 : (void) bd;
4914 :
4915 0 : for (i = 0; i < 16; i += 2) {
4916 0 : l16 = _mm256_setr_m128i(_mm_set1_epi16(left[i]),
4917 : _mm_set1_epi16(left[i + 1]));
4918 :
4919 0 : row = paeth_pred(&l16, &t0, &tl);
4920 0 : _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 0));
4921 0 : dst += stride;
4922 0 : _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 1));
4923 0 : dst += stride;
4924 : }
4925 0 : }
4926 :
4927 0 : void eb_aom_highbd_paeth_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
4928 : const uint16_t *above, const uint16_t *left, int bd) {
4929 0 : const __m128i t = _mm_loadu_si128((const __m128i *)above);
4930 0 : const __m256i t0 = _mm256_setr_m128i(t, t);
4931 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4932 : __m256i l16, row;
4933 : int i;
4934 : (void) bd;
4935 :
4936 0 : for (i = 0; i < 32; i += 2) {
4937 0 : l16 = _mm256_setr_m128i(_mm_set1_epi16(left[i]),
4938 : _mm_set1_epi16(left[i + 1]));
4939 :
4940 0 : row = paeth_pred(&l16, &t0, &tl);
4941 0 : _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 0));
4942 0 : dst += stride;
4943 0 : _mm_storeu_si128((__m128i *)dst, _mm256_extractf128_si256(row, 1));
4944 0 : dst += stride;
4945 : }
4946 0 : }
4947 :
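/*
 * 4-wide and 2-wide high-bitdepth kernels pack several rows into one __m256i.
 * For the 4-wide variants each 64-bit lane carries one row, built by
 * replicating that row's left pixel across the lane with shift/OR; the 2x2
 * kernel does the same with 32-bit chunks and extracts each row as a scalar.
 */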
4948 0 : void eb_aom_highbd_paeth_predictor_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
4949 : const uint16_t *above, const uint16_t *left, int bd) {
4950 0 : const __m256i t0 = _mm256_set1_epi64x(((uint64_t*)above)[0]);
4951 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4952 : __m256i l16, row;
4953 : (void) bd;
4954 :
4955 : /* l16 = left: 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 */
4956 0 : __m256i t1 = _mm256_cvtepi16_epi64(
4957 : _mm_lddqu_si128((__m128i const*)left));
4958 0 : __m256i t1s = _mm256_slli_epi64(t1, 16);
4959 0 : t1 = _mm256_or_si256(t1s, t1);
4960 0 : t1s = _mm256_slli_epi64(t1, 32);
4961 0 : l16 = _mm256_or_si256(t1s, t1);
4962 :
4963 0 : row = paeth_pred(&l16, &t0, &tl);
4964 :
4965 0 : *(uint64_t*)&dst[0 * stride] = _mm256_extract_epi64(row, 0);
4966 0 : *(uint64_t*)&dst[1 * stride] = _mm256_extract_epi64(row, 1);
4967 0 : *(uint64_t*)&dst[2 * stride] = _mm256_extract_epi64(row, 2);
4968 0 : *(uint64_t*)&dst[3 * stride] = _mm256_extract_epi64(row, 3);
4969 0 : }
4970 :
4971 0 : void eb_aom_highbd_paeth_predictor_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
4972 : const uint16_t *above, const uint16_t *left, int bd) {
4973 0 : const __m256i t0 = _mm256_set1_epi64x(((uint64_t*)above)[0]);
4974 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
4975 : __m256i l16, row;
4976 : int i;
4977 : (void) bd;
4978 :
4979 0 : for (i = 0; i < 8; i += 4) {
4980 : /* l16 = left: 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 */
4981 0 : __m256i t1 = _mm256_cvtepi16_epi64(
4982 0 : _mm_lddqu_si128((__m128i const*)&left[i]));
4983 0 : __m256i t1s = _mm256_slli_epi64(t1, 16);
4984 0 : t1 = _mm256_or_si256(t1s, t1);
4985 0 : t1s = _mm256_slli_epi64(t1, 32);
4986 0 : l16 = _mm256_or_si256(t1s, t1);
4987 :
4988 0 : row = paeth_pred(&l16, &t0, &tl);
4989 :
4990 0 : *(uint64_t*)&dst[0 * stride] = _mm256_extract_epi64(row, 0);
4991 0 : *(uint64_t*)&dst[1 * stride] = _mm256_extract_epi64(row, 1);
4992 0 : *(uint64_t*)&dst[2 * stride] = _mm256_extract_epi64(row, 2);
4993 0 : *(uint64_t*)&dst[3 * stride] = _mm256_extract_epi64(row, 3);
4994 0 : dst += 4 * stride;
4995 : }
4996 0 : }
4997 :
4998 0 : void eb_aom_highbd_paeth_predictor_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
4999 : const uint16_t *above, const uint16_t *left, int bd) {
5000 0 : const __m256i t0 = _mm256_set1_epi64x(((uint64_t*)above)[0]);
5001 0 : const __m256i tl = _mm256_set1_epi16(above[-1]);
5002 : __m256i l16, row;
5003 :
5004 : (void) bd;
5005 : int i;
5006 0 : for (i = 0; i < 16; i += 4) {
5007 : /* l16 = left: 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 */
5008 0 : __m256i t1 = _mm256_cvtepi16_epi64(
5009 0 : _mm_lddqu_si128((__m128i const*)&left[i]));
5010 0 : __m256i t1s = _mm256_slli_epi64(t1, 16);
5011 0 : t1 = _mm256_or_si256(t1s, t1);
5012 0 : t1s = _mm256_slli_epi64(t1, 32);
5013 0 : l16 = _mm256_or_si256(t1s, t1);
5014 :
5015 0 : row = paeth_pred(&l16, &t0, &tl);
5016 :
5017 0 : *(uint64_t*)&dst[0 * stride] = _mm256_extract_epi64(row, 0);
5018 0 : *(uint64_t*)&dst[1 * stride] = _mm256_extract_epi64(row, 1);
5019 0 : *(uint64_t*)&dst[2 * stride] = _mm256_extract_epi64(row, 2);
5020 0 : *(uint64_t*)&dst[3 * stride] = _mm256_extract_epi64(row, 3);
5021 0 : dst += 4 * stride;
5022 : }
5023 0 : }
5024 :
5025 0 : void eb_aom_highbd_paeth_predictor_2x2_avx2(uint16_t *dst, ptrdiff_t stride,
5026 : const uint16_t *above, const uint16_t *left, int bd) {
5027 : (void) bd;
5028 0 : __m256i tl = _mm256_set1_epi16(above[-1]);
5029 0 : __m256i t0 = _mm256_set1_epi32(((uint32_t*)above)[0]);
5030 :
5031 : /* l16 = left: 0, 0, 1, 1, 0, 0, 0, 0 */
5032 0 : __m256i gg = _mm256_cvtepi16_epi32(_mm_cvtsi32_si128(*(uint32_t const*)(left)));
5033 0 : __m256i ss = _mm256_slli_epi64(gg, 16);
5034 0 : __m256i l16 = _mm256_or_si256(gg, ss);
5035 :
5036 0 : __m256i row = paeth_pred(&l16, &t0, &tl);
5037 :
5038 0 : *(uint32_t*)&dst[0] = _mm256_extract_epi32(row, 0);
5039 0 : *(uint32_t*)&dst[stride] = _mm256_extract_epi32(row, 1);
5040 0 : }
|