Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 :
6 : #include <immintrin.h>
7 : #include "EbHighbdIntraPrediction_SSE2.h"
8 : #include "EbDefinitions.h"
9 : #include "aom_dsp_rtcd.h"
10 : #include "EbIntraPrediction_AVX2.h"
11 :
12 : // =============================================================================
13 :
14 : // DC RELATED PRED
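// Horizontal-sum helpers for the DC predictors. Sums of up to 16 samples stay in
// 16-bit lanes (16 * 4095 = 65,520 still fits for 12-bit input); larger counts are
// widened to 32 bits on the way to the final reduction (dc_sum_large / dc_sum_larger).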
15 :
16 : // Handles sums of up to 64 elements.
17 0 : static INLINE __m128i dc_sum_large(const __m256i src) {
18 0 : const __m128i s_lo = _mm256_extracti128_si256(src, 0);
19 0 : const __m128i s_hi = _mm256_extracti128_si256(src, 1);
20 : __m128i sum, sum_hi;
21 0 : sum = _mm_add_epi16(s_lo, s_hi);
22 0 : sum_hi = _mm_srli_si128(sum, 8);
23 0 : sum = _mm_add_epi16(sum, sum_hi);
24 : // Widen to 32 bits: the next horizontal add would overflow 16 bits with 12-bit samples.
25 0 : sum = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
26 :
27 0 : return dc_sum_4x32bit(sum);
28 : }
29 :
30 : // Handles sums of 65 to 128 elements.
31 0 : static INLINE __m128i dc_sum_larger(const __m256i src) {
32 0 : const __m128i s_lo = _mm256_extracti128_si256(src, 0);
33 0 : const __m128i s_hi = _mm256_extracti128_si256(src, 1);
34 : __m128i sum, sum_hi;
35 0 : sum = _mm_add_epi16(s_lo, s_hi);
36 : // Widen to 32 bits before adding; with 12-bit samples further 16-bit adds would overflow.
37 0 : sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
38 0 : sum = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
39 0 : sum = _mm_add_epi32(sum, sum_hi);
40 :
41 0 : return dc_sum_4x32bit(sum);
42 : }
43 :
44 0 : static INLINE __m128i dc_sum_16(const uint16_t *const src) {
45 0 : const __m256i s = _mm256_loadu_si256((const __m256i *)src);
46 0 : const __m128i s_lo = _mm256_extracti128_si256(s, 0);
47 0 : const __m128i s_hi = _mm256_extracti128_si256(s, 1);
48 0 : const __m128i sum = _mm_add_epi16(s_lo, s_hi);
49 0 : return dc_sum_8x16bit(sum);
50 : }
51 :
52 0 : static INLINE __m128i dc_sum_32(const uint16_t *const src) {
53 0 : const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src + 0x00));
54 0 : const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + 0x10));
55 0 : const __m256i sum = _mm256_add_epi16(s0, s1);
56 0 : return dc_sum_large(sum);
57 : }
58 :
59 0 : static INLINE __m128i dc_sum_64(const uint16_t *const src) {
60 0 : const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src + 0x00));
61 0 : const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + 0x10));
62 0 : const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 0x20));
63 0 : const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 0x30));
64 0 : const __m256i s01 = _mm256_add_epi16(s0, s1);
65 0 : const __m256i s23 = _mm256_add_epi16(s2, s3);
66 0 : const __m256i sum = _mm256_add_epi16(s01, s23);
67 0 : return dc_sum_large(sum);
68 : }
69 :
70 0 : static INLINE __m128i dc_sum_4_16(const uint16_t *const src_4,
71 : const uint16_t *const src_16) {
72 0 : const __m128i s_4 = _mm_loadl_epi64((const __m128i *)src_4);
73 0 : const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
74 0 : const __m128i s_lo = _mm256_extracti128_si256(s_16, 0);
75 0 : const __m128i s_hi = _mm256_extracti128_si256(s_16, 1);
76 0 : const __m128i s_16_sum0 = _mm_add_epi16(s_lo, s_hi);
77 0 : const __m128i s_16_sum_hi = _mm_srli_si128(s_16_sum0, 8);
78 0 : const __m128i s_16_sum = _mm_add_epi16(s_16_sum0, s_16_sum_hi);
79 0 : const __m128i sum = _mm_add_epi16(s_16_sum, s_4);
80 0 : return dc_sum_4x16bit_large(sum);
81 : }
82 :
83 0 : static INLINE __m128i dc_sum_8_16(const uint16_t *const src_8,
84 : const uint16_t *const src_16) {
85 0 : const __m128i s_8 = _mm_load_si128((const __m128i *)src_8);
86 0 : const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
87 0 : const __m128i s_lo = _mm256_extracti128_si256(s_16, 0);
88 0 : const __m128i s_hi = _mm256_extracti128_si256(s_16, 1);
89 0 : const __m128i s_16_sum = _mm_add_epi16(s_lo, s_hi);
90 0 : const __m128i sum = _mm_add_epi16(s_16_sum, s_8);
91 0 : return dc_sum_8x16bit_large(sum);
92 : }
93 :
94 0 : static INLINE __m128i dc_sum_8_32(const uint16_t *const src_8,
95 : const uint16_t *const src_32) {
96 0 : const __m128i s_8 = _mm_loadu_si128((const __m128i *)src_8);
97 0 : const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
98 0 : const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
99 0 : const __m256i s_32 = _mm256_add_epi16(s_32_0, s_32_1);
100 0 : const __m128i s_lo = _mm256_extracti128_si256(s_32, 0);
101 0 : const __m128i s_hi = _mm256_extracti128_si256(s_32, 1);
102 0 : const __m128i s_16_sum = _mm_add_epi16(s_lo, s_hi);
103 0 : const __m128i sum = _mm_add_epi16(s_8, s_16_sum);
104 0 : return dc_sum_8x16bit_large(sum);
105 : }
106 :
107 0 : static INLINE __m128i dc_sum_16_16(const uint16_t *const src0,
108 : const uint16_t *const src1) {
109 0 : const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
110 0 : const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
111 0 : const __m256i sum = _mm256_add_epi16(s0, s1);
112 0 : return dc_sum_large(sum);
113 : }
114 :
115 0 : static INLINE __m128i dc_sum_16_32(const uint16_t *const src_16,
116 : const uint16_t *const src_32) {
117 0 : const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
118 0 : const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
119 0 : const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
120 0 : const __m256i sum0 = _mm256_add_epi16(s_16, s_32_0);
121 0 : const __m256i sum = _mm256_add_epi16(sum0, s_32_1);
122 0 : return dc_sum_large(sum);
123 : }
124 :
125 0 : static INLINE __m128i dc_sum_32_32(const uint16_t *const src0,
126 : const uint16_t *const src1) {
127 0 : const __m256i s0_0 = _mm256_loadu_si256((const __m256i *)(src0 + 0x00));
128 0 : const __m256i s0_1 = _mm256_loadu_si256((const __m256i *)(src0 + 0x10));
129 0 : const __m256i s1_0 = _mm256_loadu_si256((const __m256i *)(src1 + 0x00));
130 0 : const __m256i s1_1 = _mm256_loadu_si256((const __m256i *)(src1 + 0x10));
131 0 : const __m256i sum0 = _mm256_add_epi16(s0_0, s1_0);
132 0 : const __m256i sum1 = _mm256_add_epi16(s0_1, s1_1);
133 0 : const __m256i sum = _mm256_add_epi16(sum0, sum1);
134 0 : return dc_sum_large(sum);
135 : }
136 :
137 0 : static INLINE __m128i dc_sum_32_64(const uint16_t *const src_32,
138 : const uint16_t *const src_64) {
139 0 : const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
140 0 : const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
141 0 : const __m256i s_64_0 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x00));
142 0 : const __m256i s_64_1 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x10));
143 0 : const __m256i s_64_2 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x20));
144 0 : const __m256i s_64_3 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x30));
145 0 : const __m256i sum0 = _mm256_add_epi16(s_32_0, s_64_0);
146 0 : const __m256i sum1 = _mm256_add_epi16(s_32_1, s_64_1);
147 0 : const __m256i sum2 = _mm256_add_epi16(s_64_2, s_64_3);
148 0 : const __m256i sum3 = _mm256_add_epi16(sum0, sum1);
149 0 : const __m256i sum = _mm256_add_epi16(sum2, sum3);
150 0 : return dc_sum_larger(sum);
151 : }
152 :
153 0 : static INLINE __m128i dc_sum_64_64(const uint16_t *const src0,
154 : const uint16_t *const src1) {
155 0 : const __m256i s0_0 = _mm256_loadu_si256((const __m256i *)(src0 + 0x00));
156 0 : const __m256i s0_1 = _mm256_loadu_si256((const __m256i *)(src0 + 0x10));
157 0 : const __m256i s0_2 = _mm256_loadu_si256((const __m256i *)(src0 + 0x20));
158 0 : const __m256i s0_3 = _mm256_loadu_si256((const __m256i *)(src0 + 0x30));
159 0 : const __m256i s1_0 = _mm256_loadu_si256((const __m256i *)(src1 + 0x00));
160 0 : const __m256i s1_1 = _mm256_loadu_si256((const __m256i *)(src1 + 0x10));
161 0 : const __m256i s1_2 = _mm256_loadu_si256((const __m256i *)(src1 + 0x20));
162 0 : const __m256i s1_3 = _mm256_loadu_si256((const __m256i *)(src1 + 0x30));
163 0 : const __m256i sum0 = _mm256_add_epi16(s0_0, s1_0);
164 0 : const __m256i sum1 = _mm256_add_epi16(s0_1, s1_1);
165 0 : const __m256i sum2 = _mm256_add_epi16(s0_2, s1_2);
166 0 : const __m256i sum3 = _mm256_add_epi16(s0_3, s1_3);
167 0 : const __m256i sum4 = _mm256_add_epi16(sum0, sum1);
168 0 : const __m256i sum5 = _mm256_add_epi16(sum2, sum3);
169 0 : const __m256i sum = _mm256_add_epi16(sum4, sum5);
170 0 : return dc_sum_larger(sum);
171 : }
172 :
173 0 : static INLINE __m128i dc_sum_16_64(const uint16_t *const src_16,
174 : const uint16_t *const src_64)
175 : {
176 0 : const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
177 0 : const __m256i s_64_0 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x00));
178 0 : const __m256i s_64_1 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x10));
179 0 : const __m256i s_64_2 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x20));
180 0 : const __m256i s_64_3 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x30));
181 0 : const __m256i s0 = _mm256_add_epi16(s_16, s_64_0);
182 0 : const __m256i s1 = _mm256_add_epi16(s0, s_64_1);
183 0 : const __m256i s2 = _mm256_add_epi16(s_64_2, s_64_3);
184 0 : const __m256i sum = _mm256_add_epi16(s1, s2);
185 0 : return dc_sum_larger(sum);
186 : }
187 :
188 0 : static INLINE void dc_common_predictor_16xh_kernel(uint16_t *dst,
189 : const ptrdiff_t stride, const int32_t h, const __m256i dc)
190 : {
191 0 : for (int32_t i = 0; i < h; i++) {
192 : _mm256_storeu_si256((__m256i *)dst, dc);
193 0 : dst += stride;
194 : }
195 0 : }
196 :
197 0 : static INLINE void dc_common_predictor_32xh_kernel(uint16_t *dst,
198 : const ptrdiff_t stride, const int32_t h, const __m256i dc)
199 : {
200 0 : for (int32_t i = 0; i < h; i++) {
201 : _mm256_storeu_si256((__m256i *)(dst + 0x00), dc);
202 0 : _mm256_storeu_si256((__m256i *)(dst + 0x10), dc);
203 0 : dst += stride;
204 : }
205 0 : }
206 :
207 0 : static INLINE void dc_common_predictor_64xh_kernel(uint16_t *dst,
208 : const ptrdiff_t stride, const int32_t h, const __m256i dc)
209 : {
210 0 : for (int32_t i = 0; i < h; i++) {
211 : _mm256_storeu_si256((__m256i *)(dst + 0x00), dc);
212 0 : _mm256_storeu_si256((__m256i *)(dst + 0x10), dc);
213 0 : _mm256_storeu_si256((__m256i *)(dst + 0x20), dc);
214 0 : _mm256_storeu_si256((__m256i *)(dst + 0x30), dc);
215 0 : dst += stride;
216 : }
217 0 : }
218 :
219 0 : static INLINE void dc_common_predictor_16xh(uint16_t *const dst,
220 : const ptrdiff_t stride, const int32_t h, const __m128i dc)
221 : {
222 0 : const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
223 0 : dc_common_predictor_16xh_kernel(dst, stride, h, expected_dc);
224 0 : }
225 :
226 0 : static INLINE void dc_common_predictor_32xh(uint16_t *const dst,
227 : const ptrdiff_t stride, const int32_t h, const __m128i dc)
228 : {
229 0 : const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
230 0 : dc_common_predictor_32xh_kernel(dst, stride, h, expected_dc);
231 0 : }
232 :
233 0 : static INLINE void dc_common_predictor_64xh(uint16_t *const dst,
234 : const ptrdiff_t stride, const int32_t h, const __m128i dc)
235 : {
236 0 : const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
237 0 : dc_common_predictor_64xh_kernel(dst, stride, h, expected_dc);
238 0 : }
239 :
240 : // =============================================================================
241 :
242 : // DC_128_PRED
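// DC_128 ignores the neighbors and fills the block with the mid-range value
// 1 << (bd - 1): 128, 512 or 2048 for 8-, 10- and 12-bit content respectively.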
243 :
244 : // 16xN
245 :
246 0 : static INLINE void dc_128_predictor_16xh(uint16_t *const dst,
247 : const ptrdiff_t stride, const int32_t h, const int32_t bd)
248 : {
249 0 : const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
250 0 : dc_common_predictor_16xh_kernel(dst, stride, h, dc);
251 0 : }
252 :
253 0 : void eb_aom_highbd_dc_128_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
254 : const uint16_t *above, const uint16_t *left, int32_t bd)
255 : {
256 : (void)above;
257 : (void)left;
258 0 : dc_128_predictor_16xh(dst, stride, 4, bd);
259 0 : }
260 :
261 0 : void eb_aom_highbd_dc_128_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
262 : const uint16_t *above, const uint16_t *left, int32_t bd)
263 : {
264 : (void)above;
265 : (void)left;
266 0 : dc_128_predictor_16xh(dst, stride, 8, bd);
267 0 : }
268 :
269 0 : void eb_aom_highbd_dc_128_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
270 : const uint16_t *above, const uint16_t *left, int32_t bd)
271 : {
272 : (void)above;
273 : (void)left;
274 0 : dc_128_predictor_16xh(dst, stride, 16, bd);
275 0 : }
276 :
277 0 : void eb_aom_highbd_dc_128_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
278 : const uint16_t *above, const uint16_t *left, int32_t bd)
279 : {
280 : (void)above;
281 : (void)left;
282 0 : dc_128_predictor_16xh(dst, stride, 32, bd);
283 0 : }
284 :
285 0 : void eb_aom_highbd_dc_128_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
286 : const uint16_t *above, const uint16_t *left, int32_t bd)
287 : {
288 : (void)above;
289 : (void)left;
290 0 : dc_128_predictor_16xh(dst, stride, 64, bd);
291 0 : }
292 :
293 : // 32xN
294 :
295 0 : static INLINE void dc_128_predictor_32xh(uint16_t *const dst,
296 : const ptrdiff_t stride, const int32_t h, const int32_t bd)
297 : {
298 0 : const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
299 0 : dc_common_predictor_32xh_kernel(dst, stride, h, dc);
300 0 : }
301 :
302 0 : void eb_aom_highbd_dc_128_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
303 : const uint16_t *above, const uint16_t *left, int32_t bd)
304 : {
305 : (void)above;
306 : (void)left;
307 0 : dc_128_predictor_32xh(dst, stride, 8, bd);
308 0 : }
309 :
310 0 : void eb_aom_highbd_dc_128_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
311 : const uint16_t *above, const uint16_t *left, int32_t bd)
312 : {
313 : (void)above;
314 : (void)left;
315 0 : dc_128_predictor_32xh(dst, stride, 16, bd);
316 0 : }
317 :
318 0 : void eb_aom_highbd_dc_128_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
319 : const uint16_t *above, const uint16_t *left, int32_t bd)
320 : {
321 : (void)above;
322 : (void)left;
323 0 : dc_128_predictor_32xh(dst, stride, 32, bd);
324 0 : }
325 :
326 0 : void eb_aom_highbd_dc_128_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
327 : const uint16_t *above, const uint16_t *left, int32_t bd)
328 : {
329 : (void)above;
330 : (void)left;
331 0 : dc_128_predictor_32xh(dst, stride, 64, bd);
332 0 : }
333 :
334 : // 64xN
335 :
336 0 : static INLINE void dc_128_predictor_64xh(uint16_t *const dst,
337 : const ptrdiff_t stride, const int32_t h, const int32_t bd)
338 : {
339 0 : const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
340 0 : dc_common_predictor_64xh_kernel(dst, stride, h, dc);
341 0 : }
342 :
343 0 : void eb_aom_highbd_dc_128_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
344 : const uint16_t *above, const uint16_t *left, int32_t bd)
345 : {
346 : (void)above;
347 : (void)left;
348 0 : dc_128_predictor_64xh(dst, stride, 16, bd);
349 0 : }
350 :
351 0 : void eb_aom_highbd_dc_128_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
352 : const uint16_t *above, const uint16_t *left, int32_t bd)
353 : {
354 : (void)above;
355 : (void)left;
356 0 : dc_128_predictor_64xh(dst, stride, 32, bd);
357 0 : }
358 :
359 0 : void eb_aom_highbd_dc_128_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
360 : const uint16_t *above, const uint16_t *left, int32_t bd)
361 : {
362 : (void)above;
363 : (void)left;
364 0 : dc_128_predictor_64xh(dst, stride, 64, bd);
365 0 : }
366 :
367 : // =============================================================================
368 :
369 : // DC_LEFT_PRED
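// DC_LEFT averages only the left column: add half the sample count, then shift by
// log2(count). 16-bit adds/shifts suffice up to 16 samples; the 32- and 64-sample
// variants switch to 32-bit arithmetic.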
370 :
371 : // 16xN
372 :
373 0 : void eb_aom_highbd_dc_left_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
374 : const uint16_t *above, const uint16_t *left, int32_t bd)
375 : {
376 0 : const __m128i round = _mm_cvtsi32_si128(2);
377 : __m128i sum;
378 : (void)above;
379 : (void)bd;
380 :
381 0 : sum = dc_sum_4(left);
382 0 : sum = _mm_add_epi16(sum, round);
383 0 : sum = _mm_srli_epi16(sum, 2);
384 0 : dc_common_predictor_16xh(dst, stride, 4, sum);
385 0 : }
386 :
387 0 : void eb_aom_highbd_dc_left_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
388 : const uint16_t *above, const uint16_t *left, int32_t bd)
389 : {
390 0 : const __m128i round = _mm_cvtsi32_si128(4);
391 : __m128i sum;
392 : (void)above;
393 : (void)bd;
394 :
395 0 : sum = dc_sum_8(left);
396 0 : sum = _mm_add_epi16(sum, round);
397 0 : sum = _mm_srli_epi16(sum, 3);
398 0 : dc_common_predictor_16xh(dst, stride, 8, sum);
399 0 : }
400 :
401 0 : void eb_aom_highbd_dc_left_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
402 : const uint16_t *above, const uint16_t *left, int32_t bd)
403 : {
404 0 : const __m128i round = _mm_cvtsi32_si128(8);
405 : __m128i sum;
406 : (void)above;
407 : (void)bd;
408 :
409 0 : sum = dc_sum_16(left);
410 0 : sum = _mm_add_epi16(sum, round);
411 0 : sum = _mm_srli_epi16(sum, 4);
412 0 : dc_common_predictor_16xh(dst, stride, 16, sum);
413 0 : }
414 :
415 0 : void eb_aom_highbd_dc_left_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
416 : const uint16_t *above, const uint16_t *left, int32_t bd)
417 : {
418 0 : const __m128i round = _mm_cvtsi32_si128(16);
419 : __m128i sum;
420 : (void)above;
421 : (void)bd;
422 :
423 0 : sum = dc_sum_32(left);
424 0 : sum = _mm_add_epi32(sum, round);
425 0 : sum = _mm_srli_epi32(sum, 5);
426 0 : dc_common_predictor_16xh(dst, stride, 32, sum);
427 0 : }
428 :
429 0 : void eb_aom_highbd_dc_left_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
430 : const uint16_t *above, const uint16_t *left, int32_t bd)
431 : {
432 0 : const __m128i round = _mm_cvtsi32_si128(32);
433 : __m128i sum;
434 : (void)above;
435 : (void)bd;
436 :
437 0 : sum = dc_sum_64(left);
438 0 : sum = _mm_add_epi32(sum, round);
439 0 : sum = _mm_srli_epi32(sum, 6);
440 0 : dc_common_predictor_16xh(dst, stride, 64, sum);
441 0 : }
442 :
443 : // 32xN
444 :
445 0 : void eb_aom_highbd_dc_left_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
446 : const uint16_t *above, const uint16_t *left, int32_t bd)
447 : {
448 0 : const __m128i round = _mm_cvtsi32_si128(4);
449 : __m128i sum;
450 : (void)above;
451 : (void)bd;
452 :
453 0 : sum = dc_sum_8(left);
454 0 : sum = _mm_add_epi16(sum, round);
455 0 : sum = _mm_srli_epi16(sum, 3);
456 0 : dc_common_predictor_32xh(dst, stride, 8, sum);
457 0 : }
458 :
459 0 : void eb_aom_highbd_dc_left_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
460 : const uint16_t *above, const uint16_t *left, int32_t bd)
461 : {
462 0 : const __m128i round = _mm_cvtsi32_si128(8);
463 : __m128i sum;
464 : (void)above;
465 : (void)bd;
466 :
467 0 : sum = dc_sum_16(left);
468 0 : sum = _mm_add_epi16(sum, round);
469 0 : sum = _mm_srli_epi16(sum, 4);
470 0 : dc_common_predictor_32xh(dst, stride, 16, sum);
471 0 : }
472 :
473 0 : void eb_aom_highbd_dc_left_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
474 : const uint16_t *above, const uint16_t *left, int32_t bd)
475 : {
476 0 : const __m128i round = _mm_cvtsi32_si128(16);
477 : __m128i sum;
478 : (void)above;
479 : (void)bd;
480 :
481 0 : sum = dc_sum_32(left);
482 0 : sum = _mm_add_epi32(sum, round);
483 0 : sum = _mm_srli_epi32(sum, 5);
484 0 : dc_common_predictor_32xh(dst, stride, 32, sum);
485 0 : }
486 :
487 0 : void eb_aom_highbd_dc_left_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
488 : const uint16_t *above, const uint16_t *left, int32_t bd)
489 : {
490 0 : const __m128i round = _mm_cvtsi32_si128(32);
491 : __m128i sum;
492 : (void)above;
493 : (void)bd;
494 :
495 0 : sum = dc_sum_64(left);
496 0 : sum = _mm_add_epi32(sum, round);
497 0 : sum = _mm_srli_epi32(sum, 6);
498 0 : dc_common_predictor_32xh(dst, stride, 64, sum);
499 0 : }
500 :
501 : // 64xN
502 :
503 0 : void eb_aom_highbd_dc_left_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
504 : const uint16_t *above, const uint16_t *left, int32_t bd)
505 : {
506 0 : const __m128i round = _mm_cvtsi32_si128(8);
507 : __m128i sum;
508 : (void)above;
509 : (void)bd;
510 :
511 0 : sum = dc_sum_16(left);
512 0 : sum = _mm_add_epi16(sum, round);
513 0 : sum = _mm_srli_epi16(sum, 4);
514 0 : dc_common_predictor_64xh(dst, stride, 16, sum);
515 0 : }
516 :
517 0 : void eb_aom_highbd_dc_left_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
518 : const uint16_t *above, const uint16_t *left, int32_t bd)
519 : {
520 0 : const __m128i round = _mm_cvtsi32_si128(16);
521 : __m128i sum;
522 : (void)above;
523 : (void)bd;
524 :
525 0 : sum = dc_sum_32(left);
526 0 : sum = _mm_add_epi32(sum, round);
527 0 : sum = _mm_srli_epi32(sum, 5);
528 0 : dc_common_predictor_64xh(dst, stride, 32, sum);
529 0 : }
530 :
531 0 : void eb_aom_highbd_dc_left_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
532 : const uint16_t *above, const uint16_t *left, int32_t bd)
533 : {
534 0 : const __m128i round = _mm_cvtsi32_si128(32);
535 : __m128i sum;
536 : (void)above;
537 : (void)bd;
538 :
539 0 : sum = dc_sum_64(left);
540 0 : sum = _mm_add_epi32(sum, round);
541 0 : sum = _mm_srli_epi32(sum, 6);
542 0 : dc_common_predictor_64xh(dst, stride, 64, sum);
543 0 : }
544 :
545 : // =============================================================================
546 :
547 : // DC_TOP_PRED
548 :
549 : // 16xN
550 :
551 0 : static INLINE void dc_top_predictor_16xh(uint16_t *const dst,
552 : const ptrdiff_t stride, const uint16_t *const above,
553 : const int32_t h, const int32_t bd)
554 : {
555 : (void)bd;
556 0 : const __m128i round = _mm_cvtsi32_si128(8);
557 : __m128i sum;
558 :
559 0 : sum = dc_sum_16(above);
560 0 : sum = _mm_add_epi16(sum, round);
561 0 : sum = _mm_srli_epi16(sum, 4);
562 0 : dc_common_predictor_16xh(dst, stride, h, sum);
563 0 : }
564 :
565 0 : void eb_aom_highbd_dc_top_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
566 : const uint16_t *above, const uint16_t *left, int32_t bd)
567 : {
568 : (void)left;
569 0 : dc_top_predictor_16xh(dst, stride, above, 4, bd);
570 0 : }
571 :
572 0 : void eb_aom_highbd_dc_top_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
573 : const uint16_t *above, const uint16_t *left, int32_t bd)
574 : {
575 : (void)left;
576 0 : dc_top_predictor_16xh(dst, stride, above, 8, bd);
577 0 : }
578 :
579 0 : void eb_aom_highbd_dc_top_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
580 : const uint16_t *above, const uint16_t *left, int32_t bd)
581 : {
582 : (void)left;
583 0 : dc_top_predictor_16xh(dst, stride, above, 16, bd);
584 0 : }
585 :
586 0 : void eb_aom_highbd_dc_top_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
587 : const uint16_t *above, const uint16_t *left, int32_t bd)
588 : {
589 : (void)left;
590 0 : dc_top_predictor_16xh(dst, stride, above, 32, bd);
591 0 : }
592 :
593 0 : void eb_aom_highbd_dc_top_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
594 : const uint16_t *above, const uint16_t *left, int32_t bd)
595 : {
596 : (void)left;
597 0 : dc_top_predictor_16xh(dst, stride, above, 64, bd);
598 0 : }
599 :
600 : // 32xN
601 :
602 0 : static INLINE void dc_top_predictor_32xh(uint16_t *const dst,
603 : const ptrdiff_t stride, const uint16_t *const above,
604 : const int32_t h, const int32_t bd)
605 : {
606 0 : const __m128i round = _mm_cvtsi32_si128(16);
607 : __m128i sum;
608 : (void) bd;
609 :
610 0 : sum = dc_sum_32(above);
611 0 : sum = _mm_add_epi32(sum, round);
612 0 : sum = _mm_srli_epi32(sum, 5);
613 0 : dc_common_predictor_32xh(dst, stride, h, sum);
614 0 : }
615 :
616 0 : void eb_aom_highbd_dc_top_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
617 : const uint16_t *above, const uint16_t *left, int32_t bd)
618 : {
619 : (void)left;
620 0 : dc_top_predictor_32xh(dst, stride, above, 8, bd);
621 0 : }
622 :
623 0 : void eb_aom_highbd_dc_top_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
624 : const uint16_t *above, const uint16_t *left, int32_t bd)
625 : {
626 : (void)left;
627 0 : dc_top_predictor_32xh(dst, stride, above, 16, bd);
628 0 : }
629 :
630 0 : void eb_aom_highbd_dc_top_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
631 : const uint16_t *above, const uint16_t *left, int32_t bd)
632 : {
633 : (void)left;
634 0 : dc_top_predictor_32xh(dst, stride, above, 32, bd);
635 0 : }
636 :
637 0 : void eb_aom_highbd_dc_top_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
638 : const uint16_t *above, const uint16_t *left, int32_t bd)
639 : {
640 : (void)left;
641 0 : dc_top_predictor_32xh(dst, stride, above, 64, bd);
642 0 : }
643 :
644 : // 64xN
645 :
646 0 : static INLINE void dc_top_predictor_64xh(uint16_t *const dst,
647 : const ptrdiff_t stride, const uint16_t *const above,
648 : const int32_t h, const int32_t bd)
649 : {
650 0 : const __m128i round = _mm_cvtsi32_si128(32);
651 : __m128i sum;
652 : (void) bd;
653 :
654 0 : sum = dc_sum_64(above);
655 0 : sum = _mm_add_epi32(sum, round);
656 0 : sum = _mm_srli_epi32(sum, 6);
657 0 : dc_common_predictor_64xh(dst, stride, h, sum);
658 0 : }
659 :
660 0 : void eb_aom_highbd_dc_top_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
661 : const uint16_t *above, const uint16_t *left, int32_t bd)
662 : {
663 : (void)left;
664 0 : dc_top_predictor_64xh(dst, stride, above, 16, bd);
665 0 : }
666 :
667 0 : void eb_aom_highbd_dc_top_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
668 : const uint16_t *above, const uint16_t *left, int32_t bd)
669 : {
670 : (void)left;
671 0 : dc_top_predictor_64xh(dst, stride, above, 32, bd);
672 0 : }
673 :
674 0 : void eb_aom_highbd_dc_top_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
675 : const uint16_t *above, const uint16_t *left, int32_t bd)
676 : {
677 : (void)left;
678 0 : dc_top_predictor_64xh(dst, stride, above, 64, bd);
679 0 : }
680 :
681 : // =============================================================================
682 :
683 : // DC_PRED
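// DC_PRED averages the above row plus the left column. Square sizes divide by a power
// of two with a SIMD add/shift; rectangular sizes move the sum to a scalar and divide
// by width + height with rounding (e.g. + 10 then / 20 for a 16x4 block). All entry
// points take (dst, stride, above, left, bd) with stride measured in uint16_t pixels.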
684 :
685 : // 16xN
686 :
687 0 : void eb_aom_highbd_dc_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
688 : const uint16_t *above, const uint16_t *left, int32_t bd) {
689 : (void)bd;
690 0 : __m128i sum = dc_sum_4_16(left, above);
691 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
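// 4 left + 16 above = 20 samples: add 20 / 2 before dividing to get a rounded average.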
692 0 : sum32 += 10;
693 0 : sum32 /= 20;
694 0 : const __m256i dc = _mm256_set1_epi16((int16_t)sum32);
695 :
696 0 : dc_common_predictor_16xh_kernel(dst, stride, 4, dc);
697 0 : }
698 :
699 0 : void eb_aom_highbd_dc_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
700 : const uint16_t *above, const uint16_t *left, int32_t bd) {
701 : (void)bd;
702 0 : __m128i sum = dc_sum_8_16(left, above);
703 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
704 0 : sum32 += 12;
705 0 : sum32 /= 24;
706 0 : const __m256i dc = _mm256_set1_epi16((int16_t)sum32);
707 :
708 0 : dc_common_predictor_16xh_kernel(dst, stride, 8, dc);
709 0 : }
710 :
711 0 : void eb_aom_highbd_dc_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
712 : const uint16_t *above, const uint16_t *left, int32_t bd) {
713 : (void)bd;
714 0 : __m128i sum = dc_sum_16_16(above, left);
715 0 : sum = _mm_add_epi32(sum, _mm_set1_epi32(16));
716 0 : sum = _mm_srli_epi32(sum, 5);
717 0 : dc_common_predictor_16xh(dst, stride, 16, sum);
718 0 : }
719 :
720 0 : void eb_aom_highbd_dc_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
721 : const uint16_t *above, const uint16_t *left, int32_t bd) {
722 : (void)bd;
723 0 : __m128i sum = dc_sum_16_32(above, left);
724 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
725 0 : sum32 += 24;
726 0 : sum32 /= 48;
727 0 : const __m256i dc = _mm256_set1_epi16((int16_t)sum32);
728 :
729 0 : dc_common_predictor_16xh_kernel(dst, stride, 32, dc);
730 0 : }
731 :
732 0 : void eb_aom_highbd_dc_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
733 : const uint16_t *above, const uint16_t *left, int32_t bd) {
734 : (void)bd;
735 0 : __m128i sum = dc_sum_16_64(above, left);
736 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
737 0 : sum32 += 40;
738 0 : sum32 /= 80;
739 0 : const __m256i dc = _mm256_set1_epi16((int16_t)sum32);
740 :
741 0 : dc_common_predictor_16xh_kernel(dst, stride, 64, dc);
742 0 : }
743 :
744 : // 32xN
745 :
746 0 : void eb_aom_highbd_dc_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
747 : const uint16_t *above, const uint16_t *left, int32_t bd) {
748 : (void)bd;
749 0 : __m128i sum = dc_sum_8_32(left, above);
750 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
751 0 : sum32 += 20;
752 0 : sum32 /= 40;
753 0 : const __m256i dc = _mm256_set1_epi16((int16_t)sum32);
754 :
755 0 : dc_common_predictor_32xh_kernel(dst, stride, 8, dc);
756 0 : }
757 :
758 0 : void eb_aom_highbd_dc_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
759 : const uint16_t *above, const uint16_t *left, int32_t bd) {
760 : (void)bd;
761 0 : __m128i sum = dc_sum_16_32(left, above);
762 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
763 0 : sum32 += 24;
764 0 : sum32 /= 48;
765 0 : const __m256i dc = _mm256_set1_epi16((int16_t)sum32);
766 :
767 0 : dc_common_predictor_32xh_kernel(dst, stride, 16, dc);
768 0 : }
769 :
770 0 : void eb_aom_highbd_dc_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
771 : const uint16_t *above, const uint16_t *left, int32_t bd) {
772 : (void)bd;
773 0 : __m128i sum = dc_sum_32_32(above, left);
774 0 : sum = _mm_add_epi32(sum, _mm_set1_epi32(32));
775 0 : sum = _mm_srli_epi32(sum, 6);
776 0 : dc_common_predictor_32xh(dst, stride, 32, sum);
777 0 : }
778 :
779 0 : void eb_aom_highbd_dc_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
780 : const uint16_t *above, const uint16_t *left, int32_t bd) {
781 : (void)bd;
782 0 : __m128i sum = dc_sum_32_64(above, left);
783 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
784 0 : sum32 += 48;
785 0 : sum32 /= 96;
786 0 : const __m256i dc = _mm256_set1_epi16((int16_t)sum32);
787 :
788 0 : dc_common_predictor_32xh_kernel(dst, stride, 64, dc);
789 0 : }
790 :
791 : // 64xN
792 :
793 0 : void eb_aom_highbd_dc_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
794 : const uint16_t *above, const uint16_t *left, int32_t bd) {
795 : (void)bd;
796 0 : __m128i sum = dc_sum_16_64(left, above);
797 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
798 0 : sum32 += 40;
799 0 : sum32 /= 80;
800 0 : const __m256i dc = _mm256_set1_epi16((int16_t)sum32);
801 :
802 0 : dc_common_predictor_64xh_kernel(dst, stride, 16, dc);
803 0 : }
804 :
805 0 : void eb_aom_highbd_dc_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
806 : const uint16_t *above, const uint16_t *left, int32_t bd) {
807 : (void)bd;
808 0 : __m128i sum = dc_sum_32_64(left, above);
809 0 : uint32_t sum32 = _mm_cvtsi128_si32(sum);
810 0 : sum32 += 48;
811 0 : sum32 /= 96;
812 0 : const __m256i dc = _mm256_set1_epi16((int16_t)sum32);
813 :
814 0 : dc_common_predictor_64xh_kernel(dst, stride, 32, dc);
815 0 : }
816 :
817 0 : void eb_aom_highbd_dc_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
818 : const uint16_t *above, const uint16_t *left, int32_t bd) {
819 : (void)bd;
820 0 : __m128i sum = dc_sum_64_64(above, left);
821 0 : sum = _mm_add_epi32(sum, _mm_set1_epi32(64));
822 0 : sum = _mm_srli_epi32(sum, 7);
823 0 : dc_common_predictor_64xh(dst, stride, 64, sum);
824 0 : }
825 :
826 : // =============================================================================
827 :
828 : // H_PRED
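// Each output row is filled with its left-column pixel. _mm256_broadcastw_epi16
// replicates the lowest word, so the callers shift 'left' right by 2 bytes per row.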
829 :
830 : // 16xN
831 :
832 0 : static INLINE void h_pred_16(uint16_t **const dst, const ptrdiff_t stride,
833 : const __m128i left)
834 : {
835 : // Broadcast the 16-bit left pixel to a 256-bit register.
836 0 : const __m256i row = _mm256_broadcastw_epi16(left);
837 :
838 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
839 0 : *dst += stride;
840 0 : }
841 :
842 : // Process 8 rows.
843 0 : static INLINE void h_pred_16x8(uint16_t **dst, const ptrdiff_t stride,
844 : const uint16_t *const left)
845 : {
846 : // dst and its stride must be 32-byte aligned.
847 : assert(!((intptr_t)*dst % 32));
848 : assert(!(stride % 32));
849 :
850 0 : const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
851 :
852 0 : h_pred_16(dst, stride, _mm_srli_si128(left_u16, 0));
853 0 : h_pred_16(dst, stride, _mm_srli_si128(left_u16, 2));
854 0 : h_pred_16(dst, stride, _mm_srli_si128(left_u16, 4));
855 0 : h_pred_16(dst, stride, _mm_srli_si128(left_u16, 6));
856 0 : h_pred_16(dst, stride, _mm_srli_si128(left_u16, 8));
857 0 : h_pred_16(dst, stride, _mm_srli_si128(left_u16, 10));
858 0 : h_pred_16(dst, stride, _mm_srli_si128(left_u16, 12));
859 0 : h_pred_16(dst, stride, _mm_srli_si128(left_u16, 14));
860 0 : }
861 :
862 : // 16x4
863 :
864 0 : void eb_aom_highbd_h_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
865 : const uint16_t *above, const uint16_t *left, int32_t bd)
866 : {
867 : (void)above;
868 : (void)bd;
869 :
870 : // dst and its stride must be 32-byte aligned.
871 : assert(!((intptr_t)dst % 32));
872 : assert(!(stride % 32));
873 :
874 0 : const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
875 :
876 0 : h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 0));
877 0 : h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 2));
878 0 : h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 4));
879 0 : h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 6));
880 0 : }
881 :
882 : // 16x64
883 :
884 0 : void eb_aom_highbd_h_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
885 : const uint16_t *above, const uint16_t *left, int32_t bd)
886 : {
887 : (void)above;
888 : (void)bd;
889 :
890 0 : for (int32_t i = 0; i < 8; i++, left += 8)
891 0 : h_pred_16x8(&dst, stride, left);
892 0 : }
893 :
894 : // -----------------------------------------------------------------------------
895 :
896 : // 32xN
897 :
898 0 : static INLINE void h_pred_32(uint16_t **const dst, const ptrdiff_t stride,
899 : const __m128i left)
900 : {
901 : // Broadcast the 16-bit left pixel to a 256-bit register.
902 0 : const __m256i row = _mm256_broadcastw_epi16(left);
903 :
904 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
905 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x10), row);
906 0 : *dst += stride;
907 0 : }
908 :
909 : // Process 8 rows.
910 0 : static INLINE void h_pred_32x8(uint16_t **dst, const ptrdiff_t stride,
911 : const uint16_t *const left)
912 : {
913 : // dst and its stride must be 32-byte aligned.
914 : assert(!((intptr_t)*dst % 32));
915 : assert(!(stride % 32));
916 :
917 0 : const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
918 :
919 0 : h_pred_32(dst, stride, _mm_srli_si128(left_u16, 0));
920 0 : h_pred_32(dst, stride, _mm_srli_si128(left_u16, 2));
921 0 : h_pred_32(dst, stride, _mm_srli_si128(left_u16, 4));
922 0 : h_pred_32(dst, stride, _mm_srli_si128(left_u16, 6));
923 0 : h_pred_32(dst, stride, _mm_srli_si128(left_u16, 8));
924 0 : h_pred_32(dst, stride, _mm_srli_si128(left_u16, 10));
925 0 : h_pred_32(dst, stride, _mm_srli_si128(left_u16, 12));
926 0 : h_pred_32(dst, stride, _mm_srli_si128(left_u16, 14));
927 0 : }
928 :
929 : // 32x8
930 :
931 0 : void eb_aom_highbd_h_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
932 : const uint16_t *above, const uint16_t *left, int32_t bd)
933 : {
934 : (void)above;
935 : (void)bd;
936 :
937 0 : h_pred_32x8(&dst, stride, left);
938 0 : }
939 :
940 : // 32x64
941 :
942 0 : void eb_aom_highbd_h_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
943 : const uint16_t *above, const uint16_t *left, int32_t bd)
944 : {
945 : (void)above;
946 : (void)bd;
947 :
948 0 : for (int32_t i = 0; i < 8; i++, left += 8)
949 0 : h_pred_32x8(&dst, stride, left);
950 0 : }
951 :
952 : // -----------------------------------------------------------------------------
953 :
954 : // 64xN
955 :
956 0 : static INLINE void h_pred_64(uint16_t **const dst, const ptrdiff_t stride,
957 : const __m128i left)
958 : {
959 : // Broadcast the 16-bit left pixel to a 256-bit register.
960 0 : const __m256i row = _mm256_broadcastw_epi16(left);
961 :
962 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
963 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x10), row);
964 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x20), row);
965 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x30), row);
966 0 : *dst += stride;
967 0 : }
968 :
969 : // Process 8 rows.
970 0 : static INLINE void h_pred_64x8(uint16_t **dst, const ptrdiff_t stride,
971 : const uint16_t *const left)
972 : {
973 : // dst and its stride must be 32-byte aligned.
974 : assert(!((intptr_t)*dst % 32));
975 : assert(!(stride % 32));
976 :
977 0 : const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
978 :
979 0 : h_pred_64(dst, stride, _mm_srli_si128(left_u16, 0));
980 0 : h_pred_64(dst, stride, _mm_srli_si128(left_u16, 2));
981 0 : h_pred_64(dst, stride, _mm_srli_si128(left_u16, 4));
982 0 : h_pred_64(dst, stride, _mm_srli_si128(left_u16, 6));
983 0 : h_pred_64(dst, stride, _mm_srli_si128(left_u16, 8));
984 0 : h_pred_64(dst, stride, _mm_srli_si128(left_u16, 10));
985 0 : h_pred_64(dst, stride, _mm_srli_si128(left_u16, 12));
986 0 : h_pred_64(dst, stride, _mm_srli_si128(left_u16, 14));
987 0 : }
988 :
989 : // 64x16
990 :
991 0 : void eb_aom_highbd_h_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
992 : const uint16_t *above, const uint16_t *left, int32_t bd)
993 : {
994 : (void)above;
995 : (void)bd;
996 :
997 0 : for (int32_t i = 0; i < 2; i++, left += 8)
998 0 : h_pred_64x8(&dst, stride, left);
999 0 : }
1000 :
1001 : // 64x32
1002 :
1003 0 : void eb_aom_highbd_h_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
1004 : const uint16_t *above, const uint16_t *left, int32_t bd)
1005 : {
1006 : (void)above;
1007 : (void)bd;
1008 :
1009 0 : for (int32_t i = 0; i < 4; i++, left += 8)
1010 0 : h_pred_64x8(&dst, stride, left);
1011 0 : }
1012 :
1013 : // 64x64
1014 :
1015 0 : void eb_aom_highbd_h_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
1016 : const uint16_t *above, const uint16_t *left, int32_t bd)
1017 : {
1018 : (void)above;
1019 : (void)bd;
1020 :
1021 0 : for (int32_t i = 0; i < 8; i++, left += 8)
1022 0 : h_pred_64x8(&dst, stride, left);
1023 0 : }
1024 :
1025 : // =============================================================================
1026 :
1027 : // V_PRED
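// Every output row is a copy of the 'above' row, so it is loaded once and stored h times.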
1028 :
1029 : // 16xN
1030 :
1031 0 : static INLINE void v_pred_16(uint16_t **const dst, const ptrdiff_t stride,
1032 : const __m256i above0)
1033 : {
1034 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
1035 0 : *dst += stride;
1036 0 : }
1037 :
1038 : // Process 8 rows.
1039 0 : static INLINE void v_pred_16x8(uint16_t **const dst, const ptrdiff_t stride,
1040 : const __m256i above)
1041 : {
1042 : // dst and its stride must be 32-byte aligned.
1043 : assert(!((intptr_t)*dst % 32));
1044 : assert(!(stride % 32));
1045 :
1046 0 : v_pred_16(dst, stride, above);
1047 0 : v_pred_16(dst, stride, above);
1048 0 : v_pred_16(dst, stride, above);
1049 0 : v_pred_16(dst, stride, above);
1050 0 : v_pred_16(dst, stride, above);
1051 0 : v_pred_16(dst, stride, above);
1052 0 : v_pred_16(dst, stride, above);
1053 0 : v_pred_16(dst, stride, above);
1054 0 : }
1055 :
1056 : // 16x4
1057 :
1058 0 : void eb_aom_highbd_v_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
1059 : const uint16_t *above, const uint16_t *left, int32_t bd)
1060 : {
1061 : // Load all 16 pixels in a row into a 256-bit register.
1062 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1063 :
1064 : (void)left;
1065 : (void)bd;
1066 :
1067 : // dst and its stride must be 32-byte aligned.
1068 : assert(!((intptr_t)dst % 32));
1069 : assert(!(stride % 32));
1070 :
1071 0 : v_pred_16(&dst, stride, above0);
1072 0 : v_pred_16(&dst, stride, above0);
1073 0 : v_pred_16(&dst, stride, above0);
1074 0 : v_pred_16(&dst, stride, above0);
1075 0 : }
1076 :
1077 : // 16x8
1078 :
1079 0 : void eb_aom_highbd_v_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
1080 : const uint16_t *above, const uint16_t *left, int32_t bd)
1081 : {
1082 : // Load all 16 pixels in a row into a 256-bit register.
1083 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1084 :
1085 : (void)left;
1086 : (void)bd;
1087 :
1088 : // dst and its stride must be 32-byte aligned.
1089 : assert(!((intptr_t)dst % 32));
1090 : assert(!(stride % 32));
1091 :
1092 0 : v_pred_16x8(&dst, stride, above0);
1093 0 : }
1094 :
1095 : // 16x16
1096 :
1097 0 : void eb_aom_highbd_v_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
1098 : const uint16_t *above, const uint16_t *left, int32_t bd)
1099 : {
1100 : // Load all 16 pixels in a row into a 256-bit register.
1101 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1102 :
1103 : (void)left;
1104 : (void)bd;
1105 :
1106 0 : for (int32_t i = 0; i < 2; i++)
1107 0 : v_pred_16x8(&dst, stride, above0);
1108 0 : }
1109 :
1110 : // 16x32
1111 :
1112 0 : void eb_aom_highbd_v_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
1113 : const uint16_t *above, const uint16_t *left, int32_t bd)
1114 : {
1115 : // Load all 16 pixels in a row into a 256-bit register.
1116 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1117 :
1118 : (void)left;
1119 : (void)bd;
1120 :
1121 0 : for (int32_t i = 0; i < 4; i++)
1122 0 : v_pred_16x8(&dst, stride, above0);
1123 0 : }
1124 :
1125 : // 16x64
1126 :
1127 0 : void eb_aom_highbd_v_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
1128 : const uint16_t *above, const uint16_t *left, int32_t bd)
1129 : {
1130 : // Load all 16 pixels in a row into a 256-bit register.
1131 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1132 :
1133 : (void)left;
1134 : (void)bd;
1135 :
1136 0 : for (int32_t i = 0; i < 8; i++)
1137 0 : v_pred_16x8(&dst, stride, above0);
1138 0 : }
1139 :
1140 : // -----------------------------------------------------------------------------
1141 :
1142 : // 32xN
1143 :
1144 0 : static INLINE void v_pred_32(uint16_t **const dst, const ptrdiff_t stride,
1145 : const __m256i above0, const __m256i above1)
1146 : {
1147 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
1148 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x10), above1);
1149 0 : *dst += stride;
1150 0 : }
1151 :
1152 : // Process 8 rows.
1153 0 : static INLINE void v_pred_32x8(uint16_t **const dst, const ptrdiff_t stride,
1154 : const __m256i above0, const __m256i above1)
1155 : {
1156 : // dst and its stride must be 32-byte aligned.
1157 : assert(!((intptr_t)*dst % 32));
1158 : assert(!(stride % 32));
1159 :
1160 0 : v_pred_32(dst, stride, above0, above1);
1161 0 : v_pred_32(dst, stride, above0, above1);
1162 0 : v_pred_32(dst, stride, above0, above1);
1163 0 : v_pred_32(dst, stride, above0, above1);
1164 0 : v_pred_32(dst, stride, above0, above1);
1165 0 : v_pred_32(dst, stride, above0, above1);
1166 0 : v_pred_32(dst, stride, above0, above1);
1167 0 : v_pred_32(dst, stride, above0, above1);
1168 0 : }
1169 :
1170 : // 32x8
1171 :
1172 0 : void eb_aom_highbd_v_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
1173 : const uint16_t *above, const uint16_t *left, int32_t bd)
1174 : {
1175 : // Load all 32 pixels in a row into 256-bit registers.
1176 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1177 0 : const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1178 :
1179 : (void)left;
1180 : (void)bd;
1181 :
1182 0 : v_pred_32x8(&dst, stride, above0, above1);
1183 0 : }
1184 :
1185 : // 32x16
1186 :
1187 0 : void eb_aom_highbd_v_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
1188 : const uint16_t *above, const uint16_t *left, int32_t bd)
1189 : {
1190 : // Load all 32 pixels in a row into 256-bit registers.
1191 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1192 0 : const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1193 :
1194 : (void)left;
1195 : (void)bd;
1196 :
1197 0 : for (int32_t i = 0; i < 2; i++)
1198 0 : v_pred_32x8(&dst, stride, above0, above1);
1199 0 : }
1200 :
1201 : // 32x32
1202 :
1203 0 : void eb_aom_highbd_v_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
1204 : const uint16_t *above, const uint16_t *left, int32_t bd)
1205 : {
1206 : // Load all 32 pixels in a row into 256-bit registers.
1207 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1208 0 : const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1209 :
1210 : (void)left;
1211 : (void)bd;
1212 :
1213 0 : for (int32_t i = 0; i < 4; i++)
1214 0 : v_pred_32x8(&dst, stride, above0, above1);
1215 0 : }
1216 :
1217 : // 32x64
1218 :
1219 0 : void eb_aom_highbd_v_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
1220 : const uint16_t *above, const uint16_t *left, int32_t bd)
1221 : {
1222 : // Load all 32 pixels in a row into 256-bit registers.
1223 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1224 0 : const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1225 :
1226 : (void)left;
1227 : (void)bd;
1228 :
1229 0 : for (int32_t i = 0; i < 8; i++)
1230 0 : v_pred_32x8(&dst, stride, above0, above1);
1231 0 : }
1232 :
1233 : // -----------------------------------------------------------------------------
1234 :
1235 : // 64xN
1236 :
1237 0 : static INLINE void v_pred_64(uint16_t **const dst, const ptrdiff_t stride,
1238 : const __m256i above0, const __m256i above1, const __m256i above2,
1239 : const __m256i above3)
1240 : {
1241 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
1242 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x10), above1);
1243 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x20), above2);
1244 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x30), above3);
1245 0 : *dst += stride;
1246 0 : }
1247 :
1248 : // Process 8 rows.
1249 0 : static INLINE void v_pred_64x8(uint16_t **const dst, const ptrdiff_t stride,
1250 : const __m256i above0, const __m256i above1, const __m256i above2,
1251 : const __m256i above3)
1252 : {
1253 : // dst and its stride must be 32-byte aligned.
1254 : assert(!((intptr_t)*dst % 32));
1255 : assert(!(stride % 32));
1256 :
1257 0 : v_pred_64(dst, stride, above0, above1, above2, above3);
1258 0 : v_pred_64(dst, stride, above0, above1, above2, above3);
1259 0 : v_pred_64(dst, stride, above0, above1, above2, above3);
1260 0 : v_pred_64(dst, stride, above0, above1, above2, above3);
1261 0 : v_pred_64(dst, stride, above0, above1, above2, above3);
1262 0 : v_pred_64(dst, stride, above0, above1, above2, above3);
1263 0 : v_pred_64(dst, stride, above0, above1, above2, above3);
1264 0 : v_pred_64(dst, stride, above0, above1, above2, above3);
1265 0 : }
1266 :
1267 : // 64x16
1268 :
1269 0 : void eb_aom_highbd_v_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
1270 : const uint16_t *above, const uint16_t *left, int32_t bd)
1271 : {
1272 : // Load all 64 pixels in a row into 256-bit registers.
1273 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1274 0 : const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1275 0 : const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
1276 0 : const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));
1277 :
1278 : (void)left;
1279 : (void)bd;
1280 :
1281 0 : for (int32_t i = 0; i < 2; i++)
1282 0 : v_pred_64x8(&dst, stride, above0, above1, above2, above3);
1283 0 : }
1284 :
1285 : // 64x32
1286 :
1287 0 : void eb_aom_highbd_v_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
1288 : const uint16_t *above, const uint16_t *left, int32_t bd)
1289 : {
1290 : // Load all 64 pixels in a row into 256-bit registers.
1291 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1292 0 : const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1293 0 : const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
1294 0 : const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));
1295 :
1296 : (void)left;
1297 : (void)bd;
1298 :
1299 0 : for (int32_t i = 0; i < 4; i++)
1300 0 : v_pred_64x8(&dst, stride, above0, above1, above2, above3);
1301 0 : }
1302 :
1303 : // 64x64
1304 :
1305 0 : void eb_aom_highbd_v_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
1306 : const uint16_t *above, const uint16_t *left, int32_t bd)
1307 : {
1308 : // Load all 64 pixels in a row into 256-bit registers.
1309 0 : const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1310 0 : const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1311 0 : const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
1312 0 : const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));
1313 :
1314 : (void)left;
1315 : (void)bd;
1316 :
1317 0 : for (int32_t i = 0; i < 8; i++)
1318 0 : v_pred_64x8(&dst, stride, above0, above1, above2, above3);
1319 0 : }
1320 :
1321 : // =============================================================================
1322 :
1323 : // Repeat for AVX2 optimizations.
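// Each 16-bit pair below is (w, 256 - w), so a single _mm256_madd_epi16 forms
// w * p0 + (256 - w) * p1; every group of four pairs is duplicated so both
// 128-bit lanes apply the same weights.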
1324 :
1325 : // bs = 4
1326 : EB_ALIGN(32) static const uint16_t sm_weights_d_4[16] = {
1327 : 255, 1, 149, 107, 85, 171, 64, 192, // 0 1 2 3
1328 : 255, 1, 149, 107, 85, 171, 64, 192 // 0 1 2 3
1329 : };
1330 :
1331 : // bs = 8
1332 : EB_ALIGN(32) static const uint16_t sm_weights_d_8[32] = {
1333 : 255, 1, 197, 59, 146, 110, 105, 151, // 0 1 2 3
1334 : 255, 1, 197, 59, 146, 110, 105, 151, // 0 1 2 3
1335 : 73, 183, 50, 206, 37, 219, 32, 224, // 4 5 6 7
1336 : 73, 183, 50, 206, 37, 219, 32, 224 // 4 5 6 7
1337 : };
1338 :
1339 : // bs = 16
1340 : EB_ALIGN(32) static const uint16_t sm_weights_d_16[64] = {
1341 : 255, 1, 225, 31, 196, 60, 170, 86, // 0 1 2 3
1342 : 255, 1, 225, 31, 196, 60, 170, 86, // 0 1 2 3
1343 : 145, 111, 123, 133, 102, 154, 84, 172, // 4 5 6 7
1344 : 145, 111, 123, 133, 102, 154, 84, 172, // 4 5 6 7
1345 : 68, 188, 54, 202, 43, 213, 33, 223, // 8 9 10 11
1346 : 68, 188, 54, 202, 43, 213, 33, 223, // 8 9 10 11
1347 : 26, 230, 20, 236, 17, 239, 16, 240, // 12 13 14 15
1348 : 26, 230, 20, 236, 17, 239, 16, 240 // 12 13 14 15
1349 : };
1350 :
1351 : // bs = 32
1352 : EB_ALIGN(32) static const uint16_t sm_weights_d_32[128] = {
1353 : 255, 1, 240, 16, 225, 31, 210, 46, // 0 1 2 3
1354 : 255, 1, 240, 16, 225, 31, 210, 46, // 0 1 2 3
1355 : 196, 60, 182, 74, 169, 87, 157, 99, // 4 5 6 7
1356 : 196, 60, 182, 74, 169, 87, 157, 99, // 4 5 6 7
1357 : 145, 111, 133, 123, 122, 134, 111, 145, // 8 9 10 11
1358 : 145, 111, 133, 123, 122, 134, 111, 145, // 8 9 10 11
1359 : 101, 155, 92, 164, 83, 173, 74, 182, // 12 13 14 15
1360 : 101, 155, 92, 164, 83, 173, 74, 182, // 12 13 14 15
1361 : 66, 190, 59, 197, 52, 204, 45, 211, // 16 17 18 19
1362 : 66, 190, 59, 197, 52, 204, 45, 211, // 16 17 18 19
1363 : 39, 217, 34, 222, 29, 227, 25, 231, // 20 21 22 23
1364 : 39, 217, 34, 222, 29, 227, 25, 231, // 20 21 22 23
1365 : 21, 235, 17, 239, 14, 242, 12, 244, // 24 25 26 27
1366 : 21, 235, 17, 239, 14, 242, 12, 244, // 24 25 26 27
1367 : 10, 246, 9, 247, 8, 248, 8, 248, // 28 29 30 31
1368 : 10, 246, 9, 247, 8, 248, 8, 248 // 28 29 30 31
1369 : };
1370 :
1371 : // bs = 64
1372 : EB_ALIGN(32) static const uint16_t sm_weights_d_64[256] = {
1373 : 255, 1, 248, 8, 240, 16, 233, 23, // 0 1 2 3
1374 : 255, 1, 248, 8, 240, 16, 233, 23, // 0 1 2 3
1375 : 225, 31, 218, 38, 210, 46, 203, 53, // 4 5 6 7
1376 : 225, 31, 218, 38, 210, 46, 203, 53, // 4 5 6 7
1377 : 196, 60, 189, 67, 182, 74, 176, 80, // 8 9 10 11
1378 : 196, 60, 189, 67, 182, 74, 176, 80, // 8 9 10 11
1379 : 169, 87, 163, 93, 156, 100, 150, 106, // 12 13 14 15
1380 : 169, 87, 163, 93, 156, 100, 150, 106, // 12 13 14 15
1381 : 144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
1382 : 144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
1383 : 121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
1384 : 121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
1385 : 101, 155, 96, 160, 91, 165, 86, 170, // 24 25 26 27
1386 : 101, 155, 96, 160, 91, 165, 86, 170, // 24 25 26 27
1387 : 82, 174, 77, 179, 73, 183, 69, 187, // 28 29 30 31
1388 : 82, 174, 77, 179, 73, 183, 69, 187, // 28 29 30 31
1389 : 65, 191, 61, 195, 57, 199, 54, 202, // 32 33 34 35
1390 : 65, 191, 61, 195, 57, 199, 54, 202, // 32 33 34 35
1391 : 50, 206, 47, 209, 44, 212, 41, 215, // 36 37 38 39
1392 : 50, 206, 47, 209, 44, 212, 41, 215, // 36 37 38 39
1393 : 38, 218, 35, 221, 32, 224, 29, 227, // 40 41 42 43
1394 : 38, 218, 35, 221, 32, 224, 29, 227, // 40 41 42 43
1395 : 27, 229, 25, 231, 22, 234, 20, 236, // 44 45 46 47
1396 : 27, 229, 25, 231, 22, 234, 20, 236, // 44 45 46 47
1397 : 18, 238, 16, 240, 15, 241, 13, 243, // 48 49 50 51
1398 : 18, 238, 16, 240, 15, 241, 13, 243, // 48 49 50 51
1399 : 12, 244, 10, 246, 9, 247, 8, 248, // 52 53 54 55
1400 : 12, 244, 10, 246, 9, 247, 8, 248, // 52 53 54 55
1401 : 7, 249, 6, 250, 6, 250, 5, 251, // 56 57 58 59
1402 : 7, 249, 6, 250, 6, 250, 5, 251, // 56 57 58 59
1403 : 5, 251, 4, 252, 4, 252, 4, 252, // 60 61 62 63
1404 : 5, 251, 4, 252, 4, 252, 4, 252 // 60 61 62 63
1405 : };
1406 :
1407 : // -----------------------------------------------------------------------------
1408 :
1409 : // Shuffle for AVX2 optimizations.
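// Same (w, 256 - w) pairs, but the four-pixel groups are reordered (0-3, 8-11, 4-7,
// 12-15, ...) so each 256-bit load splits them across lanes the way smooth_pred_kernel
// consumes them, letting _mm256_packs_epi32 emit the row in natural pixel order.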
1410 :
1411 : // bs = 16
1412 : EB_ALIGN(32) static const uint16_t sm_weights_16[32] = {
1413 : 255, 1, 225, 31, 196, 60, 170, 86, // 0 1 2 3
1414 : 68, 188, 54, 202, 43, 213, 33, 223, // 8 9 10 11
1415 : 145, 111, 123, 133, 102, 154, 84, 172, // 4 5 6 7
1416 : 26, 230, 20, 236, 17, 239, 16, 240 // 12 13 14 15
1417 : };
1418 :
1419 : // bs = 32
1420 : EB_ALIGN(32) static const uint16_t sm_weights_32[64] = {
1421 : 255, 1, 240, 16, 225, 31, 210, 46, // 0 1 2 3
1422 : 145, 111, 133, 123, 122, 134, 111, 145, // 8 9 10 11
1423 : 196, 60, 182, 74, 169, 87, 157, 99, // 4 5 6 7
1424 : 101, 155, 92, 164, 83, 173, 74, 182, // 12 13 14 15
1425 : 66, 190, 59, 197, 52, 204, 45, 211, // 16 17 18 19
1426 : 21, 235, 17, 239, 14, 242, 12, 244, // 24 25 26 27
1427 : 39, 217, 34, 222, 29, 227, 25, 231, // 20 21 22 23
1428 : 10, 246, 9, 247, 8, 248, 8, 248 // 28 29 30 31
1429 : };
1430 :
1431 : // bs = 64
1432 : EB_ALIGN(32) static const uint16_t sm_weights_64[128] = {
1433 : 255, 1, 248, 8, 240, 16, 233, 23, // 0 1 2 3
1434 : 196, 60, 189, 67, 182, 74, 176, 80, // 8 9 10 11
1435 : 225, 31, 218, 38, 210, 46, 203, 53, // 4 5 6 7
1436 : 169, 87, 163, 93, 156, 100, 150, 106, // 12 13 14 15
1437 : 144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
1438 : 101, 155, 96, 160, 91, 165, 86, 170, // 24 25 26 27
1439 : 121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
1440 : 82, 174, 77, 179, 73, 183, 69, 187, // 28 29 30 31
1441 : 65, 191, 61, 195, 57, 199, 54, 202, // 32 33 34 35
1442 : 38, 218, 35, 221, 32, 224, 29, 227, // 40 41 42 43
1443 : 50, 206, 47, 209, 44, 212, 41, 215, // 36 37 38 39
1444 : 27, 229, 25, 231, 22, 234, 20, 236, // 44 45 46 47
1445 : 18, 238, 16, 240, 15, 241, 13, 243, // 48 49 50 51
1446 : 7, 249, 6, 250, 6, 250, 5, 251, // 56 57 58 59
1447 : 12, 244, 10, 246, 9, 247, 8, 248, // 52 53 54 55
1448 : 5, 251, 4, 252, 4, 252, 4, 252 // 60 61 62 63
1449 : };
1450 :
1451 : // SMOOTH_PRED
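// AV1 smooth prediction: pred(r, c) = (w_v[r] * above[c] + (256 - w_v[r]) * left[h - 1]
// + w_h[c] * left[r] + (256 - w_h[c]) * above[w - 1] + 256) >> 9, with the weights
// taken from the tables above.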
1452 :
1453 : // 8xN
1454 :
1455 0 : static INLINE void load_right_weights_8(const uint16_t *const above,
1456 : __m256i *const r, __m256i *const weights)
1457 : {
1458 0 : *r = _mm256_set1_epi16((uint16_t)above[7]);
1459 :
1460 : // 0 1 2 3 0 1 2 3
1461 0 : weights[0] = _mm256_load_si256((const __m256i *)(sm_weights_d_8 + 0x00));
1462 : // 4 5 6 7 4 5 6 7
1463 0 : weights[1] = _mm256_load_si256((const __m256i *)(sm_weights_d_8 + 0x10));
1464 0 : }
1465 :
1466 0 : static INLINE __m256i load_left_4(const uint16_t *const left, const __m256i r) {
1467 0 : const __m128i l0 = _mm_loadl_epi64((const __m128i *)left);
1468 : // 0 1 2 3 x x x x 0 1 2 3 x x x x
1469 : const __m256i l =
1470 0 : _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l0, 1);
1471 0 : return _mm256_unpacklo_epi16(l, r); // 0 1 2 3 0 1 2 3
1472 : }
1473 :
1474 0 : static INLINE void load_left_8(const uint16_t *const left, const __m256i r,
1475 : __m256i *const lr)
1476 : {
1477 0 : const __m128i l0 = _mm_load_si128((const __m128i *)left);
1478 : // 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1479 : const __m256i l =
1480 0 : _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l0, 1);
1481 0 : lr[0] = _mm256_unpacklo_epi16(l, r); // 0 1 2 3 0 1 2 3
1482 0 : lr[1] = _mm256_unpackhi_epi16(l, r); // 4 5 6 7 4 5 6 7
1483 0 : }
1484 :
1485 0 : static INLINE void init_8(const uint16_t *const above,
1486 : const uint16_t *const left, const int32_t h, __m256i *const ab,
1487 : __m256i *const r, __m256i *const weights_w, __m256i *const rep)
1488 : {
1489 0 : const __m128i a0 = _mm_loadl_epi64(((const __m128i *)(above + 0)));
1490 0 : const __m128i a1 = _mm_loadl_epi64(((const __m128i *)(above + 4)));
1491 0 : const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
1492 : __m256i a[2];
1493 0 : a[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a0, 1);
1494 0 : a[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a1, 1);
1495 0 : ab[0] = _mm256_unpacklo_epi16(a[0], b);
1496 0 : ab[1] = _mm256_unpacklo_epi16(a[1], b);
1497 0 : load_right_weights_8(above, r, weights_w);
1498 :
1499 0 : const __m128i rep0 = _mm_set1_epi32(0x03020100);
1500 0 : const __m128i rep1 = _mm_set1_epi32(0x07060504);
1501 0 : const __m128i rep2 = _mm_set1_epi32(0x0B0A0908);
1502 0 : const __m128i rep3 = _mm_set1_epi32(0x0F0E0D0C);
1503 0 : rep[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep0), rep1, 1);
1504 0 : rep[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep2), rep3, 1);
1505 0 : }
1506 :
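 : // The rep mask selects a single (w_h, 256 - w_h) pair and the matching
 : // (left, right) pair within each 128-bit lane, so each _mm256_madd_epi16
 : // forms w * a + (256 - w) * b per pixel. The two weight pairs each sum to
 : // 256, so the combined weight is 512, hence the shift by
 : // sm_weight_log2_scale + 1.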
1507 0 : static INLINE __m256i smooth_pred_kernel(const __m256i *const weights_w,
1508 : const __m256i weights_h, const __m256i rep, const __m256i *const ab,
1509 : const __m256i lr)
1510 : {
1511 0 : const __m256i round = _mm256_set1_epi32((1 << sm_weight_log2_scale));
1512 : __m256i s[2], sum[2];
1513 : // 0 0 0 0 1 1 1 1
1514 0 : const __m256i w = _mm256_shuffle_epi8(weights_h, rep);
1515 0 : const __m256i t = _mm256_shuffle_epi8(lr, rep);
1516 0 : s[0] = _mm256_madd_epi16(ab[0], w);
1517 0 : s[1] = _mm256_madd_epi16(ab[1], w);
1518 : // width 8: 00 01 02 03 10 11 12 13
1519 : // width 16: 0 1 2 3 8 9 A B
1520 0 : sum[0] = _mm256_madd_epi16(t, weights_w[0]);
1521 : // width 8: 04 05 06 07 14 15 16 17
1522 : // width 16: 4 5 6 7 C D E F
1523 0 : sum[1] = _mm256_madd_epi16(t, weights_w[1]);
1524 0 : sum[0] = _mm256_add_epi32(sum[0], s[0]);
1525 0 : sum[1] = _mm256_add_epi32(sum[1], s[1]);
1526 0 : sum[0] = _mm256_add_epi32(sum[0], round);
1527 0 : sum[1] = _mm256_add_epi32(sum[1], round);
1528 0 : sum[0] = _mm256_srai_epi32(sum[0], 1 + sm_weight_log2_scale);
1529 0 : sum[1] = _mm256_srai_epi32(sum[1], 1 + sm_weight_log2_scale);
1530 : // width 8: 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
1531 : // width 16: 0 1 2 3 4 5 6 7 8 9 A B C D E F
1532 0 : return _mm256_packs_epi32(sum[0], sum[1]);
1533 : }
1534 :
1535 0 : static INLINE void smooth_pred_8x2(const __m256i *const weights_w,
1536 : const __m256i weights_h, const __m256i rep, const __m256i *const ab,
1537 : const __m256i lr, uint16_t **const dst, const ptrdiff_t stride)
1538 : {
1539 : // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
1540 0 : const __m256i d = smooth_pred_kernel(weights_w, weights_h, rep, ab, lr);
1541 0 : _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
1542 0 : *dst += stride;
1543 0 : _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
1544 0 : *dst += stride;
1545 0 : }
1546 :
1547 0 : static INLINE void smooth_pred_8x4(const __m256i *const weights_w,
1548 : const uint16_t *const sm_weights_h, const __m256i *const rep,
1549 : const __m256i *const ab, const __m256i lr, uint16_t **const dst,
1550 : const ptrdiff_t stride)
1551 : {
1552 0 : const __m256i weights_h = _mm256_load_si256((const __m256i *)sm_weights_h);
1553 0 : smooth_pred_8x2(weights_w, weights_h, rep[0], ab, lr, dst, stride);
1554 0 : smooth_pred_8x2(weights_w, weights_h, rep[1], ab, lr, dst, stride);
1555 0 : }
1556 :
1557 0 : static INLINE void smooth_pred_8x8(const uint16_t *const left,
1558 : const __m256i *const weights_w, const uint16_t *const sm_weights_h,
1559 : const __m256i *const rep, const __m256i *const ab, const __m256i r,
1560 : uint16_t **const dst, const ptrdiff_t stride)
1561 : {
1562 : __m256i lr[2];
1563 0 : load_left_8(left, r, lr);
1564 :
1565 0 : smooth_pred_8x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
1566 0 : smooth_pred_8x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
1567 0 : }
1568 :
1569 : // 8x4
1570 :
1571 0 : void eb_aom_highbd_smooth_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
1572 : const uint16_t *above, const uint16_t *left, int32_t bd)
1573 : {
1574 : __m256i ab[2], r, lr, weights_w[2], rep[2];
1575 : (void)bd;
1576 :
1577 0 : init_8(above, left, 4, ab, &r, weights_w, rep);
1578 0 : lr = load_left_4(left, r);
1579 0 : smooth_pred_8x4(weights_w, sm_weights_d_4, rep, ab, lr, &dst, stride);
1580 0 : }
1581 :
1582 : // 8x8
1583 :
1584 0 : void eb_aom_highbd_smooth_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
1585 : const uint16_t *above, const uint16_t *left, int32_t bd)
1586 : {
1587 : __m256i ab[2], r, weights_w[2], rep[2];
1588 : (void)bd;
1589 :
1590 0 : init_8(above, left, 8, ab, &r, weights_w, rep);
1591 :
1592 0 : smooth_pred_8x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
1593 0 : }
1594 :
1595 : // 8x16
1596 :
1597 0 : void eb_aom_highbd_smooth_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
1598 : const uint16_t *above, const uint16_t *left, int32_t bd)
1599 : {
1600 : __m256i ab[2], r, weights_w[2], rep[2];
1601 : (void)bd;
1602 :
1603 0 : init_8(above, left, 16, ab, &r, weights_w, rep);
1604 :
1605 0 : for (int32_t i = 0; i < 2; i++) {
1606 0 : smooth_pred_8x8(left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep,
1607 : ab, r, &dst, stride);
1608 : }
1609 0 : }
1610 :
1611 : // 8x32
1612 :
1613 0 : void eb_aom_highbd_smooth_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
1614 : const uint16_t *above, const uint16_t *left, int32_t bd)
1615 : {
1616 : __m256i ab[2], r, weights_w[2], rep[2];
1617 : (void)bd;
1618 :
1619 0 : init_8(above, left, 32, ab, &r, weights_w, rep);
1620 :
1621 0 : for (int32_t i = 0; i < 4; i++) {
1622 0 : smooth_pred_8x8(left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep,
1623 : ab, r, &dst, stride);
1624 : }
1625 0 : }
1626 :
1627 : // -----------------------------------------------------------------------------
1628 : // 16xN
1629 :
1630 0 : static INLINE void load_right_weights_16(const uint16_t *const above,
1631 : __m256i *const r, __m256i *const weights)
1632 : {
1633 0 : *r = _mm256_set1_epi16((uint16_t)above[15]);
1634 :
1635 : // 0 1 2 3 8 9 10 11
1636 0 : weights[0] = _mm256_load_si256((const __m256i *)(sm_weights_16 + 0x00));
1637 : // 4 5 6 7 12 13 14 15
1638 0 : weights[1] = _mm256_load_si256((const __m256i *)(sm_weights_16 + 0x10));
1639 0 : }
1640 :
1641 0 : static INLINE void prepare_ab(const uint16_t *const above, const __m256i b,
1642 : __m256i *const ab)
1643 : {
1644 0 : const __m256i a = _mm256_loadu_si256((const __m256i *)above);
1645 0 : ab[0] = _mm256_unpacklo_epi16(a, b);
1646 0 : ab[1] = _mm256_unpackhi_epi16(a, b);
1647 0 : }
1648 :
1649 0 : static INLINE void init_16(const uint16_t *const above,
1650 : const uint16_t *const left, const int32_t h, __m256i *const ab,
1651 : __m256i *const r, __m256i *const weights_w, __m256i *const rep)
1652 : {
1653 0 : const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
1654 0 : prepare_ab(above, b, ab);
1655 0 : load_right_weights_16(above, r, weights_w);
1656 :
1657 0 : rep[0] = _mm256_set1_epi32(0x03020100);
1658 0 : rep[1] = _mm256_set1_epi32(0x07060504);
1659 0 : rep[2] = _mm256_set1_epi32(0x0B0A0908);
1660 0 : rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
1661 0 : }
1662 :
1663 0 : static INLINE void smooth_pred_16(const __m256i *const weights_w,
1664 : const __m256i weights_h, const __m256i rep, const __m256i *const ab,
1665 : const __m256i lr, uint16_t **const dst, const ptrdiff_t stride)
1666 : {
1667 0 : const __m256i d = smooth_pred_kernel(weights_w, weights_h, rep, ab, lr);
1668 0 : _mm256_storeu_si256((__m256i *)*dst, d);
1669 0 : *dst += stride;
1670 0 : }
1671 :
1672 0 : static INLINE void smooth_pred_16x4(const __m256i *const weights_w,
1673 : const uint16_t *const sm_weights_h, const __m256i *const rep,
1674 : const __m256i *const ab, const __m256i lr, uint16_t **const dst,
1675 : const ptrdiff_t stride)
1676 : {
1677 0 : const __m256i weights_h = _mm256_load_si256((const __m256i *)sm_weights_h);
1678 0 : smooth_pred_16(weights_w, weights_h, rep[0], ab, lr, dst, stride);
1679 0 : smooth_pred_16(weights_w, weights_h, rep[1], ab, lr, dst, stride);
1680 0 : smooth_pred_16(weights_w, weights_h, rep[2], ab, lr, dst, stride);
1681 0 : smooth_pred_16(weights_w, weights_h, rep[3], ab, lr, dst, stride);
1682 0 : }
1683 :
1684 0 : static INLINE void smooth_pred_16x8(const uint16_t *const left,
1685 : const __m256i *const weights_w, const uint16_t *const sm_weights_h,
1686 : const __m256i *const rep, const __m256i *const ab, const __m256i r,
1687 : uint16_t **const dst, const ptrdiff_t stride)
1688 : {
1689 : __m256i lr[2];
1690 0 : load_left_8(left, r, lr);
1691 :
1692 0 : smooth_pred_16x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
1693 0 : smooth_pred_16x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
1694 0 : }
1695 :
1696 : // 16x4
1697 :
1698 0 : void eb_aom_highbd_smooth_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
1699 : const uint16_t *above, const uint16_t *left, int32_t bd)
1700 : {
1701 : __m256i ab[2], r, lr, weights_w[2], rep[4];
1702 : (void)bd;
1703 :
1704 0 : init_16(above, left, 4, ab, &r, weights_w, rep);
1705 0 : lr = load_left_4(left, r);
1706 0 : smooth_pred_16x4(weights_w, sm_weights_d_4, rep, ab, lr, &dst, stride);
1707 0 : }
1708 :
1709 : // 16x8
1710 :
1711 0 : void eb_aom_highbd_smooth_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
1712 : const uint16_t *above, const uint16_t *left, int32_t bd)
1713 : {
1714 : __m256i ab[2], r, weights_w[2], rep[4];
1715 : (void)bd;
1716 :
1717 0 : init_16(above, left, 8, ab, &r, weights_w, rep);
1718 :
1719 0 : smooth_pred_16x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
1720 0 : }
1721 :
1722 : // 16x16
1723 :
1724 0 : void eb_aom_highbd_smooth_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
1725 : const uint16_t *above, const uint16_t *left, int32_t bd)
1726 : {
1727 : __m256i ab[2], r, weights_w[2], rep[4];
1728 : (void)bd;
1729 :
1730 0 : init_16(above, left, 16, ab, &r, weights_w, rep);
1731 :
1732 0 : for (int32_t i = 0; i < 2; i++) {
1733 0 : smooth_pred_16x8(left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep,
1734 : ab, r, &dst, stride);
1735 : }
1736 0 : }
1737 :
1738 : // 16x32
1739 :
1740 0 : void eb_aom_highbd_smooth_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
1741 : const uint16_t *above, const uint16_t *left, int32_t bd)
1742 : {
1743 : __m256i ab[2], r, weights_w[2], rep[4];
1744 : (void)bd;
1745 :
1746 0 : init_16(above, left, 32, ab, &r, weights_w, rep);
1747 :
1748 0 : for (int32_t i = 0; i < 4; i++) {
1749 0 : smooth_pred_16x8(left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep,
1750 : ab, r, &dst, stride);
1751 : }
1752 0 : }
1753 :
1754 : // 16x64
1755 :
1756 0 : void eb_aom_highbd_smooth_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
1757 : const uint16_t *above, const uint16_t *left, int32_t bd)
1758 : {
1759 : __m256i ab[2], r, weights_w[2], rep[4];
1760 : (void)bd;
1761 :
1762 0 : init_16(above, left, 64, ab, &r, weights_w, rep);
1763 :
1764 0 : for (int32_t i = 0; i < 8; i++) {
1765 0 : smooth_pred_16x8(left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep,
1766 : ab, r, &dst, stride);
1767 : }
1768 0 : }
1769 :
1770 : // -----------------------------------------------------------------------------
1771 : // 32xN
1772 :
1773 0 : static INLINE void load_right_weights_32(const uint16_t *const above,
1774 : __m256i *const r, __m256i *const weights)
1775 : {
1776 0 : *r = _mm256_set1_epi16((uint16_t)above[31]);
1777 :
1778 : // 0 1 2 3 8 9 10 11
1779 0 : weights[0] = _mm256_load_si256((const __m256i *)(sm_weights_32 + 0x00));
1780 : // 4 5 6 7 12 13 14 15
1781 0 : weights[1] = _mm256_load_si256((const __m256i *)(sm_weights_32 + 0x10));
1782 : // 16 17 18 19 24 25 26 27
1783 0 : weights[2] = _mm256_load_si256((const __m256i *)(sm_weights_32 + 0x20));
1784 : // 20 21 22 23 28 29 30 31
1785 0 : weights[3] = _mm256_load_si256((const __m256i *)(sm_weights_32 + 0x30));
1786 0 : }
1787 :
1788 0 : static INLINE void init_32(const uint16_t *const above,
1789 : const uint16_t *const left, const int32_t h, __m256i *const ab,
1790 : __m256i *const r, __m256i *const weights_w, __m256i *const rep)
1791 : {
1792 0 : const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
1793 0 : prepare_ab(above + 0x00, b, ab + 0);
1794 0 : prepare_ab(above + 0x10, b, ab + 2);
1795 0 : load_right_weights_32(above, r, weights_w);
1796 :
1797 0 : rep[0] = _mm256_set1_epi32(0x03020100);
1798 0 : rep[1] = _mm256_set1_epi32(0x07060504);
1799 0 : rep[2] = _mm256_set1_epi32(0x0B0A0908);
1800 0 : rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
1801 0 : }
1802 :
1803 0 : static INLINE void smooth_pred_32(const __m256i *const weights_w,
1804 : const __m256i weights_h, const __m256i rep, const __m256i *const ab,
1805 : const __m256i lr, uint16_t **const dst, const ptrdiff_t stride)
1806 : {
1807 : __m256i d;
1808 :
1809 : // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1810 0 : d = smooth_pred_kernel(weights_w + 0, weights_h, rep, ab + 0, lr);
1811 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
1812 :
1813 : // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1814 0 : d = smooth_pred_kernel(weights_w + 2, weights_h, rep, ab + 2, lr);
1815 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
1816 0 : *dst += stride;
1817 0 : }
1818 :
1819 0 : static INLINE void smooth_pred_32x4(const __m256i *const weights_w,
1820 : const uint16_t *const sm_weights_h, const __m256i *const rep,
1821 : const __m256i *const ab, const __m256i lr, uint16_t **const dst,
1822 : const ptrdiff_t stride)
1823 : {
1824 0 : const __m256i weights_h = _mm256_load_si256((const __m256i *)sm_weights_h);
1825 0 : smooth_pred_32(weights_w, weights_h, rep[0], ab, lr, dst, stride);
1826 0 : smooth_pred_32(weights_w, weights_h, rep[1], ab, lr, dst, stride);
1827 0 : smooth_pred_32(weights_w, weights_h, rep[2], ab, lr, dst, stride);
1828 0 : smooth_pred_32(weights_w, weights_h, rep[3], ab, lr, dst, stride);
1829 0 : }
1830 :
1831 0 : static INLINE void smooth_pred_32x8(const uint16_t *const left,
1832 : const __m256i *const weights_w, const uint16_t *const sm_weights_h,
1833 : const __m256i *const rep, const __m256i *const ab, const __m256i r,
1834 : uint16_t **const dst, const ptrdiff_t stride)
1835 : {
1836 : __m256i lr[2];
1837 0 : load_left_8(left, r, lr);
1838 :
1839 0 : smooth_pred_32x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
1840 0 : smooth_pred_32x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
1841 0 : }
1842 :
1843 : // 32x8
1844 :
1845 0 : void eb_aom_highbd_smooth_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
1846 : const uint16_t *above, const uint16_t *left, int32_t bd)
1847 : {
1848 : __m256i ab[4], r, weights_w[4], rep[4];
1849 : (void)bd;
1850 :
1851 0 : init_32(above, left, 8, ab, &r, weights_w, rep);
1852 :
1853 0 : smooth_pred_32x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
1854 0 : }
1855 :
1856 : // 32x16
1857 :
1858 0 : void eb_aom_highbd_smooth_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
1859 : const uint16_t *above, const uint16_t *left, int32_t bd)
1860 : {
1861 : __m256i ab[4], r, weights_w[4], rep[4];
1862 : (void)bd;
1863 :
1864 0 : init_32(above, left, 16, ab, &r, weights_w, rep);
1865 :
1866 0 : for (int32_t i = 0; i < 2; i++) {
1867 0 : smooth_pred_32x8(left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep,
1868 : ab, r, &dst, stride);
1869 : }
1870 0 : }
1871 :
1872 : // 32x32
1873 :
1874 0 : void eb_aom_highbd_smooth_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
1875 : const uint16_t *above, const uint16_t *left, int32_t bd)
1876 : {
1877 : __m256i ab[4], r, weights_w[4], rep[4];
1878 : (void)bd;
1879 :
1880 0 : init_32(above, left, 32, ab, &r, weights_w, rep);
1881 :
1882 0 : for (int32_t i = 0; i < 4; i++) {
1883 0 : smooth_pred_32x8(left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep,
1884 : ab, r, &dst, stride);
1885 : }
1886 0 : }
1887 :
1888 : // 32x64
1889 :
1890 0 : void eb_aom_highbd_smooth_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
1891 : const uint16_t *above, const uint16_t *left, int32_t bd)
1892 : {
1893 : __m256i ab[4], r, weights_w[4], rep[4];
1894 : (void)bd;
1895 :
1896 0 : init_32(above, left, 64, ab, &r, weights_w, rep);
1897 :
1898 0 : for (int32_t i = 0; i < 8; i++) {
1899 0 : smooth_pred_32x8(left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep,
1900 : ab, r, &dst, stride);
1901 : }
1902 0 : }
1903 :
1904 : // -----------------------------------------------------------------------------
1905 : // 64xN
1906 :
1907 0 : static INLINE void load_right_weights_64(const uint16_t *const above,
1908 : __m256i *const r, __m256i *const weights)
1909 : {
1910 0 : *r = _mm256_set1_epi16((uint16_t)above[63]);
1911 :
1912 : // 0 1 2 3 8 9 10 11
1913 0 : weights[0] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x00));
1914 : // 4 5 6 7 12 13 14 15
1915 0 : weights[1] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x10));
1916 : // 16 17 18 19 24 25 26 27
1917 0 : weights[2] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x20));
1918 : // 20 21 22 23 28 29 30 31
1919 0 : weights[3] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x30));
1920 : // 32 33 34 35 40 41 42 43
1921 0 : weights[4] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x40));
1922 : // 36 37 38 39 44 45 46 47
1923 0 : weights[5] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x50));
1924 : // 48 49 50 51 56 57 58 59
1925 0 : weights[6] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x60));
1926 : // 52 53 54 55 60 61 62 63
1927 0 : weights[7] = _mm256_load_si256((const __m256i *)(sm_weights_64 + 0x70));
1928 0 : }
1929 :
1930 0 : static INLINE void init_64(const uint16_t *const above,
1931 : const uint16_t *const left, const int32_t h, __m256i *const ab,
1932 : __m256i *const r, __m256i *const weights_w, __m256i *const rep)
1933 : {
1934 0 : const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
1935 0 : prepare_ab(above + 0x00, b, ab + 0);
1936 0 : prepare_ab(above + 0x10, b, ab + 2);
1937 0 : prepare_ab(above + 0x20, b, ab + 4);
1938 0 : prepare_ab(above + 0x30, b, ab + 6);
1939 0 : load_right_weights_64(above, r, weights_w);
1940 :
1941 0 : rep[0] = _mm256_set1_epi32(0x03020100);
1942 0 : rep[1] = _mm256_set1_epi32(0x07060504);
1943 0 : rep[2] = _mm256_set1_epi32(0x0B0A0908);
1944 0 : rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
1945 0 : }
1946 :
1947 0 : static INLINE void smooth_pred_64(const __m256i *const weights_w,
1948 : const __m256i weights_h, const __m256i rep, const __m256i *const ab,
1949 : const __m256i lr, uint16_t **const dst, const ptrdiff_t stride)
1950 : {
1951 : __m256i d;
1952 :
1953 : // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1954 0 : d = smooth_pred_kernel(weights_w + 0, weights_h, rep, ab + 0, lr);
1955 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
1956 :
1957 : // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1958 0 : d = smooth_pred_kernel(weights_w + 2, weights_h, rep, ab + 2, lr);
1959 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
1960 :
1961 : // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1962 0 : d = smooth_pred_kernel(weights_w + 4, weights_h, rep, ab + 4, lr);
1963 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);
1964 :
1965 : // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1966 0 : d = smooth_pred_kernel(weights_w + 6, weights_h, rep, ab + 6, lr);
1967 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
1968 0 : *dst += stride;
1969 0 : }
1970 :
1971 0 : static INLINE void smooth_pred_64x4(const __m256i *const weights_w,
1972 : const uint16_t *const sm_weights_h, const __m256i *const rep,
1973 : const __m256i *const ab, const __m256i lr, uint16_t **const dst,
1974 : const ptrdiff_t stride)
1975 : {
1976 0 : const __m256i weights_h = _mm256_load_si256((const __m256i *)sm_weights_h);
1977 0 : smooth_pred_64(weights_w, weights_h, rep[0], ab, lr, dst, stride);
1978 0 : smooth_pred_64(weights_w, weights_h, rep[1], ab, lr, dst, stride);
1979 0 : smooth_pred_64(weights_w, weights_h, rep[2], ab, lr, dst, stride);
1980 0 : smooth_pred_64(weights_w, weights_h, rep[3], ab, lr, dst, stride);
1981 0 : }
1982 :
1983 0 : static INLINE void smooth_pred_64x8(const uint16_t *const left,
1984 : const __m256i *const weights_w, const uint16_t *const sm_weights_h,
1985 : const __m256i *const rep, const __m256i *const ab, const __m256i r,
1986 : uint16_t **const dst, const ptrdiff_t stride)
1987 : {
1988 : __m256i lr[2];
1989 0 : load_left_8(left, r, lr);
1990 :
1991 0 : smooth_pred_64x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
1992 0 : smooth_pred_64x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
1993 0 : }
1994 :
1995 : // 64x16
1996 :
1997 0 : void eb_aom_highbd_smooth_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
1998 : const uint16_t *above, const uint16_t *left, int32_t bd)
1999 : {
2000 : __m256i ab[8], r, weights_w[8], rep[4];
2001 : (void)bd;
2002 :
2003 0 : init_64(above, left, 16, ab, &r, weights_w, rep);
2004 :
2005 0 : for (int32_t i = 0; i < 2; i++) {
2006 0 : smooth_pred_64x8(left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep,
2007 : ab, r, &dst, stride);
2008 : }
2009 0 : }
2010 :
2011 : // 64x32
2012 :
2013 0 : void eb_aom_highbd_smooth_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
2014 : const uint16_t *above, const uint16_t *left, int32_t bd)
2015 : {
2016 : __m256i ab[8], r, weights_w[8], rep[4];
2017 : (void)bd;
2018 :
2019 0 : init_64(above, left, 32, ab, &r, weights_w, rep);
2020 :
2021 0 : for (int32_t i = 0; i < 4; i++) {
2022 0 : smooth_pred_64x8(left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep,
2023 : ab, r, &dst, stride);
2024 : }
2025 0 : }
2026 :
2027 : // 64x64
2028 :
2029 0 : void eb_aom_highbd_smooth_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
2030 : const uint16_t *above, const uint16_t *left, int32_t bd)
2031 : {
2032 : __m256i ab[8], r, weights_w[8], rep[4];
2033 : (void)bd;
2034 :
2035 0 : init_64(above, left, 64, ab, &r, weights_w, rep);
2036 :
2037 0 : for (int32_t i = 0; i < 8; i++) {
2038 0 : smooth_pred_64x8(left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep,
2039 : ab, r, &dst, stride);
2040 : }
2041 0 : }
2042 :
2043 : // =============================================================================
2044 :
2045 : // SMOOTH_H_PRED
2046 :
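 : // SMOOTH_H blends only horizontally (scalar sketch, illustrative only):
 : //
 : //   right = above[bw - 1];
 : //   pred[r][c] = (left[r] * w[c] + right * (256 - w[c])
 : //                 + (1 << (sm_weight_log2_scale - 1))) >> sm_weight_log2_scale;
 : //
 : // The kernels below apply this with one _mm256_madd_epi16 on interleaved
 : // (left, right) pairs against the (w, 256 - w) weight pairs.
 :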
2047 : // 8xN
2048 :
2049 0 : static INLINE __m256i smooth_h_pred_kernel(const __m256i *const weights,
2050 : const __m256i lr)
2051 : {
2052 0 : const __m256i round = _mm256_set1_epi32((1 << (sm_weight_log2_scale - 1)));
2053 : __m256i sum[2];
2054 : // width 8: 00 01 02 03 10 11 12 13
2055 : // width 16: 0 1 2 3 8 9 A B
2056 0 : sum[0] = _mm256_madd_epi16(lr, weights[0]);
2057 : // width 8: 04 05 06 07 14 15 16 17
2058 : // width 16: 4 5 6 7 C D E F
2059 0 : sum[1] = _mm256_madd_epi16(lr, weights[1]);
2060 0 : sum[0] = _mm256_add_epi32(sum[0], round);
2061 0 : sum[1] = _mm256_add_epi32(sum[1], round);
2062 0 : sum[0] = _mm256_srai_epi32(sum[0], sm_weight_log2_scale);
2063 0 : sum[1] = _mm256_srai_epi32(sum[1], sm_weight_log2_scale);
2064 : // width 8: 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
2065 : // width 16: 0 1 2 3 4 5 6 7 8 9 A B C D E F
2066 0 : return _mm256_packs_epi32(sum[0], sum[1]);
2067 : }
2068 :
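 : // The two 128-bit lanes of lr are offset by one row, so each call emits two
 : // consecutive rows of output.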
2069 0 : static INLINE void smooth_h_pred_8x2(const __m256i *const weights,
2070 : __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
2071 : {
2072 0 : const __m256i rep = _mm256_set1_epi32(0x03020100);
2073 : // lr: 0 1 2 3 1 2 3 4
2074 0 : const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0 1 1 1 1
2075 : // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
2076 0 : const __m256i d = smooth_h_pred_kernel(weights, t);
2077 0 : _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
2078 0 : *dst += stride;
2079 0 : _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
2080 0 : *dst += stride;
2081 0 : *lr = _mm256_srli_si256(*lr, 8); // 2 3 x x 3 4 x x
2082 0 : }
2083 :
2084 0 : static INLINE void smooth_h_pred_8x4(const __m256i *const weights,
2085 : __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
2086 : {
2087 0 : smooth_h_pred_8x2(weights, lr, dst, stride);
2088 0 : smooth_h_pred_8x2(weights, lr, dst, stride);
2089 0 : }
2090 :
2091 0 : static INLINE void smooth_h_pred_8x8(const uint16_t *const left,
2092 : const __m256i r, const __m256i *const weights, uint16_t **const dst,
2093 : const ptrdiff_t stride)
2094 : {
2095 0 : const __m128i l0 = _mm_load_si128((const __m128i *)left);
2096 0 : const __m128i l1 = _mm_srli_si128(l0, 2);
2097 : // 0 1 2 3 4 5 6 7 1 2 3 4 5 6 7 x
2098 : const __m256i l =
2099 0 : _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l1, 1);
2100 : __m256i lr[2];
2101 0 : lr[0] = _mm256_unpacklo_epi16(l, r); // 0 1 2 3 1 2 3 4
2102 0 : lr[1] = _mm256_unpackhi_epi16(l, r); // 4 5 6 7 5 6 7 x
2103 0 : smooth_h_pred_8x4(weights, &lr[0], dst, stride);
2104 0 : smooth_h_pred_8x4(weights, &lr[1], dst, stride);
2105 0 : }
2106 :
2107 : // 8x4
2108 :
2109 0 : void eb_aom_highbd_smooth_h_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
2110 : const uint16_t *above, const uint16_t *left, int32_t bd)
2111 : {
2112 0 : const __m128i l0 = _mm_loadl_epi64((const __m128i *)left);
2113 0 : const __m128i l1 = _mm_srli_si128(l0, 2);
2114 : // 0 1 2 3 x x x x 1 2 3 4 x x x x
2115 : const __m256i l =
2116 0 : _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l1, 1);
2117 : __m256i r, weights[2];
2118 : (void)bd;
2119 :
2120 0 : load_right_weights_8(above, &r, weights);
2121 0 : __m256i lr = _mm256_unpacklo_epi16(l, r); // 0 1 2 3 1 2 3 4
2122 0 : smooth_h_pred_8x4(weights, &lr, &dst, stride);
2123 0 : }
2124 :
2125 : // 8x8
2126 :
2127 0 : void eb_aom_highbd_smooth_h_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
2128 : const uint16_t *above, const uint16_t *left, int32_t bd)
2129 : {
2130 : __m256i r, weights[2];
2131 : (void)bd;
2132 :
2133 0 : load_right_weights_8(above, &r, weights);
2134 0 : smooth_h_pred_8x8(left, r, weights, &dst, stride);
2135 0 : }
2136 :
2137 : // 8x16
2138 :
2139 0 : void eb_aom_highbd_smooth_h_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
2140 : const uint16_t *above, const uint16_t *left, int32_t bd)
2141 : {
2142 : __m256i r, weights[2];
2143 : (void)bd;
2144 :
2145 0 : load_right_weights_8(above, &r, weights);
2146 0 : smooth_h_pred_8x8(left + 0, r, weights, &dst, stride);
2147 0 : smooth_h_pred_8x8(left + 8, r, weights, &dst, stride);
2148 0 : }
2149 :
2150 : // 8x32
2151 :
2152 0 : void eb_aom_highbd_smooth_h_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
2153 : const uint16_t *above, const uint16_t *left, int32_t bd)
2154 : {
2155 : __m256i r, weights[2];
2156 : (void)bd;
2157 :
2158 0 : load_right_weights_8(above, &r, weights);
2159 :
2160 0 : for (int32_t i = 0; i < 2; i++) {
2161 0 : smooth_h_pred_8x8(left + 0, r, weights, &dst, stride);
2162 0 : smooth_h_pred_8x8(left + 8, r, weights, &dst, stride);
2163 0 : left += 16;
2164 : }
2165 0 : }
2166 :
2167 : // -----------------------------------------------------------------------------
2168 : // 16xN
2169 :
2170 0 : static INLINE void smooth_h_pred_16(const __m256i *const weights,
2171 : __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
2172 : {
2173 0 : const __m256i rep = _mm256_set1_epi32(0x03020100);
2174 : // lr: 0 1 2 3 0 1 2 3
2175 0 : const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0 0 0 0 0
2176 : // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2177 0 : const __m256i d = smooth_h_pred_kernel(weights, t);
2178 0 : _mm256_storeu_si256((__m256i *)*dst, d);
2179 0 : *dst += stride;
2180 0 : *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x 1 2 3 x
2181 0 : }
2182 :
2183 0 : static INLINE void smooth_h_pred_16x4(const __m256i *const weights,
2184 : __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
2185 : {
2186 0 : smooth_h_pred_16(weights, lr, dst, stride);
2187 0 : smooth_h_pred_16(weights, lr, dst, stride);
2188 0 : smooth_h_pred_16(weights, lr, dst, stride);
2189 0 : smooth_h_pred_16(weights, lr, dst, stride);
2190 0 : }
2191 :
2192 0 : static INLINE void smooth_h_pred_16x8(const uint16_t *const left,
2193 : const __m256i r, const __m256i *const weights, uint16_t **const dst,
2194 : const ptrdiff_t stride)
2195 : {
2196 : __m256i lr[2];
2197 0 : load_left_8(left, r, lr);
2198 0 : smooth_h_pred_16x4(weights, &lr[0], dst, stride);
2199 0 : smooth_h_pred_16x4(weights, &lr[1], dst, stride);
2200 0 : }
2201 :
2202 0 : static INLINE void smooth_h_predictor_16x16(uint16_t *dst,
2203 : const ptrdiff_t stride, const uint16_t *const above, const uint16_t *left,
2204 : const int32_t n)
2205 : {
2206 : __m256i r, weights[2];
2207 :
2208 0 : load_right_weights_16(above, &r, weights);
2209 :
2210 0 : for (int32_t i = 0; i < n; i++) {
2211 0 : smooth_h_pred_16x8(left + 0, r, weights, &dst, stride);
2212 0 : smooth_h_pred_16x8(left + 8, r, weights, &dst, stride);
2213 0 : left += 16;
2214 : }
2215 0 : }
2216 :
2217 : // 16x4
2218 :
2219 0 : void eb_aom_highbd_smooth_h_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
2220 : const uint16_t *above, const uint16_t *left, int32_t bd)
2221 : {
2222 : __m256i r, lr, weights[2];
2223 : (void)bd;
2224 :
2225 0 : load_right_weights_16(above, &r, weights);
2226 0 : lr = load_left_4(left, r);
2227 0 : smooth_h_pred_16x4(weights, &lr, &dst, stride);
2228 0 : }
2229 :
2230 : // 16x8
2231 :
2232 0 : void eb_aom_highbd_smooth_h_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
2233 : const uint16_t *above, const uint16_t *left, int32_t bd)
2234 : {
2235 : __m256i r, weights[2];
2236 : (void)bd;
2237 :
2238 0 : load_right_weights_16(above, &r, weights);
2239 0 : smooth_h_pred_16x8(left, r, weights, &dst, stride);
2240 0 : }
2241 :
2242 : // 16x16
2243 :
2244 0 : void eb_aom_highbd_smooth_h_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
2245 : const uint16_t *above, const uint16_t *left, int32_t bd)
2246 : {
2247 : (void)bd;
2248 0 : smooth_h_predictor_16x16(dst, stride, above, left, 1);
2249 0 : }
2250 :
2251 : // 16x32
2252 :
2253 0 : void eb_aom_highbd_smooth_h_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
2254 : const uint16_t *above, const uint16_t *left, int32_t bd)
2255 : {
2256 : (void)bd;
2257 0 : smooth_h_predictor_16x16(dst, stride, above, left, 2);
2258 0 : }
2259 :
2260 : // 16x64
2261 :
2262 0 : void eb_aom_highbd_smooth_h_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
2263 : const uint16_t *above, const uint16_t *left, int32_t bd)
2264 : {
2265 : (void)bd;
2266 0 : smooth_h_predictor_16x16(dst, stride, above, left, 4);
2267 0 : }
2268 :
2269 : // -----------------------------------------------------------------------------
2270 : // 32xN
2271 :
2272 0 : static INLINE void smooth_h_pred_32(const __m256i *const weights,
2273 : __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
2274 : {
2275 0 : const __m256i rep = _mm256_set1_epi32(0x03020100);
2276 : // lr: 0 1 2 3 0 1 2 3
2277 0 : const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0 0 0 0 0
2278 : __m256i d;
2279 :
2280 : // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2281 0 : d = smooth_h_pred_kernel(weights + 0, t);
2282 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
2283 :
2284 : // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
2285 0 : d = smooth_h_pred_kernel(weights + 2, t);
2286 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
2287 0 : *dst += stride;
2288 0 : *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x 1 2 3 x
2289 0 : }
2290 :
2291 0 : static INLINE void smooth_h_pred_32x4(const __m256i *const weights,
2292 : __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
2293 : {
2294 0 : smooth_h_pred_32(weights, lr, dst, stride);
2295 0 : smooth_h_pred_32(weights, lr, dst, stride);
2296 0 : smooth_h_pred_32(weights, lr, dst, stride);
2297 0 : smooth_h_pred_32(weights, lr, dst, stride);
2298 0 : }
2299 :
2300 0 : static INLINE void smooth_h_pred_32x8(uint16_t *dst,
2301 : const ptrdiff_t stride, const uint16_t *const above, const uint16_t *left,
2302 : const int32_t n)
2303 : {
2304 : __m256i r, lr[2], weights[4];
2305 :
2306 0 : load_right_weights_32(above, &r, weights);
2307 :
2308 0 : for (int32_t i = 0; i < n; i++) {
2309 0 : load_left_8(left, r, lr);
2310 0 : smooth_h_pred_32x4(weights, &lr[0], &dst, stride);
2311 0 : smooth_h_pred_32x4(weights, &lr[1], &dst, stride);
2312 0 : left += 8;
2313 : }
2314 0 : }
2315 :
2316 : // 32x8
2317 :
2318 0 : void eb_aom_highbd_smooth_h_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
2319 : const uint16_t *above, const uint16_t *left, int32_t bd)
2320 : {
2321 : (void)bd;
2322 0 : smooth_h_pred_32x8(dst, stride, above, left, 1);
2323 0 : }
2324 :
2325 : // 32x16
2326 :
2327 0 : void eb_aom_highbd_smooth_h_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
2328 : const uint16_t *above, const uint16_t *left, int32_t bd)
2329 : {
2330 : (void)bd;
2331 0 : smooth_h_pred_32x8(dst, stride, above, left, 2);
2332 0 : }
2333 :
2334 : // 32x32
2335 :
2336 0 : void eb_aom_highbd_smooth_h_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
2337 : const uint16_t *above, const uint16_t *left, int32_t bd)
2338 : {
2339 : (void)bd;
2340 0 : smooth_h_pred_32x8(dst, stride, above, left, 4);
2341 0 : }
2342 :
2343 : // 32x64
2344 :
2345 0 : void eb_aom_highbd_smooth_h_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
2346 : const uint16_t *above, const uint16_t *left, int32_t bd)
2347 : {
2348 : (void)bd;
2349 0 : smooth_h_pred_32x8(dst, stride, above, left, 8);
2350 0 : }
2351 :
2352 : // -----------------------------------------------------------------------------
2353 : // 64xN
2354 :
2355 0 : static INLINE void smooth_h_pred_64(const __m256i *const weights,
2356 : __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
2357 : {
2358 0 : const __m256i rep = _mm256_set1_epi32(0x03020100);
2359 : // lr: 0 1 2 3 0 1 2 3
2360 0 : const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0 0 0 0 0
2361 : __m256i d;
2362 :
2363 : // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2364 0 : d = smooth_h_pred_kernel(weights + 0, t);
2365 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
2366 :
2367 : // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
2368 0 : d = smooth_h_pred_kernel(weights + 2, t);
2369 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
2370 :
2371 : // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
2372 0 : d = smooth_h_pred_kernel(weights + 4, t);
2373 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);
2374 :
2375 : // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
2376 0 : d = smooth_h_pred_kernel(weights + 6, t);
2377 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
2378 0 : *dst += stride;
2379 0 : *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x 1 2 3 x
2380 0 : }
2381 :
2382 0 : static INLINE void smooth_h_pred_64x4(const __m256i *const weights,
2383 : __m256i *const lr, uint16_t **const dst, const ptrdiff_t stride)
2384 : {
2385 0 : smooth_h_pred_64(weights, lr, dst, stride);
2386 0 : smooth_h_pred_64(weights, lr, dst, stride);
2387 0 : smooth_h_pred_64(weights, lr, dst, stride);
2388 0 : smooth_h_pred_64(weights, lr, dst, stride);
2389 0 : }
2390 :
2391 0 : static INLINE void smooth_h_pred_64x8(uint16_t *dst,
2392 : const ptrdiff_t stride, const uint16_t *const above, const uint16_t *left,
2393 : const int32_t n)
2394 : {
2395 : __m256i r, lr[2], weights[8];
2396 :
2397 0 : load_right_weights_64(above, &r, weights);
2398 :
2399 0 : for (int32_t i = 0; i < n; i++) {
2400 0 : load_left_8(left, r, lr);
2401 0 : smooth_h_pred_64x4(weights, &lr[0], &dst, stride);
2402 0 : smooth_h_pred_64x4(weights, &lr[1], &dst, stride);
2403 0 : left += 8;
2404 : }
2405 0 : }
2406 :
2407 : // 64x16
2408 :
2409 0 : void eb_aom_highbd_smooth_h_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
2410 : const uint16_t *above, const uint16_t *left, int32_t bd)
2411 : {
2412 : (void)bd;
2413 0 : smooth_h_pred_64x8(dst, stride, above, left, 2);
2414 0 : }
2415 :
2416 : // 64x32
2417 :
2418 0 : void eb_aom_highbd_smooth_h_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
2419 : const uint16_t *above, const uint16_t *left, int32_t bd)
2420 : {
2421 : (void)bd;
2422 0 : smooth_h_pred_64x8(dst, stride, above, left, 4);
2423 0 : }
2424 :
2425 : // 64x64
2426 :
2427 0 : void eb_aom_highbd_smooth_h_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
2428 : const uint16_t *above, const uint16_t *left, int32_t bd)
2429 : {
2430 : (void)bd;
2431 0 : smooth_h_pred_64x8(dst, stride, above, left, 8);
2432 0 : }
2433 :
2434 : // =============================================================================
2435 :
2436 : // SMOOTH_V_PRED
2437 :
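 : // SMOOTH_V blends only vertically (scalar sketch, illustrative only):
 : //
 : //   below = left[bh - 1];
 : //   pred[r][c] = (above[c] * w[r] + below * (256 - w[r])
 : //                 + (1 << (sm_weight_log2_scale - 1))) >> sm_weight_log2_scale;
 : //
 : // ab[] interleaves above[] with the replicated bottom-left sample so one
 : // _mm256_madd_epi16 per eight pixels forms both products.
 :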
2438 : // 8xN
2439 :
2440 0 : static INLINE void smooth_v_init_8(const uint16_t *const above,
2441 : const uint16_t *const left, const int32_t h, __m256i *const ab,
2442 : __m256i *const rep)
2443 : {
2444 0 : const __m128i a0 = _mm_loadl_epi64(((const __m128i *)(above + 0)));
2445 0 : const __m128i a1 = _mm_loadl_epi64(((const __m128i *)(above + 4)));
2446 0 : const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
2447 : __m256i a[2];
2448 0 : a[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a0, 1);
2449 0 : a[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a1, 1);
2450 0 : ab[0] = _mm256_unpacklo_epi16(a[0], b);
2451 0 : ab[1] = _mm256_unpacklo_epi16(a[1], b);
2452 :
2453 0 : const __m128i rep0 = _mm_set1_epi32(0x03020100);
2454 0 : const __m128i rep1 = _mm_set1_epi32(0x07060504);
2455 0 : const __m128i rep2 = _mm_set1_epi32(0x0B0A0908);
2456 0 : const __m128i rep3 = _mm_set1_epi32(0x0F0E0D0C);
2457 0 : rep[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep0), rep1, 1);
2458 0 : rep[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep2), rep3, 1);
2459 0 : }
2460 :
2461 0 : static INLINE __m256i smooth_v_pred_kernel(const __m256i weights,
2462 : const __m256i rep, const __m256i *const ab)
2463 : {
2464 0 : const __m256i round = _mm256_set1_epi32((1 << (sm_weight_log2_scale - 1)));
2465 : __m256i sum[2];
2466 : // 0 0 0 0 1 1 1 1
2467 0 : const __m256i w = _mm256_shuffle_epi8(weights, rep);
2468 0 : sum[0] = _mm256_madd_epi16(ab[0], w);
2469 0 : sum[1] = _mm256_madd_epi16(ab[1], w);
2470 0 : sum[0] = _mm256_add_epi32(sum[0], round);
2471 0 : sum[1] = _mm256_add_epi32(sum[1], round);
2472 0 : sum[0] = _mm256_srai_epi32(sum[0], sm_weight_log2_scale);
2473 0 : sum[1] = _mm256_srai_epi32(sum[1], sm_weight_log2_scale);
2474 : // width 8: 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
2475 : // width 16: 0 1 2 3 4 5 6 7 8 9 A B C D E F
2476 0 : return _mm256_packs_epi32(sum[0], sum[1]);
2477 : }
2478 :
2479 0 : static INLINE void smooth_v_pred_8x2(const __m256i weights, const __m256i rep,
2480 : const __m256i *const ab, uint16_t **const dst, const ptrdiff_t stride)
2481 : {
2482 : // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
2483 0 : const __m256i d = smooth_v_pred_kernel(weights, rep, ab);
2484 0 : _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
2485 0 : *dst += stride;
2486 0 : _mm_store_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
2487 0 : *dst += stride;
2488 0 : }
2489 :
2490 0 : static INLINE void smooth_v_pred_8x4(const uint16_t *const sm_weights_h,
2491 : const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
2492 : const ptrdiff_t stride)
2493 : {
2494 0 : const __m256i weights = _mm256_load_si256((const __m256i *)sm_weights_h);
2495 0 : smooth_v_pred_8x2(weights, rep[0], ab, dst, stride);
2496 0 : smooth_v_pred_8x2(weights, rep[1], ab, dst, stride);
2497 0 : }
2498 :
2499 0 : static INLINE void smooth_v_pred_8x8(const uint16_t *const sm_weights_h,
2500 : const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
2501 : const ptrdiff_t stride)
2502 : {
2503 0 : smooth_v_pred_8x4(sm_weights_h + 0, rep, ab, dst, stride);
2504 0 : smooth_v_pred_8x4(sm_weights_h + 16, rep, ab, dst, stride);
2505 0 : }
2506 :
2507 : // 8x4
2508 :
2509 0 : void eb_aom_highbd_smooth_v_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
2510 : const uint16_t *above, const uint16_t *left, int32_t bd)
2511 : {
2512 : __m256i ab[2], rep[2];
2513 : (void)bd;
2514 :
2515 0 : smooth_v_init_8(above, left, 4, ab, rep);
2516 0 : smooth_v_pred_8x4(sm_weights_d_4, rep, ab, &dst, stride);
2517 0 : }
2518 :
2519 : // 8x8
2520 :
2521 0 : void eb_aom_highbd_smooth_v_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
2522 : const uint16_t *above, const uint16_t *left, int32_t bd)
2523 : {
2524 : __m256i ab[2], rep[2];
2525 : (void)bd;
2526 :
2527 0 : smooth_v_init_8(above, left, 8, ab, rep);
2528 :
2529 0 : smooth_v_pred_8x8(sm_weights_d_8, rep, ab, &dst, stride);
2530 0 : }
2531 :
2532 : // 8x16
2533 :
2534 0 : void eb_aom_highbd_smooth_v_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
2535 : const uint16_t *above, const uint16_t *left, int32_t bd)
2536 : {
2537 : __m256i ab[2], rep[2];
2538 : (void)bd;
2539 :
2540 0 : smooth_v_init_8(above, left, 16, ab, rep);
2541 :
2542 0 : for (int32_t i = 0; i < 2; i++)
2543 0 : smooth_v_pred_8x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
2544 0 : }
2545 :
2546 : // 8x32
2547 :
2548 0 : void eb_aom_highbd_smooth_v_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
2549 : const uint16_t *above, const uint16_t *left, int32_t bd)
2550 : {
2551 : __m256i ab[2], rep[2];
2552 : (void)bd;
2553 :
2554 0 : smooth_v_init_8(above, left, 32, ab, rep);
2555 :
2556 0 : for (int32_t i = 0; i < 4; i++)
2557 0 : smooth_v_pred_8x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
2558 0 : }
2559 :
2560 : // -----------------------------------------------------------------------------
2561 : // 16xN
2562 :
2563 0 : static INLINE void smooth_v_prepare_ab(const uint16_t *const above,
2564 : const __m256i b, __m256i *const ab)
2565 : {
2566 0 : const __m256i a = _mm256_loadu_si256((const __m256i *)above);
2567 0 : ab[0] = _mm256_unpacklo_epi16(a, b);
2568 0 : ab[1] = _mm256_unpackhi_epi16(a, b);
2569 0 : }
2570 :
2571 0 : static INLINE void smooth_v_init_16(const uint16_t *const above,
2572 : const uint16_t *const left, const int32_t h, __m256i *const ab,
2573 : __m256i *const rep)
2574 : {
2575 0 : const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
2576 0 : smooth_v_prepare_ab(above, b, ab);
2577 :
2578 0 : rep[0] = _mm256_set1_epi32(0x03020100);
2579 0 : rep[1] = _mm256_set1_epi32(0x07060504);
2580 0 : rep[2] = _mm256_set1_epi32(0x0B0A0908);
2581 0 : rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
2582 0 : }
2583 :
2584 0 : static INLINE void smooth_v_pred_16(const __m256i weights, const __m256i rep,
2585 : const __m256i *const ab, uint16_t **const dst, const ptrdiff_t stride)
2586 : {
2587 0 : const __m256i d = smooth_v_pred_kernel(weights, rep, ab);
2588 0 : _mm256_storeu_si256((__m256i *)*dst, d);
2589 0 : *dst += stride;
2590 0 : }
2591 :
2592 0 : static INLINE void smooth_v_pred_16x4(const uint16_t *const sm_weights_h,
2593 : const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
2594 : const ptrdiff_t stride)
2595 : {
2596 0 : const __m256i weights = _mm256_load_si256((const __m256i *)sm_weights_h);
2597 0 : smooth_v_pred_16(weights, rep[0], ab, dst, stride);
2598 0 : smooth_v_pred_16(weights, rep[1], ab, dst, stride);
2599 0 : smooth_v_pred_16(weights, rep[2], ab, dst, stride);
2600 0 : smooth_v_pred_16(weights, rep[3], ab, dst, stride);
2601 0 : }
2602 :
2603 0 : static INLINE void smooth_v_pred_16x8(const uint16_t *const sm_weights_h,
2604 : const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
2605 : const ptrdiff_t stride)
2606 : {
2607 0 : smooth_v_pred_16x4(sm_weights_h + 0, rep, ab, dst, stride);
2608 0 : smooth_v_pred_16x4(sm_weights_h + 16, rep, ab, dst, stride);
2609 0 : }
2610 :
2611 : // 16x4
2612 :
2613 0 : void eb_aom_highbd_smooth_v_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
2614 : const uint16_t *above, const uint16_t *left, int32_t bd)
2615 : {
2616 : __m256i ab[2], rep[4];
2617 : (void)bd;
2618 :
2619 0 : smooth_v_init_16(above, left, 4, ab, rep);
2620 0 : smooth_v_pred_16x4(sm_weights_d_4, rep, ab, &dst, stride);
2621 0 : }
2622 :
2623 : // 16x8
2624 :
2625 0 : void eb_aom_highbd_smooth_v_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
2626 : const uint16_t *above, const uint16_t *left, int32_t bd)
2627 : {
2628 : __m256i ab[2], rep[4];
2629 : (void)bd;
2630 :
2631 0 : smooth_v_init_16(above, left, 8, ab, rep);
2632 :
2633 0 : smooth_v_pred_16x8(sm_weights_d_8, rep, ab, &dst, stride);
2634 0 : }
2635 :
2636 : // 16x16
2637 :
2638 0 : void eb_aom_highbd_smooth_v_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
2639 : const uint16_t *above, const uint16_t *left, int32_t bd)
2640 : {
2641 : __m256i ab[2], rep[4];
2642 : (void)bd;
2643 :
2644 0 : smooth_v_init_16(above, left, 16, ab, rep);
2645 :
2646 0 : for (int32_t i = 0; i < 2; i++)
2647 0 : smooth_v_pred_16x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
2648 0 : }
2649 :
2650 : // 16x32
2651 :
2652 0 : void eb_aom_highbd_smooth_v_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
2653 : const uint16_t *above, const uint16_t *left, int32_t bd)
2654 : {
2655 : __m256i ab[2], rep[4];
2656 : (void)bd;
2657 :
2658 0 : smooth_v_init_16(above, left, 32, ab, rep);
2659 :
2660 0 : for (int32_t i = 0; i < 4; i++)
2661 0 : smooth_v_pred_16x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
2662 0 : }
2663 :
2664 : // 16x64
2665 :
2666 0 : void eb_aom_highbd_smooth_v_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
2667 : const uint16_t *above, const uint16_t *left, int32_t bd)
2668 : {
2669 : __m256i ab[2], rep[4];
2670 : (void)bd;
2671 :
2672 0 : smooth_v_init_16(above, left, 64, ab, rep);
2673 :
2674 0 : for (int32_t i = 0; i < 8; i++)
2675 0 : smooth_v_pred_16x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
2676 0 : }
2677 :
2678 : // -----------------------------------------------------------------------------
2679 : // 32xN
2680 :
2681 0 : static INLINE void smooth_v_init_32(const uint16_t *const above,
2682 : const uint16_t *const left, const int32_t h, __m256i *const ab,
2683 : __m256i *const rep)
2684 : {
2685 0 : const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
2686 0 : smooth_v_prepare_ab(above + 0x00, b, ab + 0);
2687 0 : smooth_v_prepare_ab(above + 0x10, b, ab + 2);
2688 :
2689 0 : rep[0] = _mm256_set1_epi32(0x03020100);
2690 0 : rep[1] = _mm256_set1_epi32(0x07060504);
2691 0 : rep[2] = _mm256_set1_epi32(0x0B0A0908);
2692 0 : rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
2693 0 : }
2694 :
2695 0 : static INLINE void smooth_v_pred_32(const __m256i weights, const __m256i rep,
2696 : const __m256i *const ab, uint16_t **const dst, const ptrdiff_t stride)
2697 : {
2698 : __m256i d;
2699 :
2700 : // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2701 0 : d = smooth_v_pred_kernel(weights, rep, ab + 0);
2702 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
2703 :
2704 : // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
2705 0 : d = smooth_v_pred_kernel(weights, rep, ab + 2);
2706 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
2707 0 : *dst += stride;
2708 0 : }
2709 :
2710 0 : static INLINE void smooth_v_pred_32x4(const uint16_t *const sm_weights_h,
2711 : const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
2712 : const ptrdiff_t stride)
2713 : {
2714 0 : const __m256i weights = _mm256_load_si256((const __m256i *)sm_weights_h);
2715 0 : smooth_v_pred_32(weights, rep[0], ab, dst, stride);
2716 0 : smooth_v_pred_32(weights, rep[1], ab, dst, stride);
2717 0 : smooth_v_pred_32(weights, rep[2], ab, dst, stride);
2718 0 : smooth_v_pred_32(weights, rep[3], ab, dst, stride);
2719 0 : }
2720 :
2721 0 : static INLINE void smooth_v_pred_32x8(const uint16_t *const sm_weights_h,
2722 : const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
2723 : const ptrdiff_t stride)
2724 : {
2725 0 : smooth_v_pred_32x4(sm_weights_h + 0, rep, ab, dst, stride);
2726 0 : smooth_v_pred_32x4(sm_weights_h + 16, rep, ab, dst, stride);
2727 0 : }
2728 :
2729 : // 32x8
2730 :
2731 0 : void eb_aom_highbd_smooth_v_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
2732 : const uint16_t *above, const uint16_t *left, int32_t bd)
2733 : {
2734 : __m256i ab[4], rep[4];
2735 : (void)bd;
2736 :
2737 0 : smooth_v_init_32(above, left, 8, ab, rep);
2738 :
2739 0 : smooth_v_pred_32x8(sm_weights_d_8, rep, ab, &dst, stride);
2740 0 : }
2741 :
2742 : // 32x16
2743 :
2744 0 : void eb_aom_highbd_smooth_v_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
2745 : const uint16_t *above, const uint16_t *left, int32_t bd)
2746 : {
2747 : __m256i ab[4], rep[4];
2748 : (void)bd;
2749 :
2750 0 : smooth_v_init_32(above, left, 16, ab, rep);
2751 :
2752 0 : for (int32_t i = 0; i < 2; i++)
2753 0 : smooth_v_pred_32x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
2754 0 : }
2755 :
2756 : // 32x32
2757 :
2758 0 : void eb_aom_highbd_smooth_v_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
2759 : const uint16_t *above, const uint16_t *left, int32_t bd)
2760 : {
2761 : __m256i ab[4], rep[4];
2762 : (void)bd;
2763 :
2764 0 : smooth_v_init_32(above, left, 32, ab, rep);
2765 :
2766 0 : for (int32_t i = 0; i < 4; i++)
2767 0 : smooth_v_pred_32x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
2768 0 : }
2769 :
2770 : // 32x64
2771 :
2772 0 : void eb_aom_highbd_smooth_v_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
2773 : const uint16_t *above, const uint16_t *left, int32_t bd)
2774 : {
2775 : __m256i ab[4], rep[4];
2776 : (void)bd;
2777 :
2778 0 : smooth_v_init_32(above, left, 64, ab, rep);
2779 :
2780 0 : for (int32_t i = 0; i < 8; i++)
2781 0 : smooth_v_pred_32x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
2782 0 : }
2783 :
2784 : // -----------------------------------------------------------------------------
2785 : // 64xN
2786 :
2787 0 : static INLINE void smooth_v_init_64(const uint16_t *const above,
2788 : const uint16_t *const left, const int32_t h, __m256i *const ab,
2789 : __m256i *const rep)
2790 : {
2791 0 : const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
2792 0 : smooth_v_prepare_ab(above + 0x00, b, ab + 0);
2793 0 : smooth_v_prepare_ab(above + 0x10, b, ab + 2);
2794 0 : smooth_v_prepare_ab(above + 0x20, b, ab + 4);
2795 0 : smooth_v_prepare_ab(above + 0x30, b, ab + 6);
2796 :
2797 0 : rep[0] = _mm256_set1_epi32(0x03020100);
2798 0 : rep[1] = _mm256_set1_epi32(0x07060504);
2799 0 : rep[2] = _mm256_set1_epi32(0x0B0A0908);
2800 0 : rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
2801 0 : }
2802 :
2803 0 : static INLINE void smooth_v_pred_64(const __m256i weights, const __m256i rep,
2804 : const __m256i *const ab, uint16_t **const dst, const ptrdiff_t stride)
2805 : {
2806 : __m256i d;
2807 :
2808 : // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2809 0 : d = smooth_v_pred_kernel(weights, rep, ab + 0);
2810 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
2811 :
2812 : // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
2813 0 : d = smooth_v_pred_kernel(weights, rep, ab + 2);
2814 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
2815 :
2816 : // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
2817 0 : d = smooth_v_pred_kernel(weights, rep, ab + 4);
2818 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);
2819 :
2820 : // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
2821 0 : d = smooth_v_pred_kernel(weights, rep, ab + 6);
2822 0 : _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
2823 0 : *dst += stride;
2824 0 : }
2825 :
2826 0 : static INLINE void smooth_v_pred_64x4(const uint16_t *const sm_weights_h,
2827 : const __m256i *const rep, const __m256i *const ab, uint16_t **const dst,
2828 : const ptrdiff_t stride)
2829 : {
2830 0 : const __m256i weights = _mm256_load_si256((const __m256i *)sm_weights_h);
2831 0 : smooth_v_pred_64(weights, rep[0], ab, dst, stride);
2832 0 : smooth_v_pred_64(weights, rep[1], ab, dst, stride);
2833 0 : smooth_v_pred_64(weights, rep[2], ab, dst, stride);
2834 0 : smooth_v_pred_64(weights, rep[3], ab, dst, stride);
2835 0 : }
2836 :
2837 0 : static INLINE void smooth_v_pred_64x8(const uint16_t *const sm_weights_h,
2838 : const __m256i *const rep, const __m256i *const ab,
2839 : uint16_t **const dst, const ptrdiff_t stride)
2840 : {
2841 0 : smooth_v_pred_64x4(sm_weights_h + 0, rep, ab, dst, stride);
2842 0 : smooth_v_pred_64x4(sm_weights_h + 16, rep, ab, dst, stride);
2843 0 : }
2844 :
2845 : // 64x16
2846 :
2847 0 : void eb_aom_highbd_smooth_v_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
2848 : const uint16_t *above, const uint16_t *left, int32_t bd)
2849 : {
2850 : __m256i ab[8], rep[4];
2851 : (void)bd;
2852 :
2853 0 : smooth_v_init_64(above, left, 16, ab, rep);
2854 :
2855 0 : for (int32_t i = 0; i < 2; i++)
2856 0 : smooth_v_pred_64x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
2857 0 : }
2858 :
2859 : // 64x32
2860 :
2861 0 : void eb_aom_highbd_smooth_v_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
2862 : const uint16_t *above, const uint16_t *left, int32_t bd)
2863 : {
2864 : __m256i ab[8], rep[4];
2865 : (void)bd;
2866 :
2867 0 : smooth_v_init_64(above, left, 32, ab, rep);
2868 :
2869 0 : for (int32_t i = 0; i < 4; i++)
2870 0 : smooth_v_pred_64x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
2871 0 : }
2872 :
2873 : // 64x64
2874 :
2875 0 : void eb_aom_highbd_smooth_v_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
2876 : const uint16_t *above, const uint16_t *left, int32_t bd)
2877 : {
2878 : __m256i ab[8], rep[4];
2879 : (void)bd;
2880 :
2881 0 : smooth_v_init_64(above, left, 64, ab, rep);
2882 :
2883 0 : for (int32_t i = 0; i < 8; i++)
2884 0 : smooth_v_pred_64x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
2885 0 : }
|