Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h>
13 : #include "aom_dsp_rtcd.h"
14 : #include "EbBitstreamUnit.h"
15 : #include "EbCdef.h"
16 : #include "EbDefinitions.h"
17 : #include "EbMemory_AVX2.h"
18 :
19 : /* partial A is a 16-bit vector of the form:
20 : [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
21 : [0 y1 y2 y3 y4 y5 y6 y7].
22 : This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
23 : (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
24 : and const2. */
/* Folds the two partial-sum halves together, squares each paired term, and
   applies the per-tap weight: lane i of the result holds
   (x_i^2 + y_i^2) * C_i (see the layout comment above).
   `const_var` carries the C1..C8 weights as eight packed 32-bit values. */
static INLINE __m256i fold_mul_and_sum(__m256i partial, __m256i const_var) {
    /* Byte shuffle within each 128-bit lane: the low lane is left in place
       (identity mask 0x03020100..0x0f0e0d0c) while the high lane's words are
       rearranged so each x_i lines up with its matching y_i. The exact byte
       masks are load-bearing; do not reorder. */
    partial = _mm256_shuffle_epi8(partial,
                                  _mm256_set_epi32(0x0f0e0100,
                                                   0x03020504,
                                                   0x07060908,
                                                   0x0b0a0d0c,
                                                   0x0f0e0d0c,
                                                   0x0b0a0908,
                                                   0x07060504,
                                                   0x03020100));
    /* Interleave 32-bit words across the two lanes so x/y pairs become
       adjacent 16-bit elements. */
    partial = _mm256_permutevar8x32_epi32(
        partial, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
    /* Final within-lane byte shuffle to place each (x_i, y_i) pair into one
       32-bit slot, ready for the multiply-add below. */
    partial = _mm256_shuffle_epi8(partial,
                                  _mm256_set_epi32(0x0f0e0b0a,
                                                   0x0d0c0908,
                                                   0x07060302,
                                                   0x05040100,
                                                   0x0f0e0b0a,
                                                   0x0d0c0908,
                                                   0x07060302,
                                                   0x05040100));
    /* madd(partial, partial) yields x_i^2 + y_i^2 per 32-bit lane. */
    partial = _mm256_madd_epi16(partial, partial);
    /* Weight each squared pair by its C_i constant. */
    partial = _mm256_mullo_epi32(partial, const_var);
    return partial;
}
50 :
51 171545 : static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
52 : __m128i t0, t1, t2, t3;
53 171545 : t0 = _mm_unpacklo_epi32(x0, x1);
54 171545 : t1 = _mm_unpacklo_epi32(x2, x3);
55 171545 : t2 = _mm_unpackhi_epi32(x0, x1);
56 171545 : t3 = _mm_unpackhi_epi32(x2, x3);
57 171545 : x0 = _mm_unpacklo_epi64(t0, t1);
58 171545 : x1 = _mm_unpackhi_epi64(t0, t1);
59 171545 : x2 = _mm_unpacklo_epi64(t2, t3);
60 171545 : x3 = _mm_unpackhi_epi64(t2, t3);
61 514635 : return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
62 : }
63 :
/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   (on transposed input) to compute the remaining directions.
   `lines` holds 8 rows of 8 int16 pixels; the four direction costs are
   written to tmp_cost1[0..3].

   Each partialN accumulates sums of pixels along one direction by shifting
   every row left/right by a row-dependent byte count before adding, so
   pixels on the same diagonal land in the same 16-bit lane. The low and
   high 128-bit halves of each __m256i hold the two complementary shifted
   copies of the same row pair. The per-row shift amounts are exact and
   order-dependent; do not reorder. */
static INLINE void compute_directions(__m128i lines[8], int32_t tmp_cost1[4]) {
    __m128i partial6;
    __m128i tmp;

    __m256i partial4;
    __m256i partial5;
    __m256i partial7;
    __m256i tmp_avx2;
    /* Partial sums for lines 0 and 1. */
    partial4 = _mm256_setr_m128i(_mm_slli_si128(lines[0], 14),
                                 _mm_srli_si128(lines[0], 2));
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[1], 12),
                                 _mm_srli_si128(lines[1], 4));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    /* Direction 6 is horizontal: its partial sums need no shifting, so rows
       are simply accumulated pairwise in a 128-bit register. */
    tmp = _mm_add_epi16(lines[0], lines[1]);
    partial5 =
        _mm256_setr_m128i(_mm_slli_si128(tmp, 10), _mm_srli_si128(tmp, 6));
    partial7 =
        _mm256_setr_m128i(_mm_slli_si128(tmp, 4), _mm_srli_si128(tmp, 12));
    partial6 = tmp;

    /* Partial sums for lines 2 and 3. */
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[2], 10),
                                 _mm_srli_si128(lines[2], 6));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[3], 8),
                                 _mm_srli_si128(lines[3], 8));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp = _mm_add_epi16(lines[2], lines[3]);
    tmp_avx2 =
        _mm256_setr_m128i(_mm_slli_si128(tmp, 8), _mm_srli_si128(tmp, 8));
    partial5 = _mm256_add_epi16(partial5, tmp_avx2);
    tmp_avx2 =
        _mm256_setr_m128i(_mm_slli_si128(tmp, 6), _mm_srli_si128(tmp, 10));
    partial7 = _mm256_add_epi16(partial7, tmp_avx2);
    partial6 = _mm_add_epi16(partial6, tmp);

    /* Partial sums for lines 4 and 5. */
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[4], 6),
                                 _mm_srli_si128(lines[4], 10));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[5], 4),
                                 _mm_srli_si128(lines[5], 12));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp = _mm_add_epi16(lines[4], lines[5]);
    tmp_avx2 =
        _mm256_setr_m128i(_mm_slli_si128(tmp, 6), _mm_srli_si128(tmp, 10));
    partial5 = _mm256_add_epi16(partial5, tmp_avx2);
    tmp_avx2 =
        _mm256_setr_m128i(_mm_slli_si128(tmp, 8), _mm_srli_si128(tmp, 8));
    partial7 = _mm256_add_epi16(partial7, tmp_avx2);
    partial6 = _mm_add_epi16(partial6, tmp);

    /* Partial sums for lines 6 and 7. */
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[6], 2),
                                 _mm_srli_si128(lines[6], 14));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    /* Line 7 needs zero shift for direction 4: place it unshifted in the
       low half and leave the high half zero. */
    tmp_avx2 = _mm256_insertf128_si256(_mm256_setzero_si256(), lines[7], 0x0);
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp = _mm_add_epi16(lines[6], lines[7]);
    tmp_avx2 =
        _mm256_setr_m128i(_mm_slli_si128(tmp, 4), _mm_srli_si128(tmp, 12));
    partial5 = _mm256_add_epi16(partial5, tmp_avx2);
    tmp_avx2 =
        _mm256_setr_m128i(_mm_slli_si128(tmp, 10), _mm_srli_si128(tmp, 6));
    partial7 = _mm256_add_epi16(partial7, tmp_avx2);
    partial6 = _mm_add_epi16(partial6, tmp);

    /* Compute costs in terms of partial sums. The constants weight each
       diagonal by (840 / diagonal length); see fold_mul_and_sum. */
    partial4 = fold_mul_and_sum(
        partial4, _mm256_set_epi32(105, 120, 140, 168, 210, 280, 420, 840));
    partial7 = fold_mul_and_sum(
        partial7, _mm256_set_epi32(105, 105, 105, 140, 210, 420, 0, 0));
    partial5 = fold_mul_and_sum(
        partial5, _mm256_set_epi32(105, 105, 105, 140, 210, 420, 0, 0));
    /* Direction 6 sums all have length 8, so a single weight (105 = 840/8)
       applies to every squared partial sum. */
    partial6 = _mm_madd_epi16(partial6, partial6);
    partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));
    /* Reduce each 256-bit accumulator to 128 bits by adding its halves. */
    __m128i a, b, c;
    a = _mm_add_epi32(_mm256_castsi256_si128(partial4),
                      _mm256_extracti128_si256(partial4, 1));
    b = _mm_add_epi32(_mm256_castsi256_si128(partial5),
                      _mm256_extracti128_si256(partial5, 1));
    c = _mm_add_epi32(_mm256_castsi256_si128(partial7),
                      _mm256_extracti128_si256(partial7, 1));

    /* Horizontal-sum all four direction accumulators and store the four
       costs contiguously. */
    _mm_storeu_si128((__m128i *)tmp_cost1, hsum4(a, b, partial6, c));
}
153 :
154 : /* transpose and reverse the order of the lines -- equivalent to a 90-degree
155 : counter-clockwise rotation of the pixels. */
156 85774 : static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
157 85774 : const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
158 85774 : const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
159 85774 : const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
160 85774 : const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
161 85774 : const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
162 85774 : const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
163 85774 : const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
164 171548 : const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
165 :
166 85774 : const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
167 85774 : const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
168 85774 : const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
169 85774 : const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
170 85774 : const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
171 85774 : const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
172 85774 : const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
173 85774 : const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
174 :
175 85774 : res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
176 85774 : res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
177 85774 : res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
178 85774 : res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
179 85774 : res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
180 85774 : res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
181 171548 : res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
182 85774 : res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
183 85774 : }
184 :
185 85743 : int32_t eb_cdef_find_dir_avx2(const uint16_t *img, int32_t stride, int32_t *var,
186 : int32_t coeff_shift) {
187 : int32_t i;
188 : int32_t cost[8];
189 85743 : int32_t best_cost = 0;
190 85743 : int32_t best_dir = 0;
191 : __m128i lines[8];
192 85743 : __m128i const_128 = _mm_set1_epi16(128);
193 765501 : for (i = 0; i < 8; i++) {
194 1357860 : lines[i] = _mm_lddqu_si128((__m128i *)&img[i * stride]);
195 2039270 : lines[i] = _mm_sub_epi16(
196 : _mm_sra_epi16(lines[i], _mm_cvtsi32_si128(coeff_shift)), const_128);
197 : }
198 :
199 : /* Compute "mostly vertical" directions. */
200 87403 : compute_directions(lines, cost + 4);
201 :
202 85793 : array_reverse_transpose_8x8(lines, lines);
203 :
204 : /* Compute "mostly horizontal" directions. */
205 85839 : compute_directions(lines, cost);
206 :
207 771545 : for (i = 0; i < 8; i++) {
208 685741 : if (cost[i] > best_cost) {
209 292195 : best_cost = cost[i];
210 292195 : best_dir = i;
211 : }
212 : }
213 :
214 : /* Difference between the optimal variance and the variance along the
215 : orthogonal direction. Again, the sum(x^2) terms cancel out. */
216 85804 : *var = best_cost - cost[(best_dir + 4) & 7];
217 : /* We'd normally divide by 840, but dividing by 1024 is close enough
218 : for what we're going to do with this. */
219 85804 : *var >>= 10;
220 85804 : return best_dir;
221 : }
222 :
223 : // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
224 32556600 : static INLINE __m256i constrain16(const __m256i in0, const __m256i in1,
225 : const __m256i threshold,
226 : const uint32_t adjdamp) {
227 32556600 : const __m256i diff = _mm256_sub_epi16(in0, in1);
228 32556600 : const __m256i sign = _mm256_srai_epi16(diff, 15);
229 32556600 : const __m256i a = _mm256_abs_epi16(diff);
230 97669900 : const __m256i l = _mm256_srl_epi16(a, _mm_cvtsi32_si128(adjdamp));
231 32556600 : const __m256i s = _mm256_subs_epu16(threshold, l);
232 32556600 : const __m256i m = _mm256_min_epi16(a, s);
233 32556600 : const __m256i d = _mm256_add_epi16(sign, m);
234 32556600 : return _mm256_xor_si256(d, sign);
235 : }
236 :
/* CDEF-filters one 4x4 block of 8-bit output. All four rows are processed in
   a single 256-bit register (one 4-pixel row per 64-bit lane).
   dst/dstride:  8-bit destination block.
   in:           padded 16-bit source, CDEF_BSTRIDE elements per row;
                 out-of-frame samples hold CDEF_VERY_LARGE.
   pri/sec_strength, dir, pri/sec_damping, coeff_shift: CDEF parameters as in
   the AV1 spec; dir selects tap offsets from eb_cdef_directions. */
static void eb_cdef_filter_block_4x4_8_avx2(
    uint8_t *dst, int32_t dstride, const uint16_t *in, int32_t pri_strength,
    int32_t sec_strength, int32_t dir, int32_t pri_damping, int32_t sec_damping,
    int32_t coeff_shift) {
    __m256i p0, p1, p2, p3, sum, row, res;
    __m256i max, min, large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    /* Tap offsets (in elements) for the primary direction and the two
       secondary directions at +/-45 degrees from it. */
    int32_t po1 = eb_cdef_directions[dir][0];
    int32_t po2 = eb_cdef_directions[dir][1];
    int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
    int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
    int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
    int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];

    const int32_t *pri_taps =
        eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    const int32_t *sec_taps =
        eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
    __m256i pri_strength_256 = _mm256_set1_epi16(pri_strength);
    __m256i sec_strength_256 = _mm256_set1_epi16(sec_strength);

    /* Reduce damping by the strength's magnitude so constrain16's shift
       tracks the strength (clamped at 0). */
    if (pri_strength)
        pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    if (sec_strength)
        sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

    sum = _mm256_setzero_si256();
    /* NOTE: _mm256_set_epi64x places its FIRST argument in the highest lane,
       so lane order is row3..row0 from low to high; the stores at the end
       undo this ordering. */
    row = _mm256_set_epi64x(*(uint64_t *)(in),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE));
    min = max = row;

    // Primary near taps
    p0 = _mm256_set_epi64x(*(uint64_t *)(in + po1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po1));
    p1 = _mm256_set_epi64x(*(uint64_t *)(in - po1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po1));

    /* Padding samples equal CDEF_VERY_LARGE; the andnot masks them to zero
       so they never raise the running max. min is unaffected because
       CDEF_VERY_LARGE is a large value. */
    max = _mm256_max_epi16(
        _mm256_max_epi16(
            max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
    p0 = constrain16(p0, row, pri_strength_256, pri_damping);
    p1 = constrain16(p1, row, pri_strength_256, pri_damping);

    // sum += pri_taps[0] * (p0 + p1)
    sum = _mm256_add_epi16(sum,
                           _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[0]),
                                              _mm256_add_epi16(p0, p1)));

    // Primary far taps
    p0 = _mm256_set_epi64x(*(uint64_t *)(in + po2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po2));
    p1 = _mm256_set_epi64x(*(uint64_t *)(in - po2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po2));
    max = _mm256_max_epi16(
        _mm256_max_epi16(
            max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
    p0 = constrain16(p0, row, pri_strength_256, pri_damping);
    p1 = constrain16(p1, row, pri_strength_256, pri_damping);

    // sum += pri_taps[1] * (p0 + p1)
    sum = _mm256_add_epi16(sum,
                           _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[1]),
                                              _mm256_add_epi16(p0, p1)));

    // Secondary near taps
    p0 = _mm256_set_epi64x(*(uint64_t *)(in + s1o1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o1));
    p1 = _mm256_set_epi64x(*(uint64_t *)(in - s1o1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o1));
    p2 = _mm256_set_epi64x(*(uint64_t *)(in + s2o1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o1));
    p3 = _mm256_set_epi64x(*(uint64_t *)(in - s2o1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o1));
    max = _mm256_max_epi16(
        _mm256_max_epi16(
            max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    max = _mm256_max_epi16(
        _mm256_max_epi16(
            max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
    min = _mm256_min_epi16(
        _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2),
        p3);
    p0 = constrain16(p0, row, sec_strength_256, sec_damping);
    p1 = constrain16(p1, row, sec_strength_256, sec_damping);
    p2 = constrain16(p2, row, sec_strength_256, sec_damping);
    p3 = constrain16(p3, row, sec_strength_256, sec_damping);

    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
    sum = _mm256_add_epi16(
        sum,
        _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[0]),
                           _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                            _mm256_add_epi16(p2, p3))));

    // Secondary far taps
    p0 = _mm256_set_epi64x(*(uint64_t *)(in + s1o2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o2));
    p1 = _mm256_set_epi64x(*(uint64_t *)(in - s1o2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o2));
    p2 = _mm256_set_epi64x(*(uint64_t *)(in + s2o2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o2));
    p3 = _mm256_set_epi64x(*(uint64_t *)(in - s2o2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o2));
    max = _mm256_max_epi16(
        _mm256_max_epi16(
            max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    max = _mm256_max_epi16(
        _mm256_max_epi16(
            max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
    min = _mm256_min_epi16(
        _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2),
        p3);
    p0 = constrain16(p0, row, sec_strength_256, sec_damping);
    p1 = constrain16(p1, row, sec_strength_256, sec_damping);
    p2 = constrain16(p2, row, sec_strength_256, sec_damping);
    p3 = constrain16(p3, row, sec_strength_256, sec_damping);

    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
    sum = _mm256_add_epi16(
        sum,
        _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[1]),
                           _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                            _mm256_add_epi16(p2, p3))));

    // res = row + ((sum - (sum < 0) + 8) >> 4)
    /* cmpgt yields -1 for negative lanes, implementing round-toward-zero. */
    sum =
        _mm256_add_epi16(sum, _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
    res = _mm256_add_epi16(sum, _mm256_set1_epi16(8));
    res = _mm256_srai_epi16(res, 4);
    res = _mm256_add_epi16(row, res);
    /* Clamp to the observed min/max so filtering never creates values
       outside the local sample range. */
    res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);
    res = _mm256_packus_epi16(res, res);

    /* Extract the four packed 4-pixel rows; indices follow the reversed lane
       order produced by _mm256_set_epi64x and _mm256_packus_epi16. */
    *(int32_t *)(dst + 0 * dstride) = _mm256_extract_epi32(res, 5);
    *(int32_t *)(dst + 1 * dstride) = _mm256_extract_epi32(res, 4);
    *(int32_t *)(dst + 2 * dstride) = _mm256_extract_epi32(res, 1);
    *(int32_t *)(dst + 3 * dstride) = _mm256_cvtsi256_si32(res);
}
408 :
/* CDEF-filters one 8x8 block of 8-bit output, two rows per iteration (each
   128-bit half of a __m256i holds one 8-pixel row; row i+1 sits in the low
   half, row i in the high half).
   dst/dstride:  8-bit destination block.
   in:           padded 16-bit source, CDEF_BSTRIDE elements per row;
                 out-of-frame samples hold CDEF_VERY_LARGE.
   pri/sec_strength, dir, pri/sec_damping, coeff_shift: CDEF parameters; dir
   selects tap offsets from eb_cdef_directions. */
static void eb_cdef_filter_block_8x8_8_avx2(
    uint8_t *dst, int32_t dstride, const uint16_t *in, int32_t pri_strength,
    int32_t sec_strength, int32_t dir, int32_t pri_damping, int32_t sec_damping,
    int32_t coeff_shift) {
    int32_t i;
    __m256i sum, p0, p1, p2, p3, row, res;
    __m256i max, min, large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    /* Tap offsets for the primary direction and the two secondary
       directions at +/-45 degrees from it. */
    int32_t po1 = eb_cdef_directions[dir][0];
    int32_t po2 = eb_cdef_directions[dir][1];
    int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
    int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
    int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
    int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];
    // SSE CHKN
    const int32_t *pri_taps =
        eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    const int32_t *sec_taps =
        eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
    /* Tap weights are loop-invariant, so broadcast them once up front. */
    __m256i pri_taps_0 = _mm256_set1_epi16(pri_taps[0]);
    __m256i pri_taps_1 = _mm256_set1_epi16(pri_taps[1]);
    __m256i sec_taps_0 = _mm256_set1_epi16(sec_taps[0]);
    __m256i sec_taps_1 = _mm256_set1_epi16(sec_taps[1]);
    __m256i duplicate_8 = _mm256_set1_epi16(8);
    __m256i pri_strength_256 = _mm256_set1_epi16(pri_strength);
    __m256i sec_strength_256 = _mm256_set1_epi16(sec_strength);

    /* Reduce damping by the strength's magnitude so constrain16's shift
       tracks the strength (clamped at 0). */
    if (pri_strength)
        pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    if (sec_strength)
        sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

    for (i = 0; i < 8; i += 2) {
        sum = _mm256_setzero_si256();
        row = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE)));

        min = max = row;
        // Primary near taps
        p0 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + po1)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + po1)));
        p1 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - po1)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - po1)));
        /* Padding samples equal CDEF_VERY_LARGE; the andnot masks them to
           zero so they never raise the running max. */
        max = _mm256_max_epi16(
            _mm256_max_epi16(
                max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
        p0 = constrain16(p0, row, pri_strength_256, pri_damping);
        p1 = constrain16(p1, row, pri_strength_256, pri_damping);

        // sum += pri_taps[0] * (p0 + p1)
        sum = _mm256_add_epi16(
            sum, _mm256_mullo_epi16(pri_taps_0, _mm256_add_epi16(p0, p1)));

        // Primary far taps
        p0 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + po2)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + po2)));
        p1 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - po2)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - po2)));
        max = _mm256_max_epi16(
            _mm256_max_epi16(
                max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
        p0 = constrain16(p0, row, pri_strength_256, pri_damping);
        p1 = constrain16(p1, row, pri_strength_256, pri_damping);

        // sum += pri_taps[1] * (p0 + p1)
        sum = _mm256_add_epi16(
            sum, _mm256_mullo_epi16(pri_taps_1, _mm256_add_epi16(p0, p1)));

        // Secondary near taps
        p0 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s1o1)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s1o1)));
        p1 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s1o1)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s1o1)));
        p2 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s2o1)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s2o1)));
        p3 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s2o1)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s2o1)));
        max = _mm256_max_epi16(
            _mm256_max_epi16(
                max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        max = _mm256_max_epi16(
            _mm256_max_epi16(
                max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
        min = _mm256_min_epi16(
            _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1),
                             p2),
            p3);
        p0 = constrain16(p0, row, sec_strength_256, sec_damping);
        p1 = constrain16(p1, row, sec_strength_256, sec_damping);
        p2 = constrain16(p2, row, sec_strength_256, sec_damping);
        p3 = constrain16(p3, row, sec_strength_256, sec_damping);

        // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
        sum = _mm256_add_epi16(
            sum,
            _mm256_mullo_epi16(sec_taps_0,
                               _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                                _mm256_add_epi16(p2, p3))));

        // Secondary far taps
        p0 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s1o2)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s1o2)));
        p1 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s1o2)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s1o2)));
        p2 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s2o2)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s2o2)));
        p3 = _mm256_setr_m128i(
            _mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s2o2)),
            _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s2o2)));
        max = _mm256_max_epi16(
            _mm256_max_epi16(
                max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        max = _mm256_max_epi16(
            _mm256_max_epi16(
                max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
        min = _mm256_min_epi16(
            _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1),
                             p2),
            p3);
        p0 = constrain16(p0, row, sec_strength_256, sec_damping);
        p1 = constrain16(p1, row, sec_strength_256, sec_damping);
        p2 = constrain16(p2, row, sec_strength_256, sec_damping);
        p3 = constrain16(p3, row, sec_strength_256, sec_damping);

        // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
        sum = _mm256_add_epi16(
            sum,
            _mm256_mullo_epi16(sec_taps_1,
                               _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                                _mm256_add_epi16(p2, p3))));

        // res = row + ((sum - (sum < 0) + 8) >> 4)
        /* cmpgt yields -1 for negative lanes, implementing
           round-toward-zero. */
        sum = _mm256_add_epi16(sum,
                               _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
        res = _mm256_add_epi16(sum, duplicate_8);
        res = _mm256_srai_epi16(res, 4);
        res = _mm256_add_epi16(row, res);
        /* Clamp to the observed min/max so filtering never creates values
           outside the local sample range. */
        res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);
        res = _mm256_packus_epi16(res, res);
        /* After packus, each 8-pixel row occupies one 64-bit field; row i is
           in the high lane (index 2), row i+1 in the low lane (index 0). */
        *(int64_t *)(dst + i * dstride) = _mm256_extract_epi64(res, 2);
        *(int64_t *)(dst + (i + 1) * dstride) = _mm256_extract_epi64(res, 0);
    }
}
571 :
572 0 : static void eb_cdef_filter_block_4x4_16_avx2(
573 : uint16_t *dst, int32_t dstride, const uint16_t *in, int32_t pri_strength,
574 : int32_t sec_strength, int32_t dir, int32_t pri_damping, int32_t sec_damping,
575 : int32_t coeff_shift) {
576 : __m256i p0, p1, p2, p3, sum, row, res;
577 0 : __m256i max, min, large = _mm256_set1_epi16(CDEF_VERY_LARGE);
578 0 : int32_t po1 = eb_cdef_directions[dir][0];
579 0 : int32_t po2 = eb_cdef_directions[dir][1];
580 0 : int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
581 0 : int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
582 0 : int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
583 0 : int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];
584 :
585 0 : const int32_t *pri_taps =
586 0 : eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
587 0 : const int32_t *sec_taps =
588 0 : eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
589 0 : __m256i pri_strength_256 = _mm256_set1_epi16(pri_strength);
590 0 : __m256i sec_strength_256 = _mm256_set1_epi16(sec_strength);
591 :
592 0 : if (pri_strength)
593 0 : pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
594 0 : if (sec_strength)
595 0 : sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
596 0 : sum = _mm256_setzero_si256();
597 0 : row = _mm256_set_epi64x(*(uint64_t *)(in),
598 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE),
599 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE),
600 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE));
601 0 : min = max = row;
602 :
603 : // Primary near taps
604 0 : p0 = _mm256_set_epi64x(*(uint64_t *)(in + po1),
605 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po1),
606 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po1),
607 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po1));
608 0 : p1 = _mm256_set_epi64x(*(uint64_t *)(in - po1),
609 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po1),
610 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po1),
611 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po1));
612 :
613 0 : max = _mm256_max_epi16(
614 : _mm256_max_epi16(
615 : max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
616 : _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
617 0 : min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
618 0 : p0 = constrain16(p0, row, pri_strength_256, pri_damping);
619 0 : p1 = constrain16(p1, row, pri_strength_256, pri_damping);
620 :
621 : // sum += pri_taps[0] * (p0 + p1)
622 0 : sum = _mm256_add_epi16(sum,
623 0 : _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[0]),
624 : _mm256_add_epi16(p0, p1)));
625 :
626 : // Primary far taps
627 0 : p0 = _mm256_set_epi64x(*(uint64_t *)(in + po2),
628 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po2),
629 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po2),
630 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po2));
631 0 : p1 = _mm256_set_epi64x(*(uint64_t *)(in - po2),
632 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po2),
633 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po2),
634 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po2));
635 0 : max = _mm256_max_epi16(
636 : _mm256_max_epi16(
637 : max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
638 : _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
639 0 : min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
640 0 : p0 = constrain16(p0, row, pri_strength_256, pri_damping);
641 0 : p1 = constrain16(p1, row, pri_strength_256, pri_damping);
642 :
643 : // sum += pri_taps[1] * (p0 + p1)
644 0 : sum = _mm256_add_epi16(sum,
645 0 : _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[1]),
646 : _mm256_add_epi16(p0, p1)));
647 :
648 : // Secondary near taps
649 0 : p0 = _mm256_set_epi64x(*(uint64_t *)(in + s1o1),
650 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o1),
651 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o1),
652 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o1));
653 0 : p1 = _mm256_set_epi64x(*(uint64_t *)(in - s1o1),
654 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o1),
655 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o1),
656 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o1));
657 0 : p2 = _mm256_set_epi64x(*(uint64_t *)(in + s2o1),
658 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o1),
659 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o1),
660 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o1));
661 0 : p3 = _mm256_set_epi64x(*(uint64_t *)(in - s2o1),
662 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o1),
663 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o1),
664 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o1));
665 0 : max = _mm256_max_epi16(
666 : _mm256_max_epi16(
667 : max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
668 : _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
669 0 : max = _mm256_max_epi16(
670 : _mm256_max_epi16(
671 : max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
672 : _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
673 0 : min = _mm256_min_epi16(
674 : _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2),
675 : p3);
676 0 : p0 = constrain16(p0, row, sec_strength_256, sec_damping);
677 0 : p1 = constrain16(p1, row, sec_strength_256, sec_damping);
678 0 : p2 = constrain16(p2, row, sec_strength_256, sec_damping);
679 0 : p3 = constrain16(p3, row, sec_strength_256, sec_damping);
680 :
681 : // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
682 0 : sum = _mm256_add_epi16(
683 : sum,
684 0 : _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[0]),
685 : _mm256_add_epi16(_mm256_add_epi16(p0, p1),
686 : _mm256_add_epi16(p2, p3))));
687 :
688 : // Secondary far taps
689 0 : p0 = _mm256_set_epi64x(*(uint64_t *)(in + s1o2),
690 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o2),
691 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o2),
692 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o2));
693 0 : p1 = _mm256_set_epi64x(*(uint64_t *)(in - s1o2),
694 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o2),
695 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o2),
696 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o2));
697 0 : p2 = _mm256_set_epi64x(*(uint64_t *)(in + s2o2),
698 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o2),
699 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o2),
700 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o2));
701 0 : p3 = _mm256_set_epi64x(*(uint64_t *)(in - s2o2),
702 0 : *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o2),
703 0 : *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o2),
704 0 : *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o2));
705 0 : max = _mm256_max_epi16(
706 : _mm256_max_epi16(
707 : max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
708 : _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
709 0 : max = _mm256_max_epi16(
710 : _mm256_max_epi16(
711 : max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
712 : _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
713 0 : min = _mm256_min_epi16(
714 : _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2),
715 : p3);
716 0 : p0 = constrain16(p0, row, sec_strength_256, sec_damping);
717 0 : p1 = constrain16(p1, row, sec_strength_256, sec_damping);
718 0 : p2 = constrain16(p2, row, sec_strength_256, sec_damping);
719 0 : p3 = constrain16(p3, row, sec_strength_256, sec_damping);
720 :
721 : // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
722 0 : sum = _mm256_add_epi16(
723 : sum,
724 0 : _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[1]),
725 : _mm256_add_epi16(_mm256_add_epi16(p0, p1),
726 : _mm256_add_epi16(p2, p3))));
727 :
728 : // res = row + ((sum - (sum < 0) + 8) >> 4)
729 : sum =
730 0 : _mm256_add_epi16(sum, _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
731 0 : res = _mm256_add_epi16(sum, _mm256_set1_epi16(8));
732 0 : res = _mm256_srai_epi16(res, 4);
733 0 : res = _mm256_add_epi16(row, res);
734 0 : res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);
735 :
736 0 : *(uint64_t *)(dst) = _mm256_extract_epi64(res, 3);
737 0 : *(uint64_t *)(dst + 1 * dstride) = _mm256_extract_epi64(res, 2);
738 0 : *(uint64_t *)(dst + 2 * dstride) = _mm256_extract_epi64(res, 1);
739 0 : *(uint64_t *)(dst + 3 * dstride) = _mm256_extract_epi64(res, 0);
740 0 : }
741 :
742 0 : static INLINE void cdef_filter_block_8x8_16_pri_avx2(
743 : const uint16_t *const in, const int32_t pri_damping, const int32_t po,
744 : const __m256i row, const __m256i pri_strength_256, const __m256i pri_taps,
745 : __m256i *const max, __m256i *const min, __m256i *const sum) {
746 0 : const __m256i large = _mm256_set1_epi16(CDEF_VERY_LARGE);
747 0 : const __m256i p0 = loadu_u16_8x2_avx2(in + po, CDEF_BSTRIDE);
748 0 : const __m256i p1 = loadu_u16_8x2_avx2(in - po, CDEF_BSTRIDE);
749 :
750 0 : *max = _mm256_max_epi16(
751 : _mm256_max_epi16(
752 : *max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
753 : _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
754 0 : *min = _mm256_min_epi16(_mm256_min_epi16(*min, p0), p1);
755 :
756 0 : const __m256i q0 = constrain16(p0, row, pri_strength_256, pri_damping);
757 0 : const __m256i q1 = constrain16(p1, row, pri_strength_256, pri_damping);
758 :
759 : // sum += pri_taps * (p0 + p1)
760 0 : *sum = _mm256_add_epi16(
761 : *sum, _mm256_mullo_epi16(pri_taps, _mm256_add_epi16(q0, q1)));
762 0 : }
763 :
764 0 : static INLINE void cdef_filter_block_8x8_16_sec_avx2(
765 : const uint16_t *const in, const int32_t sec_damping, const int32_t so1,
766 : const int32_t so2, const __m256i row, const __m256i sec_strength_256,
767 : const __m256i sec_taps, __m256i *const max, __m256i *const min,
768 : __m256i *const sum) {
769 0 : const __m256i large = _mm256_set1_epi16(CDEF_VERY_LARGE);
770 0 : const __m256i p0 = loadu_u16_8x2_avx2(in + so1, CDEF_BSTRIDE);
771 0 : const __m256i p1 = loadu_u16_8x2_avx2(in - so1, CDEF_BSTRIDE);
772 0 : const __m256i p2 = loadu_u16_8x2_avx2(in + so2, CDEF_BSTRIDE);
773 0 : const __m256i p3 = loadu_u16_8x2_avx2(in - so2, CDEF_BSTRIDE);
774 :
775 0 : *max = _mm256_max_epi16(
776 : _mm256_max_epi16(
777 : *max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
778 : _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
779 0 : *max = _mm256_max_epi16(
780 : _mm256_max_epi16(
781 : *max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
782 : _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
783 0 : *min = _mm256_min_epi16(
784 : _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(*min, p0), p1), p2),
785 : p3);
786 :
787 0 : const __m256i q0 = constrain16(p0, row, sec_strength_256, sec_damping);
788 0 : const __m256i q1 = constrain16(p1, row, sec_strength_256, sec_damping);
789 0 : const __m256i q2 = constrain16(p2, row, sec_strength_256, sec_damping);
790 0 : const __m256i q3 = constrain16(p3, row, sec_strength_256, sec_damping);
791 :
792 : // sum += sec_taps * (p0 + p1 + p2 + p3)
793 0 : *sum = _mm256_add_epi16(
794 : *sum,
795 : _mm256_mullo_epi16(sec_taps,
796 : _mm256_add_epi16(_mm256_add_epi16(q0, q1),
797 : _mm256_add_epi16(q2, q3))));
798 0 : }
799 :
800 0 : void eb_cdef_filter_block_8x8_16_avx2(
801 : const uint16_t *const in, const int32_t pri_strength,
802 : const int32_t sec_strength, const int32_t dir, int32_t pri_damping,
803 : int32_t sec_damping, const int32_t coeff_shift, uint16_t *const dst,
804 : const int32_t dstride) {
805 0 : const int32_t po1 = eb_cdef_directions[dir][0];
806 0 : const int32_t po2 = eb_cdef_directions[dir][1];
807 0 : const int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
808 0 : const int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
809 0 : const int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
810 0 : const int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];
811 : // SSE CHKN
812 0 : const int32_t *pri_taps =
813 0 : eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
814 0 : const int32_t *sec_taps =
815 0 : eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
816 : int32_t i;
817 0 : const __m256i pri_taps_0 = _mm256_set1_epi16(pri_taps[0]);
818 0 : const __m256i pri_taps_1 = _mm256_set1_epi16(pri_taps[1]);
819 0 : const __m256i sec_taps_0 = _mm256_set1_epi16(sec_taps[0]);
820 0 : const __m256i sec_taps_1 = _mm256_set1_epi16(sec_taps[1]);
821 0 : const __m256i duplicate_8 = _mm256_set1_epi16(8);
822 0 : const __m256i pri_strength_256 = _mm256_set1_epi16(pri_strength);
823 0 : const __m256i sec_strength_256 = _mm256_set1_epi16(sec_strength);
824 :
825 0 : if (pri_strength)
826 0 : pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
827 0 : if (sec_strength)
828 0 : sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
829 :
830 0 : for (i = 0; i < 8; i += 2) {
831 : const __m256i row =
832 0 : loadu_u16_8x2_avx2(in + i * CDEF_BSTRIDE, CDEF_BSTRIDE);
833 : __m256i sum, res, max, min;
834 :
835 0 : min = max = row;
836 0 : sum = _mm256_setzero_si256();
837 :
838 : // Primary near taps
839 0 : cdef_filter_block_8x8_16_pri_avx2(in + i * CDEF_BSTRIDE,
840 : pri_damping,
841 : po1,
842 : row,
843 : pri_strength_256,
844 : pri_taps_0,
845 : &max,
846 : &min,
847 : &sum);
848 :
849 : // Primary far taps
850 0 : cdef_filter_block_8x8_16_pri_avx2(in + i * CDEF_BSTRIDE,
851 : pri_damping,
852 : po2,
853 : row,
854 : pri_strength_256,
855 : pri_taps_1,
856 : &max,
857 : &min,
858 : &sum);
859 :
860 : // Secondary near taps
861 0 : cdef_filter_block_8x8_16_sec_avx2(in + i * CDEF_BSTRIDE,
862 : sec_damping,
863 : s1o1,
864 : s2o1,
865 : row,
866 : sec_strength_256,
867 : sec_taps_0,
868 : &max,
869 : &min,
870 : &sum);
871 :
872 : // Secondary far taps
873 0 : cdef_filter_block_8x8_16_sec_avx2(in + i * CDEF_BSTRIDE,
874 : sec_damping,
875 : s1o2,
876 : s2o2,
877 : row,
878 : sec_strength_256,
879 : sec_taps_1,
880 : &max,
881 : &min,
882 : &sum);
883 :
884 : // res = row + ((sum - (sum < 0) + 8) >> 4)
885 0 : sum = _mm256_add_epi16(sum,
886 : _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
887 0 : res = _mm256_add_epi16(sum, duplicate_8);
888 0 : res = _mm256_srai_epi16(res, 4);
889 0 : res = _mm256_add_epi16(row, res);
890 0 : res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);
891 0 : _mm_storeu_si128((__m128i *)&dst[i * dstride],
892 : _mm256_castsi256_si128(res));
893 0 : _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride],
894 0 : _mm256_extracti128_si256(res, 1));
895 : }
896 0 : }
897 :
898 3402340 : void eb_cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int32_t dstride,
899 : const uint16_t *in, int32_t pri_strength,
900 : int32_t sec_strength, int32_t dir,
901 : int32_t pri_damping, int32_t sec_damping,
902 : int32_t bsize, int32_t coeff_shift) {
903 3402340 : if (dst8) {
904 3459380 : if (bsize == BLOCK_8X8) {
905 1180920 : eb_cdef_filter_block_8x8_8_avx2(dst8,
906 : dstride,
907 : in,
908 : pri_strength,
909 : sec_strength,
910 : dir,
911 : pri_damping,
912 : sec_damping,
913 : coeff_shift);
914 : }
915 2278450 : else if (bsize == BLOCK_4X8) {
916 0 : eb_cdef_filter_block_4x4_8_avx2(dst8,
917 : dstride,
918 : in,
919 : pri_strength,
920 : sec_strength,
921 : dir,
922 : pri_damping,
923 : sec_damping,
924 : coeff_shift);
925 0 : eb_cdef_filter_block_4x4_8_avx2(dst8 + 4 * dstride,
926 : dstride,
927 : in + 4 * CDEF_BSTRIDE,
928 : pri_strength,
929 : sec_strength,
930 : dir,
931 : pri_damping,
932 : sec_damping,
933 : coeff_shift);
934 : }
935 2278450 : else if (bsize == BLOCK_8X4) {
936 0 : eb_cdef_filter_block_4x4_8_avx2(dst8,
937 : dstride,
938 : in,
939 : pri_strength,
940 : sec_strength,
941 : dir,
942 : pri_damping,
943 : sec_damping,
944 : coeff_shift);
945 0 : eb_cdef_filter_block_4x4_8_avx2(dst8 + 4,
946 : dstride,
947 : in + 4,
948 : pri_strength,
949 : sec_strength,
950 : dir,
951 : pri_damping,
952 : sec_damping,
953 : coeff_shift);
954 : }
955 : else {
956 2278450 : eb_cdef_filter_block_4x4_8_avx2(dst8,
957 : dstride,
958 : in,
959 : pri_strength,
960 : sec_strength,
961 : dir,
962 : pri_damping,
963 : sec_damping,
964 : coeff_shift);
965 : }
966 : }
967 : else {
968 0 : if (bsize == BLOCK_8X8) {
969 0 : eb_cdef_filter_block_8x8_16(in,
970 : pri_strength,
971 : sec_strength,
972 : dir,
973 : pri_damping,
974 : sec_damping,
975 : coeff_shift,
976 : dst16,
977 : dstride);
978 : }
979 0 : else if (bsize == BLOCK_4X8) {
980 0 : eb_cdef_filter_block_4x4_16_avx2(dst16,
981 : dstride,
982 : in,
983 : pri_strength,
984 : sec_strength,
985 : dir,
986 : pri_damping,
987 : sec_damping,
988 : coeff_shift);
989 0 : eb_cdef_filter_block_4x4_16_avx2(dst16 + 4 * dstride,
990 : dstride,
991 : in + 4 * CDEF_BSTRIDE,
992 : pri_strength,
993 : sec_strength,
994 : dir,
995 : pri_damping,
996 : sec_damping,
997 : coeff_shift);
998 : }
999 0 : else if (bsize == BLOCK_8X4) {
1000 0 : eb_cdef_filter_block_4x4_16_avx2(dst16,
1001 : dstride,
1002 : in,
1003 : pri_strength,
1004 : sec_strength,
1005 : dir,
1006 : pri_damping,
1007 : sec_damping,
1008 : coeff_shift);
1009 0 : eb_cdef_filter_block_4x4_16_avx2(dst16 + 4,
1010 : dstride,
1011 : in + 4,
1012 : pri_strength,
1013 : sec_strength,
1014 : dir,
1015 : pri_damping,
1016 : sec_damping,
1017 : coeff_shift);
1018 : }
1019 : else {
1020 : assert(bsize == BLOCK_4X4);
1021 0 : eb_cdef_filter_block_4x4_16_avx2(dst16,
1022 : dstride,
1023 : in,
1024 : pri_strength,
1025 : sec_strength,
1026 : dir,
1027 : pri_damping,
1028 : sec_damping,
1029 : coeff_shift);
1030 : }
1031 : }
1032 3502980 : }
1033 :
/* Widen a v x h rectangle of 8-bit pixels to 16 bits.
   The bulk is done 8 pixels at a time (zero-extend via unpack with zero);
   a scalar tail handles widths that are not a multiple of 8. */
void eb_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int32_t dstride,
                                      const uint8_t *src, int32_t sstride,
                                      int32_t v, int32_t h) {
    const __m128i zero = _mm_setzero_si128();
    const int32_t w8 = h & ~0x7;  // widest multiple of 8 not exceeding h
    for (int32_t r = 0; r < v; r++) {
        const uint8_t *const s = src + r * sstride;
        uint16_t *const d = dst + r * dstride;
        int32_t c = 0;
        for (; c < w8; c += 8) {
            const __m128i bytes = _mm_loadl_epi64((__m128i *)(s + c));
            _mm_storeu_si128((__m128i *)(d + c),
                             _mm_unpacklo_epi8(bytes, zero));
        }
        for (; c < h; c++)
            d[c] = s[c];
    }
}
1048 :
/* Copy a v x h rectangle of 16-bit pixels between buffers with
   independent strides. The bulk is copied 8 pixels (128 bits) at a time;
   a scalar tail handles widths that are not a multiple of 8. */
void eb_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int32_t dstride,
                                       const uint16_t *src, int32_t sstride,
                                       int32_t v, int32_t h) {
    const int32_t w8 = h & ~0x7;  // widest multiple of 8 not exceeding h
    for (int32_t r = 0; r < v; r++) {
        const uint16_t *const s = src + r * sstride;
        uint16_t *const d = dst + r * dstride;
        int32_t c = 0;
        for (; c < w8; c += 8) {
            // Unaligned 128-bit load/store (equivalent to the lddqu form).
            const __m128i pixels = _mm_loadu_si128((__m128i *)(s + c));
            _mm_storeu_si128((__m128i *)(d + c), pixels);
        }
        for (; c < h; c++)
            d[c] = s[c];
    }
}
|