Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include "EbDefinitions.h"
13 : #include <immintrin.h>
14 : #include "aom_dsp_rtcd.h"
15 : #include "EbRestoration.h"
16 : #include "synonyms.h"
17 : #include "synonyms_avx2.h"
18 : #include "transpose_avx2.h"
19 : #include "transpose_sse2.h"
20 :
21 15731100 : static INLINE void cvt_16to32bit_8x8(const __m128i s[8], __m256i r[8]) {
22 15731100 : r[0] = _mm256_cvtepu16_epi32(s[0]);
23 15731100 : r[1] = _mm256_cvtepu16_epi32(s[1]);
24 15731100 : r[2] = _mm256_cvtepu16_epi32(s[2]);
25 15731100 : r[3] = _mm256_cvtepu16_epi32(s[3]);
26 15731100 : r[4] = _mm256_cvtepu16_epi32(s[4]);
27 15731100 : r[5] = _mm256_cvtepu16_epi32(s[5]);
28 15731100 : r[6] = _mm256_cvtepu16_epi32(s[6]);
29 15731100 : r[7] = _mm256_cvtepu16_epi32(s[7]);
30 15731100 : }
31 :
32 31440100 : static INLINE void add_32bit_8x8(const __m256i neighbor, __m256i r[8]) {
33 31440100 : r[0] = _mm256_add_epi32(neighbor, r[0]);
34 31440100 : r[1] = _mm256_add_epi32(r[0], r[1]);
35 31440100 : r[2] = _mm256_add_epi32(r[1], r[2]);
36 31440100 : r[3] = _mm256_add_epi32(r[2], r[3]);
37 31440100 : r[4] = _mm256_add_epi32(r[3], r[4]);
38 31440100 : r[5] = _mm256_add_epi32(r[4], r[5]);
39 31440100 : r[6] = _mm256_add_epi32(r[5], r[6]);
40 31440100 : r[7] = _mm256_add_epi32(r[6], r[7]);
41 31440100 : }
42 :
43 15730200 : static INLINE void store_32bit_8x8(const __m256i r[8], int32_t *const buf,
44 : const int32_t buf_stride) {
45 15730200 : _mm256_store_si256((__m256i *)(buf + 0 * buf_stride), r[0]);
46 15730200 : _mm256_store_si256((__m256i *)(buf + 1 * buf_stride), r[1]);
47 15730200 : _mm256_store_si256((__m256i *)(buf + 2 * buf_stride), r[2]);
48 15730200 : _mm256_store_si256((__m256i *)(buf + 3 * buf_stride), r[3]);
49 15730200 : _mm256_store_si256((__m256i *)(buf + 4 * buf_stride), r[4]);
50 15730200 : _mm256_store_si256((__m256i *)(buf + 5 * buf_stride), r[5]);
51 15730200 : _mm256_store_si256((__m256i *)(buf + 6 * buf_stride), r[6]);
52 15730200 : _mm256_store_si256((__m256i *)(buf + 7 * buf_stride), r[7]);
53 15730200 : }
54 :
55 : static AOM_FORCE_INLINE void integral_images(const uint8_t *src,
56 : int32_t src_stride, int32_t width, int32_t height, int32_t *C, int32_t *D,
57 : int32_t buf_stride) {
58 186960 : const uint8_t *srcT = src;
59 186960 : int32_t *CT = C + buf_stride + 1;
60 186960 : int32_t *DT = D + buf_stride + 1;
61 :
62 186960 : memset(C, 0, sizeof(*C) * (width + 8));
63 186960 : memset(D, 0, sizeof(*D) * (width + 8));
64 :
65 186960 : int y = 0;
66 : do {
67 1138560 : __m256i CLeft = _mm256_setzero_si256();
68 1138560 : __m256i DLeft = _mm256_setzero_si256();
69 :
70 : // Zero the left column.
71 1138560 : CT[0 * buf_stride - 1] = DT[0 * buf_stride - 1] = 0;
72 1138560 : CT[1 * buf_stride - 1] = DT[1 * buf_stride - 1] = 0;
73 1138560 : CT[2 * buf_stride - 1] = DT[2 * buf_stride - 1] = 0;
74 1138560 : CT[3 * buf_stride - 1] = DT[3 * buf_stride - 1] = 0;
75 1138560 : CT[4 * buf_stride - 1] = DT[4 * buf_stride - 1] = 0;
76 1138560 : CT[5 * buf_stride - 1] = DT[5 * buf_stride - 1] = 0;
77 1138560 : CT[6 * buf_stride - 1] = DT[6 * buf_stride - 1] = 0;
78 1138560 : CT[7 * buf_stride - 1] = DT[7 * buf_stride - 1] = 0;
79 :
80 1138560 : int x = 0;
81 : do {
82 : __m128i s[8];
83 : __m256i r32[8];
84 :
85 7869840 : s[0] = _mm_loadl_epi64((__m128i *)(srcT + 0 * src_stride + x));
86 7869840 : s[1] = _mm_loadl_epi64((__m128i *)(srcT + 1 * src_stride + x));
87 7869840 : s[2] = _mm_loadl_epi64((__m128i *)(srcT + 2 * src_stride + x));
88 7869840 : s[3] = _mm_loadl_epi64((__m128i *)(srcT + 3 * src_stride + x));
89 7869840 : s[4] = _mm_loadl_epi64((__m128i *)(srcT + 4 * src_stride + x));
90 7869840 : s[5] = _mm_loadl_epi64((__m128i *)(srcT + 5 * src_stride + x));
91 7869840 : s[6] = _mm_loadl_epi64((__m128i *)(srcT + 6 * src_stride + x));
92 7869840 : s[7] = _mm_loadl_epi64((__m128i *)(srcT + 7 * src_stride + x));
93 :
94 7869840 : partial_transpose_8bit_8x8(s, s);
95 :
96 15741200 : s[7] = _mm_unpackhi_epi8(s[3], _mm_setzero_si128());
97 15741200 : s[6] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
98 15741200 : s[5] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
99 15741200 : s[4] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
100 15741200 : s[3] = _mm_unpackhi_epi8(s[1], _mm_setzero_si128());
101 15741200 : s[2] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
102 15741200 : s[1] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
103 7870610 : s[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
104 :
105 7870610 : cvt_16to32bit_8x8(s, r32);
106 7870400 : add_32bit_8x8(DLeft, r32);
107 7870310 : DLeft = r32[7];
108 :
109 7870310 : transpose_32bit_8x8_avx2(r32, r32);
110 :
111 : const __m256i DTop =
112 7870400 : _mm256_load_si256((__m256i *)(DT - buf_stride + x));
113 7870400 : add_32bit_8x8(DTop, r32);
114 7869730 : store_32bit_8x8(r32, DT + x, buf_stride);
115 :
116 7869230 : s[0] = _mm_mullo_epi16(s[0], s[0]);
117 7869230 : s[1] = _mm_mullo_epi16(s[1], s[1]);
118 7869230 : s[2] = _mm_mullo_epi16(s[2], s[2]);
119 7869230 : s[3] = _mm_mullo_epi16(s[3], s[3]);
120 7869230 : s[4] = _mm_mullo_epi16(s[4], s[4]);
121 7869230 : s[5] = _mm_mullo_epi16(s[5], s[5]);
122 7869230 : s[6] = _mm_mullo_epi16(s[6], s[6]);
123 7869230 : s[7] = _mm_mullo_epi16(s[7], s[7]);
124 :
125 7869230 : cvt_16to32bit_8x8(s, r32);
126 7869430 : add_32bit_8x8(CLeft, r32);
127 7869540 : CLeft = r32[7];
128 :
129 7869540 : transpose_32bit_8x8_avx2(r32, r32);
130 :
131 : const __m256i CTop =
132 7870260 : _mm256_load_si256((__m256i *)(CT - buf_stride + x));
133 7870260 : add_32bit_8x8(CTop, r32);
134 7869690 : store_32bit_8x8(r32, CT + x, buf_stride);
135 7869840 : x += 8;
136 7869840 : } while (x < width);
137 :
138 1138560 : srcT += 8 * src_stride;
139 1138560 : CT += 8 * buf_stride;
140 1138560 : DT += 8 * buf_stride;
141 1138560 : y += 8;
142 1138560 : } while (y < height);
143 186958 : }
144 :
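/* Scalar reference sketch (illustration only, not part of the build) of what
 * integral_images() computes, using a simplified layout in which the zero row
 * and zero column sit at index 0 rather than at the exact offsets used by the
 * AVX2 code above. D accumulates plain sums and C sums of squares, so any box
 * sum can later be read back with four lookups (see boxsum_from_ii below).
 * The helper name is ours. */
static INLINE void integral_images_c_sketch(const uint8_t *src,
                                            int32_t src_stride, int32_t width,
                                            int32_t height, int32_t *C,
                                            int32_t *D, int32_t buf_stride) {
    memset(C, 0, sizeof(*C) * (width + 1)); /* zero top row */
    memset(D, 0, sizeof(*D) * (width + 1));
    for (int32_t y = 0; y < height; y++) {
        int32_t *c = C + (y + 1) * buf_stride;
        int32_t *d = D + (y + 1) * buf_stride;
        c[0] = d[0] = 0; /* zero left column */
        for (int32_t x = 0; x < width; x++) {
            const int32_t p = src[y * src_stride + x];
            /* integral(x, y) = value + left + above - above-left */
            d[x + 1] = p + d[x] + d[x + 1 - buf_stride] - d[x - buf_stride];
            c[x + 1] = p * p + c[x] + c[x + 1 - buf_stride] - c[x - buf_stride];
        }
    }
}
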
145 : static AOM_FORCE_INLINE void integral_images_highbd(const uint16_t *src,
146 : int32_t src_stride, int32_t width, int32_t height, int32_t *C, int32_t *D,
147 : int32_t buf_stride) {
148 0 : const uint16_t *srcT = src;
149 0 : int32_t *CT = C + buf_stride + 1;
150 0 : int32_t *DT = D + buf_stride + 1;
151 :
152 0 : memset(C, 0, sizeof(*C) * (width + 8));
153 0 : memset(D, 0, sizeof(*D) * (width + 8));
154 :
155 0 : int y = 0;
156 : do {
157 0 : __m256i CLeft = _mm256_setzero_si256();
158 0 : __m256i DLeft = _mm256_setzero_si256();
159 :
160 : // Zero the left column.
161 0 : CT[0 * buf_stride - 1] = DT[0 * buf_stride - 1] = 0;
162 0 : CT[1 * buf_stride - 1] = DT[1 * buf_stride - 1] = 0;
163 0 : CT[2 * buf_stride - 1] = DT[2 * buf_stride - 1] = 0;
164 0 : CT[3 * buf_stride - 1] = DT[3 * buf_stride - 1] = 0;
165 0 : CT[4 * buf_stride - 1] = DT[4 * buf_stride - 1] = 0;
166 0 : CT[5 * buf_stride - 1] = DT[5 * buf_stride - 1] = 0;
167 0 : CT[6 * buf_stride - 1] = DT[6 * buf_stride - 1] = 0;
168 0 : CT[7 * buf_stride - 1] = DT[7 * buf_stride - 1] = 0;
169 :
170 0 : int x = 0;
171 : do {
172 : __m128i s[8];
173 : __m256i r32[8], a32[8];
174 :
175 0 : s[0] = _mm_loadu_si128((__m128i *)(srcT + 0 * src_stride + x));
176 0 : s[1] = _mm_loadu_si128((__m128i *)(srcT + 1 * src_stride + x));
177 0 : s[2] = _mm_loadu_si128((__m128i *)(srcT + 2 * src_stride + x));
178 0 : s[3] = _mm_loadu_si128((__m128i *)(srcT + 3 * src_stride + x));
179 0 : s[4] = _mm_loadu_si128((__m128i *)(srcT + 4 * src_stride + x));
180 0 : s[5] = _mm_loadu_si128((__m128i *)(srcT + 5 * src_stride + x));
181 0 : s[6] = _mm_loadu_si128((__m128i *)(srcT + 6 * src_stride + x));
182 0 : s[7] = _mm_loadu_si128((__m128i *)(srcT + 7 * src_stride + x));
183 :
184 0 : transpose_16bit_8x8(s, s);
185 :
186 0 : cvt_16to32bit_8x8(s, r32);
187 :
188 0 : a32[0] = _mm256_madd_epi16(r32[0], r32[0]);
189 0 : a32[1] = _mm256_madd_epi16(r32[1], r32[1]);
190 0 : a32[2] = _mm256_madd_epi16(r32[2], r32[2]);
191 0 : a32[3] = _mm256_madd_epi16(r32[3], r32[3]);
192 0 : a32[4] = _mm256_madd_epi16(r32[4], r32[4]);
193 0 : a32[5] = _mm256_madd_epi16(r32[5], r32[5]);
194 0 : a32[6] = _mm256_madd_epi16(r32[6], r32[6]);
195 0 : a32[7] = _mm256_madd_epi16(r32[7], r32[7]);
196 :
197 0 : add_32bit_8x8(CLeft, a32);
198 0 : CLeft = a32[7];
199 :
200 0 : transpose_32bit_8x8_avx2(a32, a32);
201 :
202 : const __m256i CTop =
203 0 : _mm256_load_si256((__m256i *)(CT - buf_stride + x));
204 0 : add_32bit_8x8(CTop, a32);
205 0 : store_32bit_8x8(a32, CT + x, buf_stride);
206 :
207 0 : add_32bit_8x8(DLeft, r32);
208 0 : DLeft = r32[7];
209 :
210 0 : transpose_32bit_8x8_avx2(r32, r32);
211 :
212 : const __m256i DTop =
213 0 : _mm256_load_si256((__m256i *)(DT - buf_stride + x));
214 0 : add_32bit_8x8(DTop, r32);
215 0 : store_32bit_8x8(r32, DT + x, buf_stride);
216 0 : x += 8;
217 0 : } while (x < width);
218 :
219 0 : srcT += 8 * src_stride;
220 0 : CT += 8 * buf_stride;
221 0 : DT += 8 * buf_stride;
222 0 : y += 8;
223 0 : } while (y < height);
224 0 : }
225 :
226 : // Compute 8 values of boxsum from the given integral image. ii should point
227 : // at the middle of the box (for the first value). r is the box radius.
228 138282000 : static INLINE __m256i boxsum_from_ii(const int32_t *ii, int32_t stride,
229 : int32_t r) {
230 138282000 : const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
231 138252000 : const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
232 138176000 : const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
233 138126000 : const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
234 138084000 : const __m256i u = _mm256_sub_epi32(tr, tl);
235 138084000 : const __m256i v = _mm256_sub_epi32(br, bl);
236 138084000 : return _mm256_sub_epi32(v, u);
237 : }
238 :
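/* Scalar sketch (illustration only; helper name is ours) of the lookup done by
 * boxsum_from_ii() for a single output position: with an integral image, the
 * sum over any (2r+1)x(2r+1) box is just four corner reads. The AVX2 version
 * evaluates 8 consecutive positions per call. */
static INLINE int32_t boxsum_from_ii_c_sketch(const int32_t *ii, int32_t stride,
                                              int32_t r) {
    const int32_t tl = ii[-(r + 1) - (r + 1) * stride];
    const int32_t tr = ii[(r + 0) - (r + 1) * stride];
    const int32_t bl = ii[-(r + 1) + r * stride];
    const int32_t br = ii[(r + 0) + r * stride];
    return (br - bl) - (tr - tl); /* = br - bl - tr + tl */
}
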
239 1069420 : static INLINE __m256i round_for_shift(unsigned shift) {
240 2138840 : return _mm256_set1_epi32((1 << shift) >> 1);
241 : }
242 :
243 69096000 : static INLINE __m256i compute_p(__m256i sum1, __m256i sum2, int32_t n) {
244 69096000 : const __m256i bb = _mm256_madd_epi16(sum1, sum1);
245 138192000 : const __m256i an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
246 69096000 : return _mm256_sub_epi32(an, bb);
247 : }
248 :
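/* Worked form of compute_p() (illustration only; helper name is ours): for a
 * box of n pixels x_i with sum1 = sum(x_i) and sum2 = sum(x_i^2),
 *   p = n * sum2 - sum1^2 = n^2 * Var(x),
 * i.e. a scaled measure of how flat the box is. The madd_epi16 above is safe
 * for sum1 * sum1 in the 8-bit path because sum1 <= 25 * 255 < 2^15. */
static INLINE int32_t compute_p_c_sketch(int32_t sum1, int32_t sum2,
                                         int32_t n) {
    return n * sum2 - sum1 * sum1;
}
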
249 0 : static INLINE __m256i compute_p_highbd(__m256i sum1, __m256i sum2,
250 : int32_t bit_depth, int32_t n) {
251 0 : const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
252 0 : const __m256i rounding_b = round_for_shift(bit_depth - 8);
253 0 : const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
254 0 : const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
255 : const __m256i a =
256 0 : _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
257 : const __m256i b =
258 0 : _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
259 : // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
260 : // mullo to square it
261 0 : const __m256i bb = _mm256_madd_epi16(b, b);
262 : const __m256i an =
263 0 : _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);
264 0 : return _mm256_sub_epi32(an, bb);
265 : }
266 :
267 : // Assumes that C, D are integral images for the original buffer which has been
268 : // extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
269 : // on the sides. A, B, C, D point at logical position (0, 0).
270 : static AOM_FORCE_INLINE void calc_ab(int32_t *A, int32_t *B, const int32_t *C,
271 : const int32_t *D, int32_t width, int32_t height, int32_t buf_stride,
272 : int32_t bit_depth, int32_t sgr_params_idx, int32_t radius_idx) {
273 160141 : const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
274 160141 : const int32_t r = params->r[radius_idx];
275 160141 : const int32_t n = (2 * r + 1) * (2 * r + 1);
276 160141 : const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
277 : // eb_one_by_x[n - 1] is 2^12/n, so it easily fits in an int16
278 160141 : const __m256i one_over_n = _mm256_set1_epi32(eb_one_by_x[n - 1]);
279 160141 : const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
280 160142 : const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
281 :
282 160141 : A -= buf_stride + 1;
283 160141 : B -= buf_stride + 1;
284 160141 : C -= buf_stride + 1;
285 160141 : D -= buf_stride + 1;
286 :
287 160141 : int32_t i = height + 2;
288 :
289 160141 : if (bit_depth == 8) {
290 : do {
291 6823740 : int32_t j = 0;
292 : do {
293 48070700 : const __m256i sum1 = boxsum_from_ii(D + j, buf_stride, r);
294 48041200 : const __m256i sum2 = boxsum_from_ii(C + j, buf_stride, r);
295 47973500 : const __m256i p = compute_p(sum1, sum2, n);
296 239906000 : const __m256i z = _mm256_min_epi32(
297 : _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
298 : SGRPROJ_MTABLE_BITS),
299 : _mm256_set1_epi32(255));
300 47981200 : const __m256i a_res = _mm256_i32gather_epi32(eb_x_by_xplus1, z, 4);
301 47981200 : yy_storeu_256(A + j, a_res);
302 :
303 : const __m256i a_complement =
304 96085200 : _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
305 :
306 : // sum1 might have lanes greater than 2^15, so we can't use madd to do
307 : // multiplication involving sum1. However, a_complement and one_over_n
308 : // are small enough for a 16-bit madd, so we can multiply them first.
309 48042600 : const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
310 48042600 : const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
311 48042600 : const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
312 : SGRPROJ_RECIP_BITS);
313 48042600 : yy_storeu_256(B + j, b_res);
314 48070600 : j += 8;
315 48070600 : } while (j < width + 2);
316 :
317 6823590 : A += buf_stride;
318 6823590 : B += buf_stride;
319 6823590 : C += buf_stride;
320 6823590 : D += buf_stride;
321 6823590 : } while (--i);
322 : }
323 : else {
324 : do {
325 0 : int32_t j = 0;
326 : do {
327 0 : const __m256i sum1 = boxsum_from_ii(D + j, buf_stride, r);
328 0 : const __m256i sum2 = boxsum_from_ii(C + j, buf_stride, r);
329 0 : const __m256i p = compute_p_highbd(sum1, sum2, bit_depth, n);
330 0 : const __m256i z = _mm256_min_epi32(
331 : _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
332 : SGRPROJ_MTABLE_BITS),
333 : _mm256_set1_epi32(255));
334 0 : const __m256i a_res = _mm256_i32gather_epi32(eb_x_by_xplus1, z, 4);
335 0 : yy_storeu_256(A + j, a_res);
336 :
337 : const __m256i a_complement =
338 0 : _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
339 :
340 : // sum1 might have lanes greater than 2^15, so we can't use madd to do
341 : // multiplication involving sum1. However, a_complement and one_over_n
342 : // are small enough for a 16-bit madd, so we can multiply them first.
343 0 : const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
344 0 : const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
345 0 : const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
346 : SGRPROJ_RECIP_BITS);
347 0 : yy_storeu_256(B + j, b_res);
348 0 : j += 8;
349 0 : } while (j < width + 2);
350 :
351 163 : A += buf_stride;
352 163 : B += buf_stride;
353 163 : C += buf_stride;
354 163 : D += buf_stride;
355 163 : } while (--i);
356 : }
357 160151 : }
358 :
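/* Scalar sketch (illustration only; helper name is ours) of the per-pixel
 * arithmetic in the 8-bit path of calc_ab() above, using the same tables and
 * constants: sum1/sum2 are the box sum and box sum of squares for one pixel. */
static INLINE void calc_ab_one_c_sketch(int32_t *A, int32_t *B, int32_t sum1,
                                        int32_t sum2, int32_t n, int32_t s) {
    const int32_t p = n * sum2 - sum1 * sum1;
    const int32_t z = AOMMIN(
        255, (p * s + (1 << (SGRPROJ_MTABLE_BITS - 1))) >> SGRPROJ_MTABLE_BITS);
    const int32_t a = eb_x_by_xplus1[z]; /* ~ 2^SGRPROJ_SGR_BITS * z / (z + 1) */
    /* eb_one_by_x[n - 1] ~ 2^SGRPROJ_RECIP_BITS / n, so b ~ (SGR - a) * sum1 / n */
    const int32_t b = ((SGRPROJ_SGR - a) * sum1 * eb_one_by_x[n - 1] +
                       (1 << (SGRPROJ_RECIP_BITS - 1))) >>
                      SGRPROJ_RECIP_BITS;
    *A = a;
    *B = b;
}
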
359 : // Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
360 : // where the outer four corners have weight 3 and all other pixels have weight
361 : // 4.
362 : //
363 : // Pixels are indexed as follows:
364 : // xtl xt xtr
365 : // xl x xr
366 : // xbl xb xbr
367 : //
368 : // buf points to x
369 : //
370 : // fours = xl + xt + xr + xb + x
371 : // threes = xtl + xtr + xbr + xbl
372 : // cross_sum = 4 * fours + 3 * threes
373 : // = 4 * (fours + threes) - threes
374 : // = (fours + threes) << 2 - threes
375 78928000 : static INLINE __m256i cross_sum(const int32_t *buf, int32_t stride) {
376 78928000 : const __m256i xtl = yy_loadu_256(buf - 1 - stride);
377 78921000 : const __m256i xt = yy_loadu_256(buf - stride);
378 78899700 : const __m256i xtr = yy_loadu_256(buf + 1 - stride);
379 78896200 : const __m256i xl = yy_loadu_256(buf - 1);
380 78898300 : const __m256i x = yy_loadu_256(buf);
381 78892500 : const __m256i xr = yy_loadu_256(buf + 1);
382 78881900 : const __m256i xbl = yy_loadu_256(buf - 1 + stride);
383 78876800 : const __m256i xb = yy_loadu_256(buf + stride);
384 78873200 : const __m256i xbr = yy_loadu_256(buf + 1 + stride);
385 :
386 315475000 : const __m256i fours = _mm256_add_epi32(
387 : xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
388 : const __m256i threes =
389 236607000 : _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
390 :
391 236607000 : return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
392 : threes);
393 : }
394 :
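/* Scalar sketch (illustration only; helper name is ours) of the weighting that
 * cross_sum() applies at one output position; the AVX2 version produces 8
 * adjacent positions per call. */
static INLINE int32_t cross_sum_c_sketch(const int32_t *buf, int32_t stride) {
    const int32_t fours =
        buf[-1] + buf[-stride] + buf[1] + buf[stride] + buf[0];
    const int32_t threes = buf[-1 - stride] + buf[1 - stride] +
                           buf[1 + stride] + buf[-1 + stride];
    return 4 * fours + 3 * threes; /* == ((fours + threes) << 2) - threes */
}
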
395 : // The final filter for self-guided restoration. Computes a weighted average
396 : // across A, B with "cross sums" (see cross_sum implementation above).
397 : static AOM_FORCE_INLINE void final_filter(int32_t *dst, int32_t dst_stride,
398 : const int32_t *A, const int32_t *B, int32_t buf_stride, const uint8_t *dgd8,
399 : int32_t dgd_stride, int32_t width, int32_t height, int32_t highbd) {
400 160151 : const int32_t nb = 5;
401 : const __m256i rounding =
402 160151 : round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
403 160151 : int32_t i = height;
404 :
405 160151 : if (!highbd) {
406 : do {
407 6503410 : int32_t j = 0;
408 : do {
409 39542400 : const __m256i a = cross_sum(A + j, buf_stride);
410 39511300 : const __m256i b = cross_sum(B + j, buf_stride);
411 39496800 : const __m128i raw = xx_loadl_64(dgd8 + j);
412 39494000 : const __m256i src =_mm256_cvtepu8_epi32(raw);
413 39494000 : const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
414 78988000 : const __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
415 : SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
416 39494000 : yy_storeu_256(dst + j, w);
417 39542400 : j += 8;
418 39542400 : } while (j < width);
419 :
420 6503370 : A += buf_stride;
421 6503370 : B += buf_stride;
422 6503370 : dgd8 += dgd_stride;
423 6503370 : dst += dst_stride;
424 6503370 : } while (--i);
425 : }
426 : else {
427 0 : const uint16_t *dgd_real = CONVERT_TO_SHORTPTR(dgd8);
428 :
429 : do {
430 0 : int32_t j = 0;
431 : do {
432 0 : const __m256i a = cross_sum(A + j, buf_stride);
433 0 : const __m256i b = cross_sum(B + j, buf_stride);
434 0 : const __m128i raw = xx_loadu_128(dgd_real + j);
435 0 : const __m256i src = _mm256_cvtepu16_epi32(raw);
436 0 : const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
437 0 : const __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
438 : SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
439 0 : yy_storeu_256(dst + j, w);
440 0 : j += 8;
441 0 : } while (j < width);
442 :
443 37 : A += buf_stride;
444 37 : B += buf_stride;
445 37 : dgd_real += dgd_stride;
446 37 : dst += dst_stride;
447 37 : } while (--i);
448 : }
449 160150 : }
450 :
451 : // Assumes that C, D are integral images for the original buffer which has been
452 : // extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
453 : // on the sides. A, B, C, D point at logical position (0, 0).
454 : static AOM_FORCE_INLINE void calc_ab_fast(int32_t *A, int32_t *B,
455 : const int32_t *C, const int32_t *D, int32_t width, int32_t height,
456 : int32_t buf_stride, int32_t bit_depth, int32_t sgr_params_idx,
457 : int32_t radius_idx) {
458 143711 : const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
459 143711 : const int32_t r = params->r[radius_idx];
460 143711 : const int32_t n = (2 * r + 1) * (2 * r + 1);
461 143711 : const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
462 : // eb_one_by_x[n - 1] is 2^12/n, so it easily fits in an int16
463 143711 : const __m256i one_over_n = _mm256_set1_epi32(eb_one_by_x[n - 1]);
464 143711 : const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
465 143711 : const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
466 :
467 143711 : A -= buf_stride + 1;
468 143711 : B -= buf_stride + 1;
469 143711 : C -= buf_stride + 1;
470 143711 : D -= buf_stride + 1;
471 :
472 143711 : int32_t i = 0;
473 143711 : if (bit_depth == 8) {
474 : do {
475 3043490 : int32_t j = 0;
476 : do {
477 21356300 : const __m256i sum1 = boxsum_from_ii(D + j, buf_stride, r);
478 21342000 : const __m256i sum2 = boxsum_from_ii(C + j, buf_stride, r);
479 21328200 : const __m256i p = compute_p(sum1, sum2, n);
480 106646000 : const __m256i z = _mm256_min_epi32(
481 : _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
482 : SGRPROJ_MTABLE_BITS),
483 : _mm256_set1_epi32(255));
484 21329200 : const __m256i a_res = _mm256_i32gather_epi32(eb_x_by_xplus1, z, 4);
485 21329200 : yy_storeu_256(A + j, a_res);
486 :
487 : const __m256i a_complement =
488 42707800 : _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
489 :
490 : // sum1 might have lanes greater than 2^15, so we can't use madd to do
491 : // multiplication involving sum1. However, a_complement and one_over_n
492 : // are small enough for a 16-bit madd, so we can multiply them first.
493 21353900 : const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
494 21353900 : const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
495 21353900 : const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
496 : SGRPROJ_RECIP_BITS);
497 21353900 : yy_storeu_256(B + j, b_res);
498 21356300 : j += 8;
499 21356300 : } while (j < width + 2);
500 :
501 3043520 : A += 2 * buf_stride;
502 3043520 : B += 2 * buf_stride;
503 3043520 : C += 2 * buf_stride;
504 3043520 : D += 2 * buf_stride;
505 3043520 : i += 2;
506 3043520 : } while (i < height + 2);
507 : }
508 : else {
509 : do {
510 0 : int32_t j = 0;
511 : do {
512 33 : const __m256i sum1 = boxsum_from_ii(D + j, buf_stride, r);
513 0 : const __m256i sum2 = boxsum_from_ii(C + j, buf_stride, r);
514 0 : const __m256i p = compute_p_highbd(sum1, sum2, bit_depth, n);
515 0 : const __m256i z = _mm256_min_epi32(
516 : _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
517 : SGRPROJ_MTABLE_BITS),
518 : _mm256_set1_epi32(255));
519 0 : const __m256i a_res = _mm256_i32gather_epi32(eb_x_by_xplus1, z, 4);
520 0 : yy_storeu_256(A + j, a_res);
521 :
522 : const __m256i a_complement =
523 0 : _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
524 :
525 : // sum1 might have lanes greater than 2^15, so we can't use madd to do
526 : // multiplication involving sum1. However, a_complement and one_over_n
527 : // are small enough for a 16-bit madd, so we can multiply them first.
528 0 : const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
529 0 : const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
530 0 : const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
531 : SGRPROJ_RECIP_BITS);
532 0 : yy_storeu_256(B + j, b_res);
533 0 : j += 8;
534 0 : } while (j < width + 2);
535 :
536 0 : A += 2 * buf_stride;
537 0 : B += 2 * buf_stride;
538 0 : C += 2 * buf_stride;
539 0 : D += 2 * buf_stride;
540 0 : i += 2;
541 0 : } while (i < height + 2);
542 : }
543 143711 : }
544 :
545 : // Calculate 8 values of the "cross sum" starting at buf.
546 : //
547 : // Pixels are indexed like this:
548 : // xtl xt xtr
549 : // - buf -
550 : // xbl xb xbr
551 : //
552 : // Pixels are weighted like this:
553 : // 5 6 5
554 : // 0 0 0
555 : // 5 6 5
556 : //
557 : // fives = xtl + xtr + xbl + xbr
558 : // sixes = xt + xb
559 : // cross_sum = 6 * sixes + 5 * fives
560 : //           = 5 * (fives + sixes) + sixes
561 : // = (fives + sixes) << 2 + (fives + sixes) + sixes
562 35037300 : static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf,
563 : int32_t stride) {
564 35037300 : const __m256i xtl = yy_loadu_256(buf - 1 - stride);
565 35034200 : const __m256i xt = yy_loadu_256(buf - stride);
566 35027200 : const __m256i xtr = yy_loadu_256(buf + 1 - stride);
567 35023300 : const __m256i xbl = yy_loadu_256(buf - 1 + stride);
568 35020300 : const __m256i xb = yy_loadu_256(buf + stride);
569 35018000 : const __m256i xbr = yy_loadu_256(buf + 1 + stride);
570 :
571 : const __m256i fives =
572 105049000 : _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
573 35016400 : const __m256i sixes = _mm256_add_epi32(xt, xb);
574 35016400 : const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
575 :
576 105049000 : return _mm256_add_epi32(
577 : _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
578 : fives_plus_sixes),
579 : sixes);
580 : }
581 :
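/* Scalar sketch (illustration only; helper name is ours) of the two-row
 * 5-6-5 weighting used by cross_sum_fast_even_row() for one output position. */
static INLINE int32_t cross_sum_fast_even_row_c_sketch(const int32_t *buf,
                                                       int32_t stride) {
    const int32_t fives = buf[-1 - stride] + buf[1 - stride] +
                          buf[-1 + stride] + buf[1 + stride];
    const int32_t sixes = buf[-stride] + buf[stride];
    /* == ((fives + sixes) << 2) + (fives + sixes) + sixes */
    return 5 * fives + 6 * sixes;
}
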
582 : // Calculate 8 values of the "cross sum" starting at buf.
583 : //
584 : // Pixels are indexed like this:
585 : // xl x xr
586 : //
587 : // Pixels are weighted like this:
588 : // 5 6 5
589 : //
590 : // buf points to x
591 : //
592 : // fives = xl + xr
593 : // sixes = x
594 : // cross_sum = 5 * fives + 6 * sixes
595 : // = 4 * (fives + sixes) + (fives + sixes) + sixes
596 : // = (fives + sixes) << 2 + (fives + sixes) + sixes
597 35032900 : static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) {
598 35032900 : const __m256i xl = yy_loadu_256(buf - 1);
599 35032100 : const __m256i x = yy_loadu_256(buf);
600 35028500 : const __m256i xr = yy_loadu_256(buf + 1);
601 :
602 35026900 : const __m256i fives = _mm256_add_epi32(xl, xr);
603 35026900 : const __m256i sixes = x;
604 :
605 35026900 : const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
606 :
607 105081000 : return _mm256_add_epi32(
608 : _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
609 : fives_plus_sixes),
610 : sixes);
611 : }
612 :
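/* Scalar sketch (illustration only; helper name is ours) of the single-row
 * 5-6-5 weighting used by cross_sum_fast_odd_row() for one output position. */
static INLINE int32_t cross_sum_fast_odd_row_c_sketch(const int32_t *buf) {
    const int32_t fives = buf[-1] + buf[1];
    const int32_t sixes = buf[0];
    /* == ((fives + sixes) << 2) + (fives + sixes) + sixes */
    return 5 * fives + 6 * sixes;
}
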
613 : // The final filter for the self-guided restoration. Computes a
614 : // weighted average across A, B with "cross sums" (see cross_sum_...
615 : // implementations above).
616 : static AOM_FORCE_INLINE void final_filter_fast(int32_t *dst, int32_t dst_stride,
617 : const int32_t *A, const int32_t *B, int32_t buf_stride, const uint8_t *dgd8,
618 : int32_t dgd_stride, int32_t width, int32_t height, int32_t highbd) {
619 143711 : const int32_t nb0 = 5;
620 143711 : const int32_t nb1 = 4;
621 : const __m256i rounding0 =
622 143711 : round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
623 : const __m256i rounding1 =
624 143711 : round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
625 143711 : int32_t i = 0;
626 :
627 143711 : if (!highbd) {
628 : do {
629 5797380 : if (!(i & 1)) { // even row
630 2898560 : int32_t j = 0;
631 : do {
632 : const __m256i a =
633 17539900 : cross_sum_fast_even_row(A + j, buf_stride);
634 : const __m256i b =
635 17519000 : cross_sum_fast_even_row(B + j, buf_stride);
636 17520000 : const __m128i raw = xx_loadl_64(dgd8 + j);
637 17520300 : const __m256i src = _mm256_cvtepu8_epi32(raw);
638 17520300 : const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
639 : const __m256i w =
640 35040600 : _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
641 : SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
642 17520300 : yy_storeu_256(dst + j, w);
643 17531800 : j += 8;
644 17531800 : } while (j < width);
645 : }
646 : else { // odd row
647 2898810 : int32_t j = 0;
648 : do {
649 17518000 : const __m256i a = cross_sum_fast_odd_row(A + j);
650 17520400 : const __m256i b = cross_sum_fast_odd_row(B + j);
651 17516500 : const __m128i raw = xx_loadl_64(dgd8 + j);
652 17515900 : const __m256i src = _mm256_cvtepu8_epi32(raw);
653 17515900 : const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
654 : const __m256i w =
655 35031900 : _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
656 : SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
657 17515900 : yy_storeu_256(dst + j, w);
658 17526200 : j += 8;
659 17526200 : } while (j < width);
660 : }
661 :
662 5797370 : A += buf_stride;
663 5797370 : B += buf_stride;
664 5797370 : dgd8 += dgd_stride;
665 5797370 : dst += dst_stride;
666 5797370 : } while (++i < height);
667 : }
668 : else {
669 0 : const uint16_t *dgd_real = CONVERT_TO_SHORTPTR(dgd8);
670 :
671 : do {
672 0 : if (!(i & 1)) { // even row
673 0 : int32_t j = 0;
674 : do {
675 : const __m256i a =
676 0 : cross_sum_fast_even_row(A + j, buf_stride);
677 : const __m256i b =
678 0 : cross_sum_fast_even_row(B + j, buf_stride);
679 0 : const __m128i raw = xx_loadu_128(dgd_real + j);
680 0 : const __m256i src = _mm256_cvtepu16_epi32(raw);
681 0 : const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
682 : const __m256i w =
683 0 : _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
684 : SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
685 0 : yy_storeu_256(dst + j, w);
686 0 : j += 8;
687 0 : } while (j < width);
688 : }
689 : else { // odd row
690 0 : int32_t j = 0;
691 : do {
692 1 : const __m256i a = cross_sum_fast_odd_row(A + j);
693 0 : const __m256i b = cross_sum_fast_odd_row(B + j);
694 0 : const __m128i raw = xx_loadu_128(dgd_real + j);
695 0 : const __m256i src = _mm256_cvtepu16_epi32(raw);
696 0 : const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
697 : const __m256i w =
698 0 : _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
699 : SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
700 0 : yy_storeu_256(dst + j, w);
701 0 : j += 8;
702 0 : } while (j < width);
703 : }
704 :
705 0 : A += buf_stride;
706 0 : B += buf_stride;
707 0 : dgd_real += dgd_stride;
708 0 : dst += dst_stride;
709 0 : } while (++i < height);
710 : }
711 143702 : }
712 :
713 186960 : void eb_av1_selfguided_restoration_avx2(const uint8_t *dgd8, int32_t width,
714 : int32_t height, int32_t dgd_stride, int32_t *flt0, int32_t *flt1,
715 : int32_t flt_stride, int32_t sgr_params_idx, int32_t bit_depth,
716 : int32_t highbd) {
717 : // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
718 : // Ctl and Dtl is 32-byte aligned.
719 186960 : const int32_t buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
720 :
721 : DECLARE_ALIGNED(32, int32_t,
722 : buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]);
723 :
724 186960 : const int32_t width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
725 186960 : const int32_t height_ext = height + 2 * SGRPROJ_BORDER_VERT;
726 :
727 : // Adjusting the stride of A and B here appears to avoid bad cache effects,
728 : // leading to a significant speed improvement.
729 : // We also align the stride to a multiple of 32 bytes for efficiency.
730 186960 : int32_t buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);
731 :
732 : // The "tl" pointers point at the top-left of the initialised data for the
733 : // array.
734 186960 : int32_t *Atl = buf + 0 * buf_elts + 7;
735 186960 : int32_t *Btl = buf + 1 * buf_elts + 7;
736 186960 : int32_t *Ctl = buf + 2 * buf_elts + 7;
737 186960 : int32_t *Dtl = buf + 3 * buf_elts + 7;
738 :
739 : // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
740 : // there's a zero row and column in A, B (integral images), so we move down
741 : // and right one for them.
742 186960 : const int32_t buf_diag_border =
743 186960 : SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
744 :
745 186960 : int32_t *A0 = Atl + 1 + buf_stride;
746 186960 : int32_t *B0 = Btl + 1 + buf_stride;
747 186960 : int32_t *C0 = Ctl + 1 + buf_stride;
748 186960 : int32_t *D0 = Dtl + 1 + buf_stride;
749 :
750 : // Finally, A, B, C, D point at position (0, 0).
751 186960 : int32_t *A = A0 + buf_diag_border;
752 186960 : int32_t *B = B0 + buf_diag_border;
753 186960 : int32_t *C = C0 + buf_diag_border;
754 186960 : int32_t *D = D0 + buf_diag_border;
755 :
756 186960 : const int32_t dgd_diag_border =
757 186960 : SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
758 186960 : const uint8_t *dgd0 = dgd8 - dgd_diag_border;
759 :
760 : // Generate integral images from the input. C will contain sums of squares; D
761 : // will contain just sums
762 186960 : if (highbd)
763 0 : integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
764 : height_ext, Ctl, Dtl, buf_stride);
765 : else
766 : integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
767 : buf_stride);
768 :
769 186958 : const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
770 : // Write to flt0 and flt1
771 : // If params->r == 0 we skip the corresponding filter. We only allow one of
772 : // the radii to be 0, as having both equal to 0 would be equivalent to
773 : // skipping SGR entirely.
774 : assert(!(params->r[0] == 0 && params->r[1] == 0));
775 : assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
776 : assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
777 :
778 186958 : if (params->r[0] > 0) {
779 : calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
780 : sgr_params_idx, 0);
781 : final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
782 : width, height, highbd);
783 : }
784 :
785 186949 : if (params->r[1] > 0) {
786 : calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
787 : 1);
788 : final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
789 : height, highbd);
790 : }
791 186958 : }
792 :
793 14160 : void eb_apply_selfguided_restoration_avx2(const uint8_t *dat8, int32_t width,
794 : int32_t height, int32_t stride, int32_t eps, const int32_t *xqd,
795 : uint8_t *dst8, int32_t dst_stride, int32_t *tmpbuf, int32_t bit_depth,
796 : int32_t highbd) {
797 14160 : int32_t *flt0 = tmpbuf;
798 14160 : int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
799 : assert(width * height <= RESTORATION_UNITPELS_MAX);
800 14160 : eb_av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1,
801 : width, eps, bit_depth, highbd);
802 14160 : const SgrParamsType *const params = &eb_sgr_params[eps];
803 : int32_t xq[2];
804 14160 : eb_decode_xq(xqd, xq, params);
805 :
806 14160 : const __m256i xq0 = _mm256_set1_epi32(xq[0]);
807 14160 : const __m256i xq1 = _mm256_set1_epi32(xq[1]);
808 : const __m256i rounding =
809 14160 : round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
810 :
811 14160 : int32_t i = height;
812 :
813 14160 : if (!highbd) {
814 14160 : const __m256i idx = _mm256_setr_epi32(0, 4, 1, 5, 0, 0, 0, 0);
815 :
816 : do {
817 : // Calculate output in batches of 16 pixels
818 619200 : int32_t j = 0;
819 : do {
820 2016000 : const __m128i src = xx_loadu_128(dat8 + j);
821 2016000 : const __m256i ep_0 = _mm256_cvtepu8_epi32(src);
822 4032000 : const __m256i ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src, 8));
823 2016000 : const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
824 2016000 : const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
825 2016000 : __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
826 2016000 : __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
827 :
828 2016000 : if (params->r[0] > 0) {
829 2004480 : const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[j + 0]), u_0);
830 4008960 : const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[j + 8]), u_1);
831 4008960 : v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
832 4008960 : v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
833 : }
834 :
835 2016000 : if (params->r[1] > 0) {
836 1673280 : const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[j + 0]), u_0);
837 3346560 : const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[j + 8]), u_1);
838 3346560 : v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
839 3346560 : v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
840 : }
841 :
842 4032000 : const __m256i w_0 = _mm256_srai_epi32(
843 : _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
844 4032000 : const __m256i w_1 = _mm256_srai_epi32(
845 : _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
846 :
847 : // Pack into 8 bits and clamp to [0, 256)
848 : // Note that each pack messes up the order of the values,
849 : // so we use a permute function to correct this
850 : // 0, 1, 4, 5, 2, 3, 6, 7
851 2016000 : const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
852 : // 0, 1, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7
853 2016000 : const __m256i tmp2 = _mm256_packus_epi16(tmp, tmp);
854 : // 0, 1, 2, 3, 4, 5, 6, 7, ...
855 2016000 : const __m256i tmp3 = _mm256_permutevar8x32_epi32(tmp2, idx);
856 2016000 : const __m128i res = _mm256_castsi256_si128(tmp3);
857 2016000 : xx_storeu_128(dst8 + j, res);
858 2016000 : j += 16;
859 2016000 : } while (j < width);
860 :
861 619200 : dat8 += stride;
862 619200 : flt0 += width;
863 619200 : flt1 += width;
864 619200 : dst8 += dst_stride;
865 619200 : } while (--i);
866 : }
867 : else {
868 0 : const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
869 0 : const uint16_t *dat16 = CONVERT_TO_SHORTPTR(dat8);
870 0 : uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
871 :
872 : do {
873 : // Calculate output in batches of 16 pixels
874 0 : int32_t j = 0;
875 : do {
876 0 : const __m128i src_0 = xx_loadu_128(dat16 + j + 0);
877 0 : const __m128i src_1 = xx_loadu_128(dat16 + j + 8);
878 0 : const __m256i ep_0 = _mm256_cvtepu16_epi32(src_0);
879 0 : const __m256i ep_1 = _mm256_cvtepu16_epi32(src_1);
880 0 : const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
881 0 : const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
882 0 : __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
883 0 : __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
884 :
885 0 : if (params->r[0] > 0) {
886 0 : const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[j + 0]), u_0);
887 0 : const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[j + 8]), u_1);
888 0 : v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
889 0 : v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
890 : }
891 :
892 0 : if (params->r[1] > 0) {
893 0 : const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[j + 0]), u_0);
894 0 : const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[j + 8]), u_1);
895 0 : v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
896 0 : v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
897 : }
898 :
899 0 : const __m256i w_0 = _mm256_srai_epi32(
900 : _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
901 0 : const __m256i w_1 = _mm256_srai_epi32(
902 : _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
903 :
904 : // Pack into 16 bits and clamp to [0, 2^bit_depth)
905 : // Note that packing into 16 bits messes up the order of the values,
906 : // so we use a permute function to correct this
907 0 : const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
908 0 : const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
909 0 : const __m256i res = _mm256_min_epi16(tmp2, max);
910 0 : yy_storeu_256(dst16 + j, res);
911 0 : j += 16;
912 0 : } while (j < width);
913 :
914 0 : dat16 += stride;
915 0 : flt0 += width;
916 0 : flt1 += width;
917 0 : dst16 += dst_stride;
918 0 : } while (--i);
919 : }
920 14160 : }
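
/* Scalar sketch (illustration only; the helper name and the r0/r1 parameters,
 * standing in for params->r[0] and params->r[1], are ours) of the per-pixel
 * blend done above for the 8-bit path: the two self-guided outputs f0/f1 are
 * mixed into the source with the decoded weights xq[0]/xq[1], rounded back to
 * pixel precision and clamped, matching the packus step in the AVX2 code. */
static INLINE uint8_t apply_sgr_pixel_c_sketch(uint8_t src, int32_t f0,
                                               int32_t f1, const int32_t xq[2],
                                               int32_t r0, int32_t r1) {
    const int32_t u = (int32_t)src << SGRPROJ_RST_BITS;
    int32_t v = u << SGRPROJ_PRJ_BITS;
    if (r0 > 0) v += xq[0] * (f0 - u);
    if (r1 > 0) v += xq[1] * (f1 - u);
    const int32_t w = (v + (1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS - 1))) >>
                      (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
    return (uint8_t)(w < 0 ? 0 : (w > 255 ? 255 : w));
}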
|