/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "EbDefinitions.h"
#include <immintrin.h>
#include <assert.h>

#include "aom_dsp_rtcd.h"
#include "convolve.h"
#include "synonyms.h"
#include "synonyms_avx2.h"
#include "convolve_avx2.h"

DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
    3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
    3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
};
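// Note: in a pshufb mask, a byte with its high bit set (i.e. 255) zeroes the
// corresponding output byte, so each (index, 255) pair above zero-extends one
// 8-bit pixel into a 16-bit lane. The indices 3..10 pick out the centre pixel
// of the 7-tap window for each of the 8 output positions in a row.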

// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
// on the left.
// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].

// Exploiting the range of the wiener filter coefficients, horizontal
// filtering can be done in 16-bit intermediate precision. The details are
// as follows:
// Consider horizontal wiener filter coefficients of the following form:
// [C0, C1, C2, 2^(FILTER_BITS) - 2 * (C0 + C1 + C2), C2, C1, C0]
// Subtracting 2^(FILTER_BITS) from the centre tap, we get:
// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0]
// The sum of products "C0 * p0 + C1 * p1 + C2 * p2 - 2 * (C0 + C1 + C2) * p3
// + C2 * p4 + C1 * p5 + C0 * p6" stays within signed 16-bit precision.
// Finally, after rounding the above result by round_0, we multiply the
// centre pixel by 2^(FILTER_BITS - round_0) and add it back to get the
// horizontal filter output.
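// To see why this reassembly is exact, write the full filter sum as S and the
// modified sum as S' = S - 2^(FILTER_BITS) * p3, where p3 is the centre
// pixel. Assuming round_0 <= FILTER_BITS (which the shift below relies on),
// 2^(FILTER_BITS) * p3 is a multiple of 2^(round_0), so
//   ((S' + 2^(round_0 - 1)) >> round_0) + (p3 << (FILTER_BITS - round_0))
//     == (S + 2^(round_0 - 1)) >> round_0,
// i.e. the centre-pixel term can be shifted out separately without changing
// the rounded result.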

void eb_av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
                                         uint8_t *dst, ptrdiff_t dst_stride,
                                         const int16_t *filter_x, int32_t x_step_q4,
                                         const int16_t *filter_y, int32_t y_step_q4,
                                         int32_t w, int32_t h,
                                         const ConvolveParams *conv_params) {
    const int32_t bd = 8;
    assert(x_step_q4 == 16 && y_step_q4 == 16);
    assert(!(w & 7));
    (void)x_step_q4;
    (void)y_step_q4;

    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
    int32_t im_h = h + SUBPEL_TAPS - 1;
    int32_t im_stride = 8;
    int32_t i, j;
    const int32_t center_tap = (SUBPEL_TAPS - 1) / 2;
    const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
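    // With SUBPEL_TAPS == 8, center_tap == 3: src_ptr backs up by 3 rows and
    // 3 columns so the 7-tap window is centred on each output pixel.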

    __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;

    assert(conv_params->round_0 > 0);

    filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
    filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);

    filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);

    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
    const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    coeffs_h[0] = _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
    // coeffs 2 3 2 3 2 3 2 3
    coeffs_h[1] = _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
    // coeffs 4 5 4 5 4 5 4 5
    coeffs_h[2] = _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
    // coeffs 6 7 6 7 6 7 6 7
    coeffs_h[3] = _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
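    // Each set1_epi16 mask above repeats one byte pair, so the pshufb
    // broadcasts the low bytes of two adjacent 16-bit coefficients across the
    // register. The (DC-subtracted) wiener taps fit in 8 bits, which is what
    // the maddubs-based x_convolve_8tap_avx2() below relies on.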

    const __m256i round_const_h =
        _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
    const __m256i round_const_horz =
        _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1)));
    const __m256i clamp_low = _mm256_setzero_si256();
    const __m256i clamp_high =
        _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
    const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);

    // Add an offset to account for the "add_src" part of the convolve function.
    const __m128i zero_128 = _mm_setzero_si128();
    const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
    const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
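    // offset_0 places 1 << FILTER_BITS in lane 3, the centre tap of the
    // filter (the 8-entry array holds a 7-tap filter). Adding it to filter_y
    // undoes the 2^(FILTER_BITS) subtracted from the centre tap, so the
    // vertical pass re-adds the source pixel ("add_src").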

    const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
    // coeffs 2 3 2 3 2 3 2 3
    coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
    // coeffs 4 5 4 5 4 5 4 5
    coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
    // coeffs 6 7 6 7 6 7 6 7
    coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);

    const __m256i round_const_v =
        _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
                          (1 << (bd + conv_params->round_1 - 1)));
    const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
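    // round_const_v combines the rounding offset for round_1 with
    // -(1 << (bd + round_1 - 1)), which cancels the per-sample
    // round_const_horz offset accumulated through the vertical taps: the taps
    // sum to 2^FILTER_BITS and, for wiener filtering,
    // round_0 + round_1 == 2 * FILTER_BITS.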

    for (j = 0; j < w; j += 8) {
        for (i = 0; i < im_h; i += 2) {
            __m256i data = _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));

            // Load the next line
            if (i + 1 < im_h)
                data = _mm256_inserti128_si256(
                    data,
                    _mm_loadu_si128(
                        (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
                    1);

            __m256i res = x_convolve_8tap_avx2(data, coeffs_h, filt);

            res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
                                   round_shift_h);

            __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);

            // Multiply the centre pixel by 2^(FILTER_BITS - round_0) and add
            // it to the result.
            data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
            res = _mm256_add_epi16(res, data_0);
            res = _mm256_add_epi16(res, round_const_horz);
            res = _mm256_max_epi16(res, clamp_low);
            const __m256i res_clamped = _mm256_min_epi16(res, clamp_high);
            _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped);
        }

        /* Vertical filter */
        {
            __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
            __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
            __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
            __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
            __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
            __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));

            __m256i s[8];
            s[0] = _mm256_unpacklo_epi16(src_0, src_1);
            s[1] = _mm256_unpacklo_epi16(src_2, src_3);
            s[2] = _mm256_unpacklo_epi16(src_4, src_5);

            s[4] = _mm256_unpackhi_epi16(src_0, src_1);
            s[5] = _mm256_unpackhi_epi16(src_2, src_3);
            s[6] = _mm256_unpackhi_epi16(src_4, src_5);
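            // im_stride is 8 int16_t (128 bits), so each 256-bit load above
            // spans two consecutive rows: the low lane of s[0] interleaves
            // rows 0/1 for pixels 0..3 and the high lane rows 1/2, while
            // s[4..6] do the same for pixels 4..7. The loop below therefore
            // produces two output rows per iteration, with each madd pairing
            // samples from adjacent rows.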

            for (i = 0; i < h - 1; i += 2) {
                const int16_t *data = &im_block[i * im_stride];

                const __m256i s6 =
                    _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
                const __m256i s7 =
                    _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));

                s[3] = _mm256_unpacklo_epi16(s6, s7);
                s[7] = _mm256_unpackhi_epi16(s6, s7);

                __m256i res_a = convolve16_8tap_avx2(s, coeffs_v);
                __m256i res_b = convolve16_8tap_avx2(s + 4, coeffs_v);

                const __m256i res_a_round = _mm256_sra_epi32(
                    _mm256_add_epi32(res_a, round_const_v), round_shift_v);
                const __m256i res_b_round = _mm256_sra_epi32(
                    _mm256_add_epi32(res_b, round_const_v), round_shift_v);

                /* rounding code */
                // 16 bit conversion
                const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
                // 8 bit conversion and saturation to uint8
                const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);

                const __m128i res_0 = _mm256_castsi256_si128(res_8b);
                const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

                // Store values into the destination buffer
                __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
                __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];

                _mm_storel_epi64(p_0, res_0);
                _mm_storel_epi64(p_1, res_1);

                s[0] = s[1];
                s[1] = s[2];
                s[2] = s[3];

                s[4] = s[5];
                s[5] = s[6];
                s[6] = s[7];
            }
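            // If h is odd, one output row remains. Fold the low and high
            // 128-bit halves of the sliding window together so a single
            // 256-bit convolve covers all 8 pixels of the final row.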
            if (h - i) {
                s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
                s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
                s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);

                const int16_t *data = &im_block[i * im_stride];
                const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
                const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));

                __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
                __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);

                s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
                __m256i convolveres = convolve16_8tap_avx2(s, coeffs_v);

                const __m256i res_round = _mm256_sra_epi32(
                    _mm256_add_epi32(convolveres, round_const_v), round_shift_v);

                /* rounding code */
                // 16 bit conversion
                __m128i reslo = _mm256_castsi256_si128(res_round);
                __m128i reshi = _mm256_extracti128_si256(res_round, 1);
                const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);

                // 8 bit conversion and saturation to uint8
                const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
                __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
                _mm_storel_epi64(p_0, res_8b);
            }
        }
    }
}
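
// Illustrative call sketch (not part of the library; the ConvolveParams
// setup below is an assumption for the example, mirroring the wiener
// relation round_0 + round_1 == 2 * FILTER_BITS with round_0 == 5 at 8-bit
// depth). The function requires x_step_q4 == y_step_q4 == 16, w a multiple
// of 8, and 8-entry (SUBPEL_TAPS) filter arrays whose centre tap has
// 2^(FILTER_BITS) pre-subtracted:
//
//   ConvolveParams conv_params;
//   conv_params.round_0 = 5;
//   conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
//   eb_av1_wiener_convolve_add_src_avx2(src, src_stride, dst, dst_stride,
//                                       filter_x, 16, filter_y, 16,
//                                       w, h, &conv_params);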

// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
// on the left.
// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
void eb_av1_highbd_wiener_convolve_add_src_avx2(
    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
    ptrdiff_t dst_stride, const int16_t *filter_x, int32_t x_step_q4,
    const int16_t *filter_y, int32_t y_step_q4, int32_t w, int32_t h,
    const ConvolveParams *conv_params, int32_t bd) {
    assert(x_step_q4 == 16 && y_step_q4 == 16);
    assert(!(w & 7));
    assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
    (void)x_step_q4;
    (void)y_step_q4;

    const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

    DECLARE_ALIGNED(32, uint16_t,
                    temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
    int32_t intermediate_height = h + SUBPEL_TAPS - 1;
    const int32_t center_tap = ((SUBPEL_TAPS - 1) / 2);
    const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;

    const __m128i zero_128 = _mm_setzero_si128();
    const __m256i zero_256 = _mm256_setzero_si256();

    // Add an offset to account for the "add_src" part of the convolve function.
    const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);

    const __m256i clamp_low = zero_256;

    /* Horizontal filter */
    {
        const __m256i clamp_high_ep =
            _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);

        // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
        const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);

        // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
        const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
        // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
        const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

        // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
        const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
        // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
        const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
        // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
        const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
        // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
        const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);

        // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
        const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
        // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
        const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
        // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
        const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
        // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
        const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);

        const __m256i round_const = _mm256_set1_epi32(
            (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
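        // Besides the usual rounding term, round_const adds
        // 1 << (bd + FILTER_BITS - 1) per sample, biasing the horizontal
        // output into the non-negative range so it can live in the uint16_t
        // temp[] buffer; the assert at the top of the function guarantees the
        // biased, rounded values fit in 16 bits. The vertical stage removes
        // this bias again.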

        for (int32_t i = 0; i < intermediate_height; ++i) {
            for (int32_t j = 0; j < w; j += 16) {
                const uint16_t *src_ij = src_ptr + i * src_stride + j;

                // Load 16-bit src data
                const __m256i src_0 = yy_loadu_256(src_ij + 0);
                const __m256i src_1 = yy_loadu_256(src_ij + 1);
                const __m256i src_2 = yy_loadu_256(src_ij + 2);
                const __m256i src_3 = yy_loadu_256(src_ij + 3);
                const __m256i src_4 = yy_loadu_256(src_ij + 4);
                const __m256i src_5 = yy_loadu_256(src_ij + 5);
                const __m256i src_6 = yy_loadu_256(src_ij + 6);
                const __m256i src_7 = yy_loadu_256(src_ij + 7);

                // Multiply src data by filter coeffs and sum pairs
                const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
                const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
                const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
                const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
                const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
                const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
                const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
                const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);

                // Calculate scalar product for even- and odd-indices
                // separately, increasing to 32-bit precision
                const __m256i res_even_sum = _mm256_add_epi32(
                    _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
                const __m256i res_even = _mm256_srai_epi32(
                    _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);

                const __m256i res_odd_sum = _mm256_add_epi32(
                    _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
                const __m256i res_odd = _mm256_srai_epi32(
                    _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);

                // Reduce to 16-bit precision and pack even- and odd-index results
                // back into one register. The _mm256_packs_epi32 intrinsic returns
                // a register with the pixels ordered as follows:
                // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
                const __m256i res = _mm256_packs_epi32(res_even, res_odd);
                const __m256i res_clamped =
                    _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);

                // Store in a temporary array
                yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
            }
        }
    }

    /* Vertical filter */
    {
        const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);

        // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
        const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);

        // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
        const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
        // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
        const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

        // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
        const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
        // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
        const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
        // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
        const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
        // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
        const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);

        // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
        const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
        // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
        const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
        // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
        const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
        // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
        const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);

        const __m256i round_const =
            _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
                              (1 << (bd + conv_params->round_1 - 1)));
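        // As in the 8-bit path, the -(1 << (bd + round_1 - 1)) term cancels
        // the bias injected by the horizontal stage: the vertical taps sum to
        // 2^FILTER_BITS and round_0 + round_1 == 2 * FILTER_BITS for wiener
        // filtering, so the accumulated bias is exactly
        // 1 << (bd + round_1 - 1).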

        for (int32_t i = 0; i < h; ++i) {
            for (int32_t j = 0; j < w; j += 16) {
                const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;

                // Load 16-bit data from the output of the horizontal filter in
                // which the pixels are ordered as follows:
                // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
                const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
                const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
                const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
                const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
                const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
                const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
                const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
                const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);

                // Filter the even-indices, increasing to 32-bit precision
                const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
                const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
                const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
                const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);

                const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
                const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
                const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
                const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);

                const __m256i res_even = _mm256_add_epi32(
                    _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));

                // Filter the odd-indices, increasing to 32-bit precision
                const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
                const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
                const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
                const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);

                const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
                const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
                const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
                const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);

                const __m256i res_odd = _mm256_add_epi32(
                    _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));

                // Pixels are currently in the following order:
                // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
                // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
                //
                // Rearrange the pixels into the following order:
                // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
                // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
                const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
                const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);

                const __m256i res_lo_round = _mm256_srai_epi32(
                    _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
                const __m256i res_hi_round = _mm256_srai_epi32(
                    _mm256_add_epi32(res_hi, round_const), conv_params->round_1);

                // Reduce to 16-bit precision and pack into the correct order:
                // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
                const __m256i res_16bit =
                    _mm256_packs_epi32(res_lo_round, res_hi_round);
                const __m256i res_16bit_clamped = _mm256_min_epi16(
                    _mm256_max_epi16(res_16bit, clamp_low), clamp_high);

                // Store in the dst array
                yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
            }
        }
    }
}
|