Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX-License-Identifier: BSD-2-Clause-Patent
4 : */
5 : /*
6 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
7 : *
8 : * This source code is subject to the terms of the BSD 2 Clause License and
9 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
10 : * was not distributed with this source code in the LICENSE file, you can
11 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
12 : * Media Patent License 1.0 was not distributed with this source code in the
13 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
14 : */
15 :
16 : #include <immintrin.h>
17 : #include <assert.h>
18 :
19 : #include "EbDefinitions.h"
20 : #include "aom_dsp_rtcd.h"
21 :
22 : #include "convolve_avx2.h"
23 : #include "convolve.h"
24 :
25 0 : void eb_av1_highbd_jnt_convolve_2d_copy_avx2(
26 : const uint16_t *src, int32_t src_stride, uint16_t *dst0, int32_t dst_stride0, int32_t w,
27 : int32_t h, const InterpFilterParams *filter_params_x,
28 : const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
29 : const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
30 0 : ConvBufType *dst = conv_params->dst;
31 0 : int32_t dst_stride = conv_params->dst_stride;
32 : (void)filter_params_x;
33 : (void)filter_params_y;
34 : (void)subpel_x_q4;
35 : (void)subpel_y_q4;
36 :
37 0 : const int32_t bits =
38 0 : FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
39 0 : const __m128i left_shift = _mm_cvtsi32_si128(bits);
40 0 : const int32_t do_average = conv_params->do_average;
41 0 : const int32_t use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
42 0 : const int32_t w0 = conv_params->fwd_offset;
43 0 : const int32_t w1 = conv_params->bck_offset;
44 0 : const __m256i wt0 = _mm256_set1_epi32(w0);
45 0 : const __m256i wt1 = _mm256_set1_epi32(w1);
46 0 : const __m256i zero = _mm256_setzero_si256();
47 : int32_t i, j;
48 :
49 0 : const int32_t offset_0 =
50 0 : bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
51 0 : const int32_t offset = (1 << offset_0) + (1 << (offset_0 - 1));
52 0 : const __m256i offset_const = _mm256_set1_epi32(offset);
53 0 : const __m256i offset_const_16b = _mm256_set1_epi16(offset);
54 0 : const int32_t rounding_shift =
55 0 : 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
56 0 : const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
57 : const __m256i clip_pixel_to_bd =
58 0 : _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
59 :
60 0 : assert(bits <= 4);
61 :
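    // No filtering happens in the copy kernel: each source sample is shifted left by
    // bits = 2 * FILTER_BITS - round_0 - round_1 so it matches the precision of the
    // filtered kernels below, and offset_const keeps the value non-negative so it can
    // be stored in the unsigned 16-bit compound buffer conv_params->dst. On the first
    // compound pass (do_average == 0) the offset result is written to dst; on the
    // second pass (do_average == 1) the stored prediction is reloaded, blended with
    // the current one by highbd_comp_avg (distance-weighted when use_jnt_comp_avg is
    // set, plain average otherwise), then highbd_convolve_rounding removes the offset
    // and rounds, and the result is clipped to the bit-depth maximum before being
    // written to dst0. The x, y, and 2d kernels below follow the same pattern.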
62 0 : if (!(w % 16)) {
63 0 : for (i = 0; i < h; i += 1) {
64 0 : for (j = 0; j < w; j += 16) {
65 : const __m256i src_16bit =
66 0 : _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
67 :
68 0 : const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
69 :
70 0 : if (do_average) {
71 : const __m256i data_0 =
72 0 : _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
73 :
74 0 : const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
75 0 : const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);
76 :
77 0 : const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
78 0 : const __m256i res_unsigned_lo =
79 0 : _mm256_add_epi32(res_32b_lo, offset_const);
80 :
81 0 : const __m256i comp_avg_res_lo = highbd_comp_avg(
82 : &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
83 :
84 0 : const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
85 0 : const __m256i res_unsigned_hi =
86 0 : _mm256_add_epi32(res_32b_hi, offset_const);
87 :
88 0 : const __m256i comp_avg_res_hi = highbd_comp_avg(
89 : &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
90 :
91 0 : const __m256i round_result_lo = highbd_convolve_rounding(
92 : &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
93 0 : const __m256i round_result_hi = highbd_convolve_rounding(
94 : &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
95 :
96 : const __m256i res_16b =
97 0 : _mm256_packus_epi32(round_result_lo, round_result_hi);
98 0 : const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
99 :
100 0 : _mm256_storeu_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
101 : }
102 : else {
103 : const __m256i res_unsigned_16b =
104 0 : _mm256_adds_epu16(res, offset_const_16b);
105 :
106 0 : _mm256_storeu_si256((__m256i *)(&dst[i * dst_stride + j]),
107 : res_unsigned_16b);
108 : }
109 : }
110 : }
111 : }
112 0 : else if (!(w % 4)) {
113 0 : for (i = 0; i < h; i += 2) {
114 0 : for (j = 0; j < w; j += 8) {
115 : const __m128i src_row_0 =
116 0 : _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
117 : const __m128i src_row_1 =
118 0 : _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
119 : // since not all compilers yet support _mm256_set_m128i()
120 0 : const __m256i src_10 = _mm256_insertf128_si256(
121 : _mm256_castsi128_si256(src_row_0), src_row_1, 1);
122 :
123 0 : const __m256i res = _mm256_sll_epi16(src_10, left_shift);
124 :
125 0 : if (w - j < 8) {
126 0 : if (do_average) {
127 0 : const __m256i data_0 = _mm256_castsi128_si256(
128 0 : _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
129 0 : const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
130 0 : (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
131 : const __m256i data_01 =
132 0 : _mm256_permute2x128_si256(data_0, data_1, 0x20);
133 :
134 0 : const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
135 :
136 0 : const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
137 0 : const __m256i res_unsigned_lo =
138 0 : _mm256_add_epi32(res_32b, offset_const);
139 :
140 0 : const __m256i comp_avg_res = highbd_comp_avg(
141 : &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
142 :
143 0 : const __m256i round_result = highbd_convolve_rounding(
144 : &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
145 :
146 : const __m256i res_16b =
147 0 : _mm256_packus_epi32(round_result, round_result);
148 : const __m256i res_clip =
149 0 : _mm256_min_epi16(res_16b, clip_pixel_to_bd);
150 :
151 0 : const __m128i res_0 = _mm256_castsi256_si128(res_clip);
152 0 : const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
153 :
154 0 : _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
155 0 : _mm_storel_epi64(
156 0 : (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
157 : }
158 : else {
159 : const __m256i res_unsigned_16b =
160 0 : _mm256_adds_epu16(res, offset_const_16b);
161 :
162 0 : const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
163 0 : const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
164 :
165 0 : _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
166 0 : _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
167 : res_1);
168 : }
169 : }
170 : else {
171 0 : if (do_average) {
172 0 : const __m256i data_0 = _mm256_castsi128_si256(
173 0 : _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
174 0 : const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
175 0 : (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
176 : const __m256i data_01 =
177 0 : _mm256_permute2x128_si256(data_0, data_1, 0x20);
178 :
179 0 : const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
180 0 : const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
181 :
182 0 : const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
183 0 : const __m256i res_unsigned_lo =
184 0 : _mm256_add_epi32(res_32b_lo, offset_const);
185 :
186 0 : const __m256i comp_avg_res_lo = highbd_comp_avg(
187 : &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
188 :
189 0 : const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
190 0 : const __m256i res_unsigned_hi =
191 0 : _mm256_add_epi32(res_32b_hi, offset_const);
192 :
193 0 : const __m256i comp_avg_res_hi = highbd_comp_avg(
194 : &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
195 :
196 : const __m256i round_result_lo =
197 0 : highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
198 : &rounding_const, rounding_shift);
199 : const __m256i round_result_hi =
200 0 : highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
201 : &rounding_const, rounding_shift);
202 :
203 : const __m256i res_16b =
204 0 : _mm256_packus_epi32(round_result_lo, round_result_hi);
205 : const __m256i res_clip =
206 0 : _mm256_min_epi16(res_16b, clip_pixel_to_bd);
207 :
208 0 : const __m128i res_0 = _mm256_castsi256_si128(res_clip);
209 0 : const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
210 :
211 0 : _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
212 : _mm_store_si128(
213 0 : (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
214 : }
215 : else {
216 : const __m256i res_unsigned_16b =
217 0 : _mm256_adds_epu16(res, offset_const_16b);
218 0 : const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
219 0 : const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
220 :
221 0 : _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
222 0 : _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
223 : res_1);
224 : }
225 : }
226 : }
227 : }
228 : }
229 0 : }
230 :
231 0 : void eb_av1_highbd_jnt_convolve_2d_avx2(
232 : const uint16_t *src, int32_t src_stride, uint16_t *dst0, int32_t dst_stride0, int32_t w,
233 : int32_t h, const InterpFilterParams *filter_params_x,
234 : const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
235 : const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
236 : DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
237 0 : ConvBufType *dst = conv_params->dst;
238 0 : int32_t dst_stride = conv_params->dst_stride;
239 0 : int32_t im_h = h + filter_params_y->taps - 1;
240 0 : int32_t im_stride = 8;
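    // im_block is the scratch buffer for the horizontal pass of the current 8-column
    // strip: im_h = h + taps - 1 rows of 8 filtered samples each, written two rows per
    // 256-bit store (row i in the low 128-bit lane, row i + 1 in the high lane).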
241 : int32_t i, j;
242 0 : const int32_t fo_vert = filter_params_y->taps / 2 - 1;
243 0 : const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
244 0 : const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
245 :
246 : // Check that, even with 12-bit input, the intermediate values will fit
247 : // into an unsigned 16-bit intermediate array.
248 0 : assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
249 :
250 : __m256i s[8], coeffs_y[4], coeffs_x[4];
251 0 : const int32_t do_average = conv_params->do_average;
252 0 : const int32_t use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
253 :
254 0 : const int32_t w0 = conv_params->fwd_offset;
255 0 : const int32_t w1 = conv_params->bck_offset;
256 0 : const __m256i wt0 = _mm256_set1_epi32(w0);
257 0 : const __m256i wt1 = _mm256_set1_epi32(w1);
258 0 : const __m256i zero = _mm256_setzero_si256();
259 :
260 0 : const __m256i round_const_x = _mm256_set1_epi32(
261 0 : ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
262 0 : const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
263 :
264 0 : const __m256i round_const_y = _mm256_set1_epi32(
265 0 : ((1 << conv_params->round_1) >> 1) -
266 0 : (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
267 0 : const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
268 :
269 0 : const int32_t offset_0 =
270 0 : bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
271 0 : const int32_t offset = (1 << offset_0) + (1 << (offset_0 - 1));
272 0 : const __m256i offset_const = _mm256_set1_epi32(offset);
273 0 : const int32_t rounding_shift =
274 0 : 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
275 0 : const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
276 :
277 : const __m256i clip_pixel_to_bd =
278 0 : _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
279 :
280 0 : prepare_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_x);
281 0 : prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_y);
282 :
283 0 : for (j = 0; j < w; j += 8) {
284 : /* Horizontal filter */
285 : {
286 0 : for (i = 0; i < im_h; i += 2) {
287 : const __m256i row0 =
288 0 : _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
289 0 : __m256i row1 = _mm256_set1_epi16(0);
290 0 : if (i + 1 < im_h)
291 : row1 =
292 0 : _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
293 :
294 0 : const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
295 0 : const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
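        // r0 holds pixels 0..7 of rows i and i + 1 (one row per 128-bit lane), r1
        // holds pixels 8..15 of the same rows. The alignr windows below slide across
        // them so that one 8-tap convolve produces the four even output pixels and a
        // second one the four odd output pixels of each row.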
296 :
297 : // even pixels
298 0 : s[0] = _mm256_alignr_epi8(r1, r0, 0);
299 0 : s[1] = _mm256_alignr_epi8(r1, r0, 4);
300 0 : s[2] = _mm256_alignr_epi8(r1, r0, 8);
301 0 : s[3] = _mm256_alignr_epi8(r1, r0, 12);
302 :
303 0 : __m256i res_even = convolve16_8tap_avx2(s, coeffs_x);
304 0 : res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
305 : round_shift_x);
306 :
307 : // odd pixels
308 0 : s[0] = _mm256_alignr_epi8(r1, r0, 2);
309 0 : s[1] = _mm256_alignr_epi8(r1, r0, 6);
310 0 : s[2] = _mm256_alignr_epi8(r1, r0, 10);
311 0 : s[3] = _mm256_alignr_epi8(r1, r0, 14);
312 :
313 0 : __m256i res_odd = convolve16_8tap_avx2(s, coeffs_x);
314 0 : res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
315 : round_shift_x);
316 :
317 0 : __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
318 0 : __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
319 0 : __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
320 :
321 0 : _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
322 : }
323 : }
324 :
325 : /* Vertical filter */
326 : {
327 0 : __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
328 0 : __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
329 0 : __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
330 0 : __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
331 0 : __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
332 0 : __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
333 :
334 0 : s[0] = _mm256_unpacklo_epi16(s0, s1);
335 0 : s[1] = _mm256_unpacklo_epi16(s2, s3);
336 0 : s[2] = _mm256_unpacklo_epi16(s4, s5);
337 :
338 0 : s[4] = _mm256_unpackhi_epi16(s0, s1);
339 0 : s[5] = _mm256_unpackhi_epi16(s2, s3);
340 0 : s[6] = _mm256_unpackhi_epi16(s4, s5);
341 :
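      // s0..s5 each carry two consecutive intermediate rows (one per lane), so the
      // unpacked pairs in s[] feed the vertical 8-tap filter for two output rows at a
      // time: each pass loads the next two rows, filters rows i and i + 1, and then
      // slides the window down by two rows.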
342 0 : for (i = 0; i < h; i += 2) {
343 0 : const int16_t *data = &im_block[i * im_stride];
344 :
345 : const __m256i s6 =
346 0 : _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
347 : const __m256i s7 =
348 0 : _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
349 :
350 0 : s[3] = _mm256_unpacklo_epi16(s6, s7);
351 0 : s[7] = _mm256_unpackhi_epi16(s6, s7);
352 :
353 0 : const __m256i res_a = convolve16_8tap_avx2(s, coeffs_y);
354 :
355 0 : const __m256i res_a_round = _mm256_sra_epi32(
356 : _mm256_add_epi32(res_a, round_const_y), round_shift_y);
357 :
358 0 : const __m256i res_unsigned_lo =
359 0 : _mm256_add_epi32(res_a_round, offset_const);
360 :
361 0 : if (w - j < 8) {
362 0 : if (do_average) {
363 0 : const __m256i data_0 = _mm256_castsi128_si256(
364 0 : _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
365 0 : const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
366 0 : (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
367 : const __m256i data_01 =
368 0 : _mm256_permute2x128_si256(data_0, data_1, 0x20);
369 :
370 0 : const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
371 :
372 0 : const __m256i comp_avg_res = highbd_comp_avg(
373 : &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
374 :
375 0 : const __m256i round_result = highbd_convolve_rounding(
376 : &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
377 :
378 : const __m256i res_16b =
379 0 : _mm256_packus_epi32(round_result, round_result);
380 : const __m256i res_clip =
381 0 : _mm256_min_epi16(res_16b, clip_pixel_to_bd);
382 :
383 0 : const __m128i res_0 = _mm256_castsi256_si128(res_clip);
384 0 : const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
385 :
386 0 : _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
387 0 : _mm_storel_epi64(
388 0 : (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
389 : }
390 : else {
391 : __m256i res_16b =
392 0 : _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
393 0 : const __m128i res_0 = _mm256_castsi256_si128(res_16b);
394 0 : const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
395 :
396 0 : _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
397 0 : _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
398 : res_1);
399 : }
400 : }
401 : else {
402 0 : const __m256i res_b = convolve16_8tap_avx2(s + 4, coeffs_y);
403 0 : const __m256i res_b_round = _mm256_sra_epi32(
404 : _mm256_add_epi32(res_b, round_const_y), round_shift_y);
405 :
406 0 : __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
407 :
408 0 : if (do_average) {
409 0 : const __m256i data_0 = _mm256_castsi128_si256(
410 0 : _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
411 0 : const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
412 0 : (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
413 : const __m256i data_01 =
414 0 : _mm256_permute2x128_si256(data_0, data_1, 0x20);
415 :
416 0 : const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
417 0 : const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
418 :
419 0 : const __m256i comp_avg_res_lo = highbd_comp_avg(
420 : &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
421 0 : const __m256i comp_avg_res_hi = highbd_comp_avg(
422 : &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
423 :
424 : const __m256i round_result_lo =
425 0 : highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
426 : &rounding_const, rounding_shift);
427 : const __m256i round_result_hi =
428 0 : highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
429 : &rounding_const, rounding_shift);
430 :
431 : const __m256i res_16b =
432 0 : _mm256_packus_epi32(round_result_lo, round_result_hi);
433 : const __m256i res_clip =
434 0 : _mm256_min_epi16(res_16b, clip_pixel_to_bd);
435 :
436 0 : const __m128i res_0 = _mm256_castsi256_si128(res_clip);
437 0 : const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
438 :
439 0 : _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
440 : _mm_store_si128(
441 0 : (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
442 : }
443 : else {
444 : __m256i res_16b =
445 0 : _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
446 0 : const __m128i res_0 = _mm256_castsi256_si128(res_16b);
447 0 : const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
448 :
449 0 : _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
450 0 : _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
451 : res_1);
452 : }
453 : }
454 :
455 0 : s[0] = s[1];
456 0 : s[1] = s[2];
457 0 : s[2] = s[3];
458 :
459 0 : s[4] = s[5];
460 0 : s[5] = s[6];
461 0 : s[6] = s[7];
462 : }
463 : }
464 : }
465 0 : }
466 :
467 0 : void eb_av1_highbd_jnt_convolve_x_avx2(
468 : const uint16_t *src, int32_t src_stride, uint16_t *dst0, int32_t dst_stride0, int32_t w,
469 : int32_t h, const InterpFilterParams *filter_params_x,
470 : const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
471 : const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
472 0 : ConvBufType *dst = conv_params->dst;
473 0 : int32_t dst_stride = conv_params->dst_stride;
474 0 : const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
475 0 : const uint16_t *const src_ptr = src - fo_horiz;
476 0 : const int32_t bits = FILTER_BITS - conv_params->round_1;
477 : (void)filter_params_y;
478 : (void)subpel_y_q4;
479 :
480 : int32_t i, j;
481 : __m256i s[4], coeffs_x[4];
482 :
483 0 : const int32_t do_average = conv_params->do_average;
484 0 : const int32_t use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
485 0 : const int32_t w0 = conv_params->fwd_offset;
486 0 : const int32_t w1 = conv_params->bck_offset;
487 0 : const __m256i wt0 = _mm256_set1_epi32(w0);
488 0 : const __m256i wt1 = _mm256_set1_epi32(w1);
489 0 : const __m256i zero = _mm256_setzero_si256();
490 :
491 : const __m256i round_const_x =
492 0 : _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
493 0 : const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
494 0 : const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
495 :
496 0 : const int32_t offset_0 =
497 0 : bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
498 0 : const int32_t offset = (1 << offset_0) + (1 << (offset_0 - 1));
499 0 : const __m256i offset_const = _mm256_set1_epi32(offset);
500 0 : const int32_t rounding_shift =
501 0 : 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
502 0 : const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
503 : const __m256i clip_pixel_to_bd =
504 0 : _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
505 :
506 0 : assert(bits >= 0);
507 0 : prepare_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_x);
508 :
509 0 : for (j = 0; j < w; j += 8) {
510 : /* Horizontal filter */
511 0 : for (i = 0; i < h; i += 2) {
512 : const __m256i row0 =
513 0 : _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
514 : __m256i row1 =
515 0 : _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
516 :
517 0 : const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
518 0 : const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
519 :
520 : // even pixels
521 0 : s[0] = _mm256_alignr_epi8(r1, r0, 0);
522 0 : s[1] = _mm256_alignr_epi8(r1, r0, 4);
523 0 : s[2] = _mm256_alignr_epi8(r1, r0, 8);
524 0 : s[3] = _mm256_alignr_epi8(r1, r0, 12);
525 :
526 0 : __m256i res_even = convolve16_8tap_avx2(s, coeffs_x);
527 0 : res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
528 : round_shift_x);
529 :
530 : // odd pixels
531 0 : s[0] = _mm256_alignr_epi8(r1, r0, 2);
532 0 : s[1] = _mm256_alignr_epi8(r1, r0, 6);
533 0 : s[2] = _mm256_alignr_epi8(r1, r0, 10);
534 0 : s[3] = _mm256_alignr_epi8(r1, r0, 14);
535 :
536 0 : __m256i res_odd = convolve16_8tap_avx2(s, coeffs_x);
537 0 : res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
538 : round_shift_x);
539 :
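      // Scale the single-pass result up by bits = FILTER_BITS - round_1 so it sits at
      // the same 2 * FILTER_BITS - round_0 - round_1 precision as the two-pass
      // kernels before offset_const is added.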
540 0 : res_even = _mm256_sll_epi32(res_even, round_shift_bits);
541 0 : res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
542 :
543 0 : __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);
544 :
545 0 : __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);
546 :
547 0 : if (w - j < 8) {
548 0 : if (do_average) {
549 0 : const __m256i data_0 = _mm256_castsi128_si256(
550 0 : _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
551 0 : const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
552 0 : (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
553 : const __m256i data_01 =
554 0 : _mm256_permute2x128_si256(data_0, data_1, 0x20);
555 :
556 0 : const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
557 :
558 0 : const __m256i comp_avg_res = highbd_comp_avg(
559 : &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
560 :
561 0 : const __m256i round_result = highbd_convolve_rounding(
562 : &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
563 :
564 : const __m256i res_16b =
565 0 : _mm256_packus_epi32(round_result, round_result);
566 0 : const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
567 :
568 0 : const __m128i res_0 = _mm256_castsi256_si128(res_clip);
569 0 : const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
570 :
571 0 : _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
572 0 : _mm_storel_epi64(
573 0 : (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
574 : }
575 : else {
576 : __m256i res_16b =
577 0 : _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
578 0 : const __m128i res_0 = _mm256_castsi256_si128(res_16b);
579 0 : const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
580 :
581 0 : _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
582 0 : _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
583 : res_1);
584 : }
585 : }
586 : else {
587 0 : __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
588 0 : __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);
589 :
590 0 : if (do_average) {
591 0 : const __m256i data_0 = _mm256_castsi128_si256(
592 0 : _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
593 0 : const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
594 0 : (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
595 : const __m256i data_01 =
596 0 : _mm256_permute2x128_si256(data_0, data_1, 0x20);
597 :
598 0 : const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
599 0 : const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
600 :
601 0 : const __m256i comp_avg_res_lo = highbd_comp_avg(
602 : &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
603 0 : const __m256i comp_avg_res_hi = highbd_comp_avg(
604 : &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
605 :
606 0 : const __m256i round_result_lo = highbd_convolve_rounding(
607 : &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
608 0 : const __m256i round_result_hi = highbd_convolve_rounding(
609 : &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
610 :
611 : const __m256i res_16b =
612 0 : _mm256_packus_epi32(round_result_lo, round_result_hi);
613 0 : const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
614 :
615 0 : const __m128i res_0 = _mm256_castsi256_si128(res_clip);
616 0 : const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
617 :
618 0 : _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
619 0 : _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
620 : res_1);
621 : }
622 : else {
623 : __m256i res_16b =
624 0 : _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
625 0 : const __m128i res_0 = _mm256_castsi256_si128(res_16b);
626 0 : const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
627 :
628 0 : _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
629 0 : _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
630 : res_1);
631 : }
632 : }
633 : }
634 : }
635 0 : }
636 :
637 0 : void eb_av1_highbd_jnt_convolve_y_avx2(
638 : const uint16_t *src, int32_t src_stride, uint16_t *dst0, int32_t dst_stride0, int32_t w,
639 : int32_t h, const InterpFilterParams *filter_params_x,
640 : const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
641 : const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
642 0 : ConvBufType *dst = conv_params->dst;
643 0 : int32_t dst_stride = conv_params->dst_stride;
644 0 : const int32_t fo_vert = filter_params_y->taps / 2 - 1;
645 0 : const uint16_t *const src_ptr = src - fo_vert * src_stride;
646 0 : const int32_t bits = FILTER_BITS - conv_params->round_0;
647 : (void)filter_params_x;
648 : (void)subpel_x_q4;
649 :
650 0 : assert(bits >= 0);
651 : int32_t i, j;
652 : __m256i s[8], coeffs_y[4];
653 0 : const int32_t do_average = conv_params->do_average;
654 0 : const int32_t use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
655 :
656 0 : const int32_t w0 = conv_params->fwd_offset;
657 0 : const int32_t w1 = conv_params->bck_offset;
658 0 : const __m256i wt0 = _mm256_set1_epi32(w0);
659 0 : const __m256i wt1 = _mm256_set1_epi32(w1);
660 : const __m256i round_const_y =
661 0 : _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
662 0 : const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
663 0 : const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
664 :
665 0 : const int32_t offset_0 =
666 0 : bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
667 0 : const int32_t offset = (1 << offset_0) + (1 << (offset_0 - 1));
668 0 : const __m256i offset_const = _mm256_set1_epi32(offset);
669 0 : const int32_t rounding_shift =
670 0 : 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
671 0 : const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
672 : const __m256i clip_pixel_to_bd =
673 0 : _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
674 0 : const __m256i zero = _mm256_setzero_si256();
675 :
676 0 : prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_y);
677 :
678 0 : for (j = 0; j < w; j += 8) {
679 0 : const uint16_t *data = &src_ptr[j];
680 : /* Vertical filter */
681 : {
682 : __m256i src6;
683 0 : __m256i s01 = _mm256_permute2x128_si256(
684 : _mm256_castsi128_si256(
685 : _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
686 : _mm256_castsi128_si256(
687 : _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
688 : 0x20);
689 0 : __m256i s12 = _mm256_permute2x128_si256(
690 : _mm256_castsi128_si256(
691 : _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
692 : _mm256_castsi128_si256(
693 : _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
694 : 0x20);
695 0 : __m256i s23 = _mm256_permute2x128_si256(
696 : _mm256_castsi128_si256(
697 : _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
698 : _mm256_castsi128_si256(
699 : _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
700 : 0x20);
701 0 : __m256i s34 = _mm256_permute2x128_si256(
702 : _mm256_castsi128_si256(
703 : _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
704 : _mm256_castsi128_si256(
705 : _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
706 : 0x20);
707 0 : __m256i s45 = _mm256_permute2x128_si256(
708 : _mm256_castsi128_si256(
709 : _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
710 : _mm256_castsi128_si256(
711 : _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
712 : 0x20);
713 0 : src6 = _mm256_castsi128_si256(
714 0 : _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
715 0 : __m256i s56 = _mm256_permute2x128_si256(
716 : _mm256_castsi128_si256(
717 : _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
718 : src6, 0x20);
719 :
720 0 : s[0] = _mm256_unpacklo_epi16(s01, s12);
721 0 : s[1] = _mm256_unpacklo_epi16(s23, s34);
722 0 : s[2] = _mm256_unpacklo_epi16(s45, s56);
723 :
724 0 : s[4] = _mm256_unpackhi_epi16(s01, s12);
725 0 : s[5] = _mm256_unpackhi_epi16(s23, s34);
726 0 : s[6] = _mm256_unpackhi_epi16(s45, s56);
727 :
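      // s01..s56 each pack two consecutive source rows (row k in the low lane,
      // row k + 1 in the high lane); interleaving adjacent pairs lets one 8-tap
      // convolve produce output rows i and i + 1 together. Each iteration loads rows
      // i + 7 and i + 8, forms the new pairs, filters, and slides the window down by
      // two rows, with src6 carrying the most recently loaded row into the next
      // iteration.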
728 0 : for (i = 0; i < h; i += 2) {
729 0 : data = &src_ptr[i * src_stride + j];
730 :
731 0 : const __m256i s67 = _mm256_permute2x128_si256(
732 : src6,
733 : _mm256_castsi128_si256(
734 : _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
735 : 0x20);
736 :
737 0 : src6 = _mm256_castsi128_si256(
738 0 : _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
739 :
740 0 : const __m256i s78 = _mm256_permute2x128_si256(
741 : _mm256_castsi128_si256(
742 : _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
743 : src6, 0x20);
744 :
745 0 : s[3] = _mm256_unpacklo_epi16(s67, s78);
746 0 : s[7] = _mm256_unpackhi_epi16(s67, s78);
747 :
748 0 : const __m256i res_a = convolve16_8tap_avx2(s, coeffs_y);
749 :
750 0 : __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
751 0 : res_a_round = _mm256_sra_epi32(
752 : _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);
753 :
754 0 : __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);
755 :
756 0 : if (w - j < 8) {
757 0 : if (do_average) {
758 0 : const __m256i data_0 = _mm256_castsi128_si256(
759 0 : _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
760 0 : const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
761 0 : (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
762 : const __m256i data_01 =
763 0 : _mm256_permute2x128_si256(data_0, data_1, 0x20);
764 :
765 0 : const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
766 :
767 0 : const __m256i comp_avg_res = highbd_comp_avg(
768 : &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
769 :
770 0 : const __m256i round_result = highbd_convolve_rounding(
771 : &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
772 :
773 : const __m256i res_16b =
774 0 : _mm256_packus_epi32(round_result, round_result);
775 : const __m256i res_clip =
776 0 : _mm256_min_epi16(res_16b, clip_pixel_to_bd);
777 :
778 0 : const __m128i res_0 = _mm256_castsi256_si128(res_clip);
779 0 : const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
780 :
781 0 : _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
782 0 : _mm_storel_epi64(
783 0 : (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
784 : }
785 : else {
786 : __m256i res_16b =
787 0 : _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
788 0 : const __m128i res_0 = _mm256_castsi256_si128(res_16b);
789 0 : const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
790 :
791 0 : _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
792 0 : _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
793 : res_1);
794 : }
795 : }
796 : else {
797 0 : const __m256i res_b = convolve16_8tap_avx2(s + 4, coeffs_y);
798 0 : __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
799 0 : res_b_round = _mm256_sra_epi32(
800 : _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);
801 :
802 0 : __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
803 :
804 0 : if (do_average) {
805 0 : const __m256i data_0 = _mm256_castsi128_si256(
806 0 : _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
807 0 : const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
808 0 : (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
809 : const __m256i data_01 =
810 0 : _mm256_permute2x128_si256(data_0, data_1, 0x20);
811 :
812 0 : const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
813 0 : const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
814 :
815 0 : const __m256i comp_avg_res_lo = highbd_comp_avg(
816 : &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
817 0 : const __m256i comp_avg_res_hi = highbd_comp_avg(
818 : &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
819 :
820 : const __m256i round_result_lo =
821 0 : highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
822 : &rounding_const, rounding_shift);
823 : const __m256i round_result_hi =
824 0 : highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
825 : &rounding_const, rounding_shift);
826 :
827 : const __m256i res_16b =
828 0 : _mm256_packus_epi32(round_result_lo, round_result_hi);
829 : const __m256i res_clip =
830 0 : _mm256_min_epi16(res_16b, clip_pixel_to_bd);
831 :
832 0 : const __m128i res_0 = _mm256_castsi256_si128(res_clip);
833 0 : const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
834 :
835 0 : _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
836 : _mm_store_si128(
837 0 : (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
838 : }
839 : else {
840 : __m256i res_16b =
841 0 : _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
842 0 : const __m128i res_0 = _mm256_castsi256_si128(res_16b);
843 0 : const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
844 :
845 0 : _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
846 0 : _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
847 : res_1);
848 : }
849 : }
850 0 : s[0] = s[1];
851 0 : s[1] = s[2];
852 0 : s[2] = s[3];
853 :
854 0 : s[4] = s[5];
855 0 : s[5] = s[6];
856 0 : s[6] = s[7];
857 : }
858 : }
859 : }
860 0 : }