Line data Source code
1 : /*
2 : * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h>
13 : #include <assert.h>
14 :
15 : #include "EbDefinitions.h"
16 : #include "aom_dsp_rtcd.h"
17 :
18 : #include "convolve_avx2.h"
19 : #include "synonyms.h"
20 : #include "convolve.h"
21 :
/*
 * High bit-depth 2D sub-pixel convolution (AVX2): an 8-tap horizontal pass
 * into an intermediate buffer followed by an 8-tap vertical pass. The result
 * is rounded, clamped to [0, clip_pixel] and written to dst.
 *
 * src / src_stride : 16-bit source pixels and row stride in elements.
 * dst / dst_stride : 16-bit destination pixels and row stride in elements.
 * w, h             : block size; columns are processed in strips of 8 and
 *                    rows two at a time.
 * filter_params_x/y, subpel_x_q4/y_q4 :
 *                    handed to prepare_coeffs_8tap_avx2 to build the
 *                    horizontal / vertical coefficient registers.
 * conv_params      : supplies round_0 (shift after the horizontal pass) and
 *                    round_1 (shift after the vertical pass).
 * bd               : bit depth; selects the clamp ceiling (1023 for 10,
 *                    4095 for 12, 255 otherwise).
 *
 * NOTE(review): every source row is read with a 16-pixel load and output rows
 * are written in pairs, so callers presumably guarantee readable padding and
 * an even h -- confirm against the call sites.
 */
22 0 : void eb_av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int32_t src_stride,
23 : uint16_t *dst, int32_t dst_stride, int32_t w, int32_t h,
24 : const InterpFilterParams *filter_params_x,
25 : const InterpFilterParams *filter_params_y,
26 : const int32_t subpel_x_q4,
27 : const int32_t subpel_y_q4,
28 : ConvolveParams *conv_params, int32_t bd) {
// Intermediate buffer for the horizontally filtered rows of one 8-column
// strip: im_stride (8) int16 values per row.
29 : DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
// The vertical filter consumes taps - 1 rows in addition to the h output rows.
30 0 : int32_t im_h = h + filter_params_y->taps - 1;
31 0 : int32_t im_stride = 8;
32 : int32_t i, j;
// Move the read pointer up fo_vert rows and left fo_horiz columns so the
// filter window starts before the output position.
33 0 : const int32_t fo_vert = filter_params_y->taps / 2 - 1;
34 0 : const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
35 0 : const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
36 :
37 : // Check that, even with 12-bit input, the intermediate values will fit
38 : // into an unsigned 16-bit intermediate array.
39 0 : assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
40 :
41 : __m256i s[8], coeffs_y[4], coeffs_x[4];
42 :
// Horizontal rounding: half of the round_0 step plus an offset of
// 1 << (bd + FILTER_BITS - 1).
// NOTE(review): the offset presumably keeps the intermediate values
// non-negative before they are packed to 16 bits -- confirm.
43 0 : const __m256i round_const_x = _mm256_set1_epi32(
44 0 : ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
45 0 : const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
46 :
// Vertical rounding: half of the round_1 step minus
// 1 << (bd + 2 * FILTER_BITS - round_0 - 1).
// NOTE(review): presumably removes the horizontal offset after it has been
// scaled by the vertical taps (assumes the taps sum to 1 << FILTER_BITS).
47 0 : const __m256i round_const_y = _mm256_set1_epi32(
48 0 : ((1 << conv_params->round_1) >> 1) -
49 0 : (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
50 0 : const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
51 :
// Remaining shift so that round_0 + round_1 + bits == 2 * FILTER_BITS.
52 0 : const int32_t bits =
53 0 : FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
54 0 : const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
55 0 : const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
// Upper clamp applied to the output pixels for the given bit depth.
56 : const __m256i clip_pixel =
57 0 : _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
58 0 : const __m256i zero = _mm256_setzero_si256();
59 :
60 0 : prepare_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_x);
61 0 : prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_y);
62 :
// Process the block in vertical strips of 8 columns.
63 0 : for (j = 0; j < w; j += 8) {
64 : /* Horizontal filter */
65 : {
// Two source rows per iteration; row1 stays zero when row i + 1 is past
// the last intermediate row.
66 0 : for (i = 0; i < im_h; i += 2) {
67 : const __m256i row0 =
68 0 : _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
69 0 : __m256i row1 = _mm256_set1_epi16(0);
70 0 : if (i + 1 < im_h)
71 : row1 =
72 0 : _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
73 :
// r0 = {row0 pixels 0-7 | row1 pixels 0-7}, r1 = {row0 pixels 8-15 |
// row1 pixels 8-15}: each 128-bit lane then works on one source row.
74 0 : const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
75 0 : const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
76 :
77 : // even pixels
// Per-lane byte shifts of 0/4/8/12 give windows starting at pixels 0, 2, 4, 6.
78 0 : s[0] = _mm256_alignr_epi8(r1, r0, 0);
79 0 : s[1] = _mm256_alignr_epi8(r1, r0, 4);
80 0 : s[2] = _mm256_alignr_epi8(r1, r0, 8);
81 0 : s[3] = _mm256_alignr_epi8(r1, r0, 12);
82 :
83 0 : __m256i res_even = convolve16_8tap_avx2(s, coeffs_x);
84 0 : res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
85 : round_shift_x);
86 :
87 : // odd pixels
// Per-lane byte shifts of 2/6/10/14 give windows starting at pixels 1, 3, 5, 7.
88 0 : s[0] = _mm256_alignr_epi8(r1, r0, 2);
89 0 : s[1] = _mm256_alignr_epi8(r1, r0, 6);
90 0 : s[2] = _mm256_alignr_epi8(r1, r0, 10);
91 0 : s[3] = _mm256_alignr_epi8(r1, r0, 14);
92 :
93 0 : __m256i res_odd = convolve16_8tap_avx2(s, coeffs_x);
94 0 : res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
95 : round_shift_x);
96 :
// Saturate to 16 bits, then interleave even/odd results back into pixel
// order: the low lane holds 8 outputs of row i, the high lane row i + 1.
97 0 : __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
98 0 : __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
99 0 : __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
100 :
// One aligned 32-byte store covers intermediate rows i and i + 1
// (im_stride is 8 int16 values, i is even).
101 0 : _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
102 : }
103 : }
104 :
105 : /* Vertical filter */
106 : {
// Each 32-byte load that starts at row k also picks up row k + 1 in the high
// lane, so the low lane feeds output row i and the high lane output row i + 1.
107 0 : __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
108 0 : __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
109 0 : __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
110 0 : __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
111 0 : __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
112 0 : __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
113 :
// s[0..2]: interleaved row pairs for columns 0-3 of the strip.
114 0 : s[0] = _mm256_unpacklo_epi16(s0, s1);
115 0 : s[1] = _mm256_unpacklo_epi16(s2, s3);
116 0 : s[2] = _mm256_unpacklo_epi16(s4, s5);
117 :
// s[4..6]: the same row pairs for columns 4-7 of the strip.
118 0 : s[4] = _mm256_unpackhi_epi16(s0, s1);
119 0 : s[5] = _mm256_unpackhi_epi16(s2, s3);
120 0 : s[6] = _mm256_unpackhi_epi16(s4, s5);
121 :
// Two output rows per iteration.
122 0 : for (i = 0; i < h; i += 2) {
123 0 : const int16_t *data = &im_block[i * im_stride];
124 :
// Bring in the last row pair needed by this pair of output rows.
125 : const __m256i s6 =
126 0 : _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
127 : const __m256i s7 =
128 0 : _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
129 :
130 0 : s[3] = _mm256_unpacklo_epi16(s6, s7);
131 0 : s[7] = _mm256_unpackhi_epi16(s6, s7);
132 :
// Columns 0-3: round by round_1, then by the remaining 'bits'.
133 0 : const __m256i res_a = convolve16_8tap_avx2(s, coeffs_y);
134 0 : __m256i res_a_round = _mm256_sra_epi32(
135 : _mm256_add_epi32(res_a, round_const_y), round_shift_y);
136 :
137 0 : res_a_round = _mm256_sra_epi32(
138 : _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);
139 :
// More than 4 columns remain in this strip: also filter columns 4-7 and
// store 8 clamped pixels for each of the two rows.
140 0 : if (w - j > 4) {
141 0 : const __m256i res_b = convolve16_8tap_avx2(s + 4, coeffs_y);
142 0 : __m256i res_b_round = _mm256_sra_epi32(
143 : _mm256_add_epi32(res_b, round_const_y), round_shift_y);
144 : res_b_round =
145 0 : _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
146 : round_shift_bits);
147 :
148 0 : __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
149 0 : res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
150 0 : res_16bit = _mm256_max_epi16(res_16bit, zero);
151 :
152 0 : _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
153 : _mm256_castsi256_si128(res_16bit));
154 0 : _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
155 0 : _mm256_extracti128_si256(res_16bit, 1));
156 : }
// Block is exactly 4 wide: store 4 clamped pixels (64 bits) per row.
157 0 : else if (w == 4) {
158 0 : res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
159 0 : res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
160 0 : res_a_round = _mm256_max_epi16(res_a_round, zero);
161 :
162 0 : _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
163 : _mm256_castsi256_si128(res_a_round));
164 0 : _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
165 0 : _mm256_extracti128_si256(res_a_round, 1));
166 : }
// Remaining (narrower) case.
// NOTE(review): xx_storel_32 presumably writes the low 32 bits, i.e. two
// pixels per row -- confirm against its definition.
167 : else {
168 0 : res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
169 0 : res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
170 0 : res_a_round = _mm256_max_epi16(res_a_round, zero);
171 :
172 0 : xx_storel_32((__m128i *)&dst[i * dst_stride + j],
173 : _mm256_castsi256_si128(res_a_round));
174 0 : xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
175 0 : _mm256_extracti128_si256(res_a_round, 1));
176 : }
177 :
// Slide the tap window down by two rows for the next iteration.
178 0 : s[0] = s[1];
179 0 : s[1] = s[2];
180 0 : s[2] = s[3];
181 :
182 0 : s[4] = s[5];
183 0 : s[5] = s[6];
184 0 : s[6] = s[7];
185 : }
186 : }
187 : }
188 0 : }
189 :
190 0 : static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
191 : __m256i s[4];
192 0 : s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
193 0 : s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
194 0 : s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
195 0 : s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
196 0 : _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
197 0 : _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
198 0 : _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
199 0 : _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
200 0 : }
201 :
202 0 : static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
203 : __m256i s[8];
204 0 : s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
205 0 : s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
206 0 : s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
207 0 : s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
208 0 : s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
209 0 : s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
210 0 : s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
211 0 : s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
212 :
213 0 : _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
214 0 : _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
215 0 : _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
216 0 : _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
217 0 : _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
218 0 : _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
219 0 : _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
220 0 : _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
221 0 : }
222 :
223 0 : void eb_av1_highbd_convolve_2d_copy_sr_avx2(
224 : const uint16_t *src, int32_t src_stride, uint16_t *dst, int32_t dst_stride, int32_t w,
225 : int32_t h, const InterpFilterParams *filter_params_x,
226 : const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
227 : const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
228 : (void)filter_params_x;
229 : (void)filter_params_y;
230 : (void)subpel_x_q4;
231 : (void)subpel_y_q4;
232 : (void)conv_params;
233 : (void)bd;
234 :
235 0 : if (w >= 16) {
236 0 : assert(!((intptr_t)dst % 16));
237 0 : assert(!(dst_stride % 16));
238 : }
239 :
240 0 : if (w == 2) {
241 : do {
242 0 : memcpy(dst, src, 2 * sizeof(*src));
243 0 : src += src_stride;
244 0 : dst += dst_stride;
245 0 : memcpy(dst, src, 2 * sizeof(*src));
246 0 : src += src_stride;
247 0 : dst += dst_stride;
248 0 : h -= 2;
249 0 : } while (h);
250 : }
251 0 : else if (w == 4) {
252 : do {
253 : __m128i s[2];
254 0 : s[0] = _mm_loadl_epi64((__m128i *)src);
255 0 : src += src_stride;
256 0 : s[1] = _mm_loadl_epi64((__m128i *)src);
257 0 : src += src_stride;
258 0 : _mm_storel_epi64((__m128i *)dst, s[0]);
259 0 : dst += dst_stride;
260 0 : _mm_storel_epi64((__m128i *)dst, s[1]);
261 0 : dst += dst_stride;
262 0 : h -= 2;
263 0 : } while (h);
264 : }
265 0 : else if (w == 8) {
266 : do {
267 : __m128i s[2];
268 0 : s[0] = _mm_loadu_si128((__m128i *)src);
269 0 : src += src_stride;
270 0 : s[1] = _mm_loadu_si128((__m128i *)src);
271 0 : src += src_stride;
272 0 : _mm_store_si128((__m128i *)dst, s[0]);
273 0 : dst += dst_stride;
274 0 : _mm_store_si128((__m128i *)dst, s[1]);
275 0 : dst += dst_stride;
276 0 : h -= 2;
277 0 : } while (h);
278 : }
279 0 : else if (w == 16) {
280 : do {
281 : __m256i s[2];
282 0 : s[0] = _mm256_loadu_si256((__m256i *)src);
283 0 : src += src_stride;
284 0 : s[1] = _mm256_loadu_si256((__m256i *)src);
285 0 : src += src_stride;
286 0 : _mm256_storeu_si256((__m256i *)dst, s[0]);
287 0 : dst += dst_stride;
288 0 : _mm256_storeu_si256((__m256i *)dst, s[1]);
289 0 : dst += dst_stride;
290 0 : h -= 2;
291 0 : } while (h);
292 : }
293 0 : else if (w == 32) {
294 : do {
295 : __m256i s[4];
296 0 : s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
297 0 : s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
298 0 : src += src_stride;
299 0 : s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
300 0 : s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
301 0 : src += src_stride;
302 0 : _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
303 0 : _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
304 0 : dst += dst_stride;
305 0 : _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
306 0 : _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
307 0 : dst += dst_stride;
308 0 : h -= 2;
309 0 : } while (h);
310 : }
311 0 : else if (w == 64) {
312 : do {
313 0 : copy_64(src, dst);
314 0 : src += src_stride;
315 0 : dst += dst_stride;
316 0 : copy_64(src, dst);
317 0 : src += src_stride;
318 0 : dst += dst_stride;
319 0 : h -= 2;
320 0 : } while (h);
321 : }
322 : else {
323 : do {
324 0 : copy_128(src, dst);
325 0 : src += src_stride;
326 0 : dst += dst_stride;
327 0 : copy_128(src, dst);
328 0 : src += src_stride;
329 0 : dst += dst_stride;
330 0 : h -= 2;
331 0 : } while (h);
332 : }
333 0 : }
|