/*
 * Copyright(c) 2019 Intel Corporation
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */

/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */

#include "EbDefinitions.h"
#include "aom_dsp_rtcd.h"

#include "highbd_txfm_utility_sse4.h"
#include "EbTransforms.h"

typedef enum ATTRIBUTE_PACKED {
    IDCT_1D,
    IADST_1D,
    IFLIPADST_1D = IADST_1D,
    IIDENTITY_1D,
    ITX_TYPES_1D,
} ITX_TYPE_1D;

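// Lookup tables giving, for each of the 16 2D transform types, the 1D
// transform applied vertically (vitx_1d_tab) and horizontally (hitx_1d_tab).
// FLIPADST reuses the IADST_1D kernel; the mirroring is applied at write-out.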
static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
    IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
    IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
    IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
    IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
};

static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
    IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
    IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
    IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
    IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
};

//const int8_t **inv_cos_bit_row;
//const int8_t **inv_cos_bit_col;
// Transform block width in log2
//static const int32_t tx_size_wide_log2[TX_SIZES_ALL] = {
//  2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6,
//};
//
//// Transform block height in log2
//static const int32_t tx_size_high_log2[TX_SIZES_ALL] = {
//  2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4,
//};

typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int32_t bit,
    int32_t do_cols, int32_t bd, int32_t out_shift);

static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
    in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
    in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
    in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
    in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
}

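// Butterfly helper: *out0 = in0 + in1 and *out1 = in0 - in1, with both
// results clamped to [*clamp_lo, *clamp_hi] so intermediates stay inside the
// range allowed for the current bit depth. The _no_clamp variant omits the
// clamping.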
static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
    __m128i *out1, const __m128i *clamp_lo,
    const __m128i *clamp_hi) {
    __m128i a0 = _mm_add_epi32(in0, in1);
    __m128i a1 = _mm_sub_epi32(in0, in1);

    a0 = _mm_max_epi32(a0, *clamp_lo);
    a0 = _mm_min_epi32(a0, *clamp_hi);
    a1 = _mm_max_epi32(a1, *clamp_lo);
    a1 = _mm_min_epi32(a1, *clamp_hi);

    *out0 = a0;
    *out1 = a1;
}

static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
    __m128i *out0, __m128i *out1) {
    __m128i a0 = _mm_add_epi32(in0, in1);
    __m128i a1 = _mm_sub_epi32(in0, in1);

    *out0 = a0;
    *out1 = a1;
}

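// 4-point inverse DCT on a 4x4 tile held in four __m128i rows. The unpack
// sequence first transposes the tile in registers; each cospi product is
// then rounded by 1 << (bit - 1) before the arithmetic shift right by bit.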
static void idct4x4_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
    int32_t bd, int32_t out_shift) {
    (void)out;
    (void)do_cols;
    (void)bd;
    (void)out_shift;
    const int32_t *cospi = cospi_arr(bit);
    const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
    const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
    const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
    const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
    const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    __m128i u0, u1, u2, u3;
    __m128i v0, v1, v2, v3, x, y;

    v0 = _mm_unpacklo_epi32(in[0], in[1]);
    v1 = _mm_unpackhi_epi32(in[0], in[1]);
    v2 = _mm_unpacklo_epi32(in[2], in[3]);
    v3 = _mm_unpackhi_epi32(in[2], in[3]);

    u0 = _mm_unpacklo_epi64(v0, v2);
    u1 = _mm_unpackhi_epi64(v0, v2);
    u2 = _mm_unpacklo_epi64(v1, v3);
    u3 = _mm_unpackhi_epi64(v1, v3);

    x = _mm_mullo_epi32(u0, cospi32);
    y = _mm_mullo_epi32(u2, cospi32);
    v0 = _mm_add_epi32(x, y);
    v0 = _mm_add_epi32(v0, rnding);
    v0 = _mm_srai_epi32(v0, bit);

    v1 = _mm_sub_epi32(x, y);
    v1 = _mm_add_epi32(v1, rnding);
    v1 = _mm_srai_epi32(v1, bit);

    x = _mm_mullo_epi32(u1, cospi48);
    y = _mm_mullo_epi32(u3, cospim16);
    v2 = _mm_add_epi32(x, y);
    v2 = _mm_add_epi32(v2, rnding);
    v2 = _mm_srai_epi32(v2, bit);

    x = _mm_mullo_epi32(u1, cospi16);
    y = _mm_mullo_epi32(u3, cospi48);
    v3 = _mm_add_epi32(x, y);
    v3 = _mm_add_epi32(v3, rnding);
    v3 = _mm_srai_epi32(v3, bit);

    in[0] = _mm_add_epi32(v0, v3);
    in[1] = _mm_add_epi32(v1, v2);
    in[2] = _mm_sub_epi32(v1, v2);
    in[3] = _mm_sub_epi32(v0, v3);
}

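// 4-point inverse ADST on a 4x4 tile. The 4-point ADST is not a butterfly
// network, so it is computed directly from the sinpi constants after the
// same in-register transpose used by idct4x4_sse4_1.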
static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
    int32_t bd, int32_t out_shift) {
    (void)out;
    (void)do_cols;
    (void)bd;
    (void)out_shift;
    const int32_t *sinpi = sinpi_arr(bit);
    const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    const __m128i sinpi1 = _mm_set1_epi32((int32_t)sinpi[1]);
    const __m128i sinpi2 = _mm_set1_epi32((int32_t)sinpi[2]);
    const __m128i sinpi3 = _mm_set1_epi32((int32_t)sinpi[3]);
    const __m128i sinpi4 = _mm_set1_epi32((int32_t)sinpi[4]);
    __m128i t;
    __m128i s0, s1, s2, s3, s4, s5, s6, s7;
    __m128i x0, x1, x2, x3;
    __m128i u0, u1, u2, u3;
    __m128i v0, v1, v2, v3;

    v0 = _mm_unpacklo_epi32(in[0], in[1]);
    v1 = _mm_unpackhi_epi32(in[0], in[1]);
    v2 = _mm_unpacklo_epi32(in[2], in[3]);
    v3 = _mm_unpackhi_epi32(in[2], in[3]);

    x0 = _mm_unpacklo_epi64(v0, v2);
    x1 = _mm_unpackhi_epi64(v0, v2);
    x2 = _mm_unpacklo_epi64(v1, v3);
    x3 = _mm_unpackhi_epi64(v1, v3);

    s0 = _mm_mullo_epi32(x0, sinpi1);
    s1 = _mm_mullo_epi32(x0, sinpi2);
    s2 = _mm_mullo_epi32(x1, sinpi3);
    s3 = _mm_mullo_epi32(x2, sinpi4);
    s4 = _mm_mullo_epi32(x2, sinpi1);
    s5 = _mm_mullo_epi32(x3, sinpi2);
    s6 = _mm_mullo_epi32(x3, sinpi4);
    t = _mm_sub_epi32(x0, x2);
    s7 = _mm_add_epi32(t, x3);

    t = _mm_add_epi32(s0, s3);
    s0 = _mm_add_epi32(t, s5);
    t = _mm_sub_epi32(s1, s4);
    s1 = _mm_sub_epi32(t, s6);
    s3 = s2;
    s2 = _mm_mullo_epi32(s7, sinpi3);

    u0 = _mm_add_epi32(s0, s3);
    u1 = _mm_add_epi32(s1, s3);
    u2 = s2;
    t = _mm_add_epi32(s0, s1);
    u3 = _mm_sub_epi32(t, s3);

    u0 = _mm_add_epi32(u0, rnding);
    u0 = _mm_srai_epi32(u0, bit);

    u1 = _mm_add_epi32(u1, rnding);
    u1 = _mm_srai_epi32(u1, bit);

    u2 = _mm_add_epi32(u2, rnding);
    u2 = _mm_srai_epi32(u2, bit);

    u3 = _mm_add_epi32(u3, rnding);
    u3 = _mm_srai_epi32(u3, bit);

    in[0] = u0;
    in[1] = u1;
    in[2] = u2;
    in[3] = u3;
}

static INLINE void round_shift_4x4(__m128i *in, int32_t shift) {
    __m128i rnding = _mm_set1_epi32(1 << (shift - 1));

    in[0] = _mm_add_epi32(in[0], rnding);
    in[1] = _mm_add_epi32(in[1], rnding);
    in[2] = _mm_add_epi32(in[2], rnding);
    in[3] = _mm_add_epi32(in[3], rnding);

    in[0] = _mm_srai_epi32(in[0], shift);
    in[1] = _mm_srai_epi32(in[1], shift);
    in[2] = _mm_srai_epi32(in[2], shift);
    in[3] = _mm_srai_epi32(in[3], shift);
}

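// Clamp packed 16-bit pixels to [0, (1 << bd) - 1]: values above the maximum
// are replaced by the maximum, then negative values are zeroed.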
static INLINE __m128i highbd_clamp_epi16(__m128i u, int32_t bd) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i one = _mm_set1_epi16(1);
    const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
    __m128i clamped, mask;

    mask = _mm_cmpgt_epi16(u, max);
    clamped = _mm_andnot_si128(mask, u);
    mask = _mm_and_si128(mask, max);
    clamped = _mm_or_si128(mask, clamped);
    mask = _mm_cmpgt_epi16(clamped, zero);
    clamped = _mm_and_si128(clamped, mask);

    return clamped;
}

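// Round-shift the 4x4 residual, add it to the prediction read from output_r,
// clamp to the bit depth, and store the reconstruction to output_w. fliplr
// and flipud mirror the residual for the FLIPADST transform variants.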
static void write_buffer_4x4(__m128i *in,
    uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    int32_t fliplr, int32_t flipud, int32_t shift, int32_t bd) {
    const __m128i zero = _mm_setzero_si128();
    __m128i u0, u1, u2, u3;
    __m128i v0, v1, v2, v3;

    round_shift_4x4(in, shift);

    v0 = _mm_loadl_epi64((__m128i const *)(output_r + 0 * stride_r));
    v1 = _mm_loadl_epi64((__m128i const *)(output_r + 1 * stride_r));
    v2 = _mm_loadl_epi64((__m128i const *)(output_r + 2 * stride_r));
    v3 = _mm_loadl_epi64((__m128i const *)(output_r + 3 * stride_r));

    v0 = _mm_unpacklo_epi16(v0, zero);
    v1 = _mm_unpacklo_epi16(v1, zero);
    v2 = _mm_unpacklo_epi16(v2, zero);
    v3 = _mm_unpacklo_epi16(v3, zero);

    if (fliplr) {
        in[0] = _mm_shuffle_epi32(in[0], 0x1B);
        in[1] = _mm_shuffle_epi32(in[1], 0x1B);
        in[2] = _mm_shuffle_epi32(in[2], 0x1B);
        in[3] = _mm_shuffle_epi32(in[3], 0x1B);
    }

    if (flipud) {
        u0 = _mm_add_epi32(in[3], v0);
        u1 = _mm_add_epi32(in[2], v1);
        u2 = _mm_add_epi32(in[1], v2);
        u3 = _mm_add_epi32(in[0], v3);
    }
    else {
        u0 = _mm_add_epi32(in[0], v0);
        u1 = _mm_add_epi32(in[1], v1);
        u2 = _mm_add_epi32(in[2], v2);
        u3 = _mm_add_epi32(in[3], v3);
    }

    v0 = _mm_packus_epi32(u0, u1);
    v2 = _mm_packus_epi32(u2, u3);

    u0 = highbd_clamp_epi16(v0, bd);
    u2 = highbd_clamp_epi16(v2, bd);

    v0 = _mm_unpacklo_epi64(u0, u0);
    v1 = _mm_unpackhi_epi64(u0, u0);
    v2 = _mm_unpacklo_epi64(u2, u2);
    v3 = _mm_unpackhi_epi64(u2, u2);

    _mm_storel_epi64((__m128i *)(output_w + 0 * stride_w), v0);
    _mm_storel_epi64((__m128i *)(output_w + 1 * stride_w), v1);
    _mm_storel_epi64((__m128i *)(output_w + 2 * stride_w), v2);
    _mm_storel_epi64((__m128i *)(output_w + 3 * stride_w), v3);
}

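// 2D inverse transform plus reconstruction for a 4x4 block: a row pass
// followed by a column pass, selected by tx_type. The 4x4 kernels transpose
// internally, so no explicit transpose is needed here, and shift[0] is zero
// for TX_4X4, so only the final -shift[1] is applied at write-out.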
void eb_av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input,
    uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    TxType tx_type, int32_t bd) {
    __m128i in[4];
    const int8_t *shift = eb_inv_txfm_shift_ls[TX_4X4];
    const int32_t txw_idx = get_txw_idx(TX_4X4);
    const int32_t txh_idx = get_txh_idx(TX_4X4);

    switch (tx_type) {
    case DCT_DCT:
        load_buffer_4x4(input, in);
        idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
        idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
        write_buffer_4x4(in, output_r, stride_r, output_w, stride_w,
            0, 0, -shift[1], bd);
        break;
    case ADST_DCT:
        load_buffer_4x4(input, in);
        idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
        iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
        write_buffer_4x4(in, output_r, stride_r, output_w, stride_w,
            0, 0, -shift[1], bd);
        break;
    case DCT_ADST:
        load_buffer_4x4(input, in);
        iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
        idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
        write_buffer_4x4(in, output_r, stride_r, output_w, stride_w,
            0, 0, -shift[1], bd);
        break;
    case ADST_ADST:
        load_buffer_4x4(input, in);
        iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
        iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
        write_buffer_4x4(in, output_r, stride_r, output_w, stride_w,
            0, 0, -shift[1], bd);
        break;
    case FLIPADST_DCT:
        load_buffer_4x4(input, in);
        idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
        iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
        write_buffer_4x4(in, output_r, stride_r, output_w, stride_w,
            0, 1, -shift[1], bd);
        break;
    case DCT_FLIPADST:
        load_buffer_4x4(input, in);
        iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
        idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
        write_buffer_4x4(in, output_r, stride_r, output_w, stride_w,
            1, 0, -shift[1], bd);
        break;
    case FLIPADST_FLIPADST:
        load_buffer_4x4(input, in);
        iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
        iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
        write_buffer_4x4(in, output_r, stride_r, output_w, stride_w,
            1, 1, -shift[1], bd);
        break;
    case ADST_FLIPADST:
        load_buffer_4x4(input, in);
        iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
        iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
        write_buffer_4x4(in, output_r, stride_r, output_w, stride_w,
            1, 0, -shift[1], bd);
        break;
    case FLIPADST_ADST:
        load_buffer_4x4(input, in);
        iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
        iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
        write_buffer_4x4(in, output_r, stride_r, output_w, stride_w,
            0, 1, -shift[1], bd);
        break;
    default: assert(0);
    }
}

// 8x8
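// An 8x8 tile of 32-bit coefficients occupies 16 __m128i registers, two per
// row: in[2 * r] holds columns 0-3 of row r and in[2 * r + 1] columns 4-7.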
static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
    in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
    in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
    in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
    in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
    in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
    in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
    in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
    in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
    in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
    in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
    in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
    in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
    in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
    in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
    in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
    in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
}

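// 8-point inverse DCT over an 8x8 tile. Each loop iteration transforms four
// pixel columns at once: col = 0 covers the even-indexed registers (left
// half of every row), col = 1 the odd-indexed ones, applying the idct8
// butterfly stages with rounded cospi rotations.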
static void idct8x8_sse4_1(__m128i *in, __m128i *out, int32_t bit) {
    const int32_t *cospi = cospi_arr(bit);
    const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
    const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
    const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
    const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
    const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
    const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
    const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
    const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
    const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
    const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
    const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    __m128i u0, u1, u2, u3, u4, u5, u6, u7;
    __m128i v0, v1, v2, v3, v4, v5, v6, v7;
    __m128i x, y;
    int32_t col;

    // Note:
    // Even columns: 0, 2, ..., 14
    // Odd columns: 1, 3, ..., 15
    // One even column plus one odd column constructs one row (8 coeffs);
    // in total we have 8 rows (8x8).
    for (col = 0; col < 2; ++col) {
        // stage 0
        // stage 1
        // stage 2
        u0 = in[0 * 2 + col];
        u1 = in[4 * 2 + col];
        u2 = in[2 * 2 + col];
        u3 = in[6 * 2 + col];

        x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
        y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
        u4 = _mm_add_epi32(x, y);
        u4 = _mm_add_epi32(u4, rnding);
        u4 = _mm_srai_epi32(u4, bit);

        x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
        y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
        u7 = _mm_add_epi32(x, y);
        u7 = _mm_add_epi32(u7, rnding);
        u7 = _mm_srai_epi32(u7, bit);

        x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
        y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
        u5 = _mm_add_epi32(x, y);
        u5 = _mm_add_epi32(u5, rnding);
        u5 = _mm_srai_epi32(u5, bit);

        x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
        y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
        u6 = _mm_add_epi32(x, y);
        u6 = _mm_add_epi32(u6, rnding);
        u6 = _mm_srai_epi32(u6, bit);

        // stage 3
        x = _mm_mullo_epi32(u0, cospi32);
        y = _mm_mullo_epi32(u1, cospi32);
        v0 = _mm_add_epi32(x, y);
        v0 = _mm_add_epi32(v0, rnding);
        v0 = _mm_srai_epi32(v0, bit);

        v1 = _mm_sub_epi32(x, y);
        v1 = _mm_add_epi32(v1, rnding);
        v1 = _mm_srai_epi32(v1, bit);

        x = _mm_mullo_epi32(u2, cospi48);
        y = _mm_mullo_epi32(u3, cospim16);
        v2 = _mm_add_epi32(x, y);
        v2 = _mm_add_epi32(v2, rnding);
        v2 = _mm_srai_epi32(v2, bit);

        x = _mm_mullo_epi32(u2, cospi16);
        y = _mm_mullo_epi32(u3, cospi48);
        v3 = _mm_add_epi32(x, y);
        v3 = _mm_add_epi32(v3, rnding);
        v3 = _mm_srai_epi32(v3, bit);

        v4 = _mm_add_epi32(u4, u5);
        v5 = _mm_sub_epi32(u4, u5);
        v6 = _mm_sub_epi32(u7, u6);
        v7 = _mm_add_epi32(u6, u7);

        // stage 4
        u0 = _mm_add_epi32(v0, v3);
        u1 = _mm_add_epi32(v1, v2);
        u2 = _mm_sub_epi32(v1, v2);
        u3 = _mm_sub_epi32(v0, v3);
        u4 = v4;
        u7 = v7;

        x = _mm_mullo_epi32(v5, cospi32);
        y = _mm_mullo_epi32(v6, cospi32);
        u6 = _mm_add_epi32(y, x);
        u6 = _mm_add_epi32(u6, rnding);
        u6 = _mm_srai_epi32(u6, bit);

        u5 = _mm_sub_epi32(y, x);
        u5 = _mm_add_epi32(u5, rnding);
        u5 = _mm_srai_epi32(u5, bit);

        // stage 5
        out[0 * 2 + col] = _mm_add_epi32(u0, u7);
        out[1 * 2 + col] = _mm_add_epi32(u1, u6);
        out[2 * 2 + col] = _mm_add_epi32(u2, u5);
        out[3 * 2 + col] = _mm_add_epi32(u3, u4);
        out[4 * 2 + col] = _mm_sub_epi32(u3, u4);
        out[5 * 2 + col] = _mm_sub_epi32(u2, u5);
        out[6 * 2 + col] = _mm_sub_epi32(u1, u6);
        out[7 * 2 + col] = _mm_sub_epi32(u0, u7);
    }
}

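// 8-point inverse ADST over an 8x8 tile, again split into the even-indexed
// registers (left half) and the odd-indexed ones (right half). Stages 2-6
// are rounded cospi rotations and add/sub butterflies; stage 7 negates
// alternate outputs to realize the ADST output sign pattern.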
static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int32_t bit) {
    const int32_t *cospi = cospi_arr(bit);
    const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
    const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
    const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
    const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
    const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
    const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
    const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
    const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
    const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
    const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
    const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
    const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
    const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    const __m128i kZero = _mm_setzero_si128();
    __m128i u[8], v[8], x;

    // Even 8 points: 0, 2, ..., 14
    // stage 0
    // stage 1
    // stage 2
    // (1)
    u[0] = _mm_mullo_epi32(in[14], cospi4);
    x = _mm_mullo_epi32(in[0], cospi60);
    u[0] = _mm_add_epi32(u[0], x);
    u[0] = _mm_add_epi32(u[0], rnding);
    u[0] = _mm_srai_epi32(u[0], bit);

    u[1] = _mm_mullo_epi32(in[14], cospi60);
    x = _mm_mullo_epi32(in[0], cospi4);
    u[1] = _mm_sub_epi32(u[1], x);
    u[1] = _mm_add_epi32(u[1], rnding);
    u[1] = _mm_srai_epi32(u[1], bit);

    // (2)
    u[2] = _mm_mullo_epi32(in[10], cospi20);
    x = _mm_mullo_epi32(in[4], cospi44);
    u[2] = _mm_add_epi32(u[2], x);
    u[2] = _mm_add_epi32(u[2], rnding);
    u[2] = _mm_srai_epi32(u[2], bit);

    u[3] = _mm_mullo_epi32(in[10], cospi44);
    x = _mm_mullo_epi32(in[4], cospi20);
    u[3] = _mm_sub_epi32(u[3], x);
    u[3] = _mm_add_epi32(u[3], rnding);
    u[3] = _mm_srai_epi32(u[3], bit);

    // (3)
    u[4] = _mm_mullo_epi32(in[6], cospi36);
    x = _mm_mullo_epi32(in[8], cospi28);
    u[4] = _mm_add_epi32(u[4], x);
    u[4] = _mm_add_epi32(u[4], rnding);
    u[4] = _mm_srai_epi32(u[4], bit);

    u[5] = _mm_mullo_epi32(in[6], cospi28);
    x = _mm_mullo_epi32(in[8], cospi36);
    u[5] = _mm_sub_epi32(u[5], x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    // (4)
    u[6] = _mm_mullo_epi32(in[2], cospi52);
    x = _mm_mullo_epi32(in[12], cospi12);
    u[6] = _mm_add_epi32(u[6], x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = _mm_mullo_epi32(in[2], cospi12);
    x = _mm_mullo_epi32(in[12], cospi52);
    u[7] = _mm_sub_epi32(u[7], x);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    // stage 3
    v[0] = _mm_add_epi32(u[0], u[4]);
    v[4] = _mm_sub_epi32(u[0], u[4]);
    v[1] = _mm_add_epi32(u[1], u[5]);
    v[5] = _mm_sub_epi32(u[1], u[5]);
    v[2] = _mm_add_epi32(u[2], u[6]);
    v[6] = _mm_sub_epi32(u[2], u[6]);
    v[3] = _mm_add_epi32(u[3], u[7]);
    v[7] = _mm_sub_epi32(u[3], u[7]);

    // stage 4
    u[0] = v[0];
    u[1] = v[1];
    u[2] = v[2];
    u[3] = v[3];

    u[4] = _mm_mullo_epi32(v[4], cospi16);
    x = _mm_mullo_epi32(v[5], cospi48);
    u[4] = _mm_add_epi32(u[4], x);
    u[4] = _mm_add_epi32(u[4], rnding);
    u[4] = _mm_srai_epi32(u[4], bit);

    u[5] = _mm_mullo_epi32(v[4], cospi48);
    x = _mm_mullo_epi32(v[5], cospi16);
    u[5] = _mm_sub_epi32(u[5], x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    u[6] = _mm_mullo_epi32(v[6], cospim48);
    x = _mm_mullo_epi32(v[7], cospi16);
    u[6] = _mm_add_epi32(u[6], x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = _mm_mullo_epi32(v[6], cospi16);
    x = _mm_mullo_epi32(v[7], cospim48);
    u[7] = _mm_sub_epi32(u[7], x);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    // stage 5
    v[0] = _mm_add_epi32(u[0], u[2]);
    v[2] = _mm_sub_epi32(u[0], u[2]);
    v[1] = _mm_add_epi32(u[1], u[3]);
    v[3] = _mm_sub_epi32(u[1], u[3]);
    v[4] = _mm_add_epi32(u[4], u[6]);
    v[6] = _mm_sub_epi32(u[4], u[6]);
    v[5] = _mm_add_epi32(u[5], u[7]);
    v[7] = _mm_sub_epi32(u[5], u[7]);

    // stage 6
    u[0] = v[0];
    u[1] = v[1];
    u[4] = v[4];
    u[5] = v[5];

    v[0] = _mm_mullo_epi32(v[2], cospi32);
    x = _mm_mullo_epi32(v[3], cospi32);
    u[2] = _mm_add_epi32(v[0], x);
    u[2] = _mm_add_epi32(u[2], rnding);
    u[2] = _mm_srai_epi32(u[2], bit);

    u[3] = _mm_sub_epi32(v[0], x);
    u[3] = _mm_add_epi32(u[3], rnding);
    u[3] = _mm_srai_epi32(u[3], bit);

    v[0] = _mm_mullo_epi32(v[6], cospi32);
    x = _mm_mullo_epi32(v[7], cospi32);
    u[6] = _mm_add_epi32(v[0], x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = _mm_sub_epi32(v[0], x);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    // stage 7
    out[0] = u[0];
    out[2] = _mm_sub_epi32(kZero, u[4]);
    out[4] = u[6];
    out[6] = _mm_sub_epi32(kZero, u[2]);
    out[8] = u[3];
    out[10] = _mm_sub_epi32(kZero, u[7]);
    out[12] = u[5];
    out[14] = _mm_sub_epi32(kZero, u[1]);

    // Odd 8 points: 1, 3, ..., 15
    // stage 0
    // stage 1
    // stage 2
    // (1)
    u[0] = _mm_mullo_epi32(in[15], cospi4);
    x = _mm_mullo_epi32(in[1], cospi60);
    u[0] = _mm_add_epi32(u[0], x);
    u[0] = _mm_add_epi32(u[0], rnding);
    u[0] = _mm_srai_epi32(u[0], bit);

    u[1] = _mm_mullo_epi32(in[15], cospi60);
    x = _mm_mullo_epi32(in[1], cospi4);
    u[1] = _mm_sub_epi32(u[1], x);
    u[1] = _mm_add_epi32(u[1], rnding);
    u[1] = _mm_srai_epi32(u[1], bit);

    // (2)
    u[2] = _mm_mullo_epi32(in[11], cospi20);
    x = _mm_mullo_epi32(in[5], cospi44);
    u[2] = _mm_add_epi32(u[2], x);
    u[2] = _mm_add_epi32(u[2], rnding);
    u[2] = _mm_srai_epi32(u[2], bit);

    u[3] = _mm_mullo_epi32(in[11], cospi44);
    x = _mm_mullo_epi32(in[5], cospi20);
    u[3] = _mm_sub_epi32(u[3], x);
    u[3] = _mm_add_epi32(u[3], rnding);
    u[3] = _mm_srai_epi32(u[3], bit);

    // (3)
    u[4] = _mm_mullo_epi32(in[7], cospi36);
    x = _mm_mullo_epi32(in[9], cospi28);
    u[4] = _mm_add_epi32(u[4], x);
    u[4] = _mm_add_epi32(u[4], rnding);
    u[4] = _mm_srai_epi32(u[4], bit);

    u[5] = _mm_mullo_epi32(in[7], cospi28);
    x = _mm_mullo_epi32(in[9], cospi36);
    u[5] = _mm_sub_epi32(u[5], x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    // (4)
    u[6] = _mm_mullo_epi32(in[3], cospi52);
    x = _mm_mullo_epi32(in[13], cospi12);
    u[6] = _mm_add_epi32(u[6], x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = _mm_mullo_epi32(in[3], cospi12);
    x = _mm_mullo_epi32(in[13], cospi52);
    u[7] = _mm_sub_epi32(u[7], x);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    // stage 3
    v[0] = _mm_add_epi32(u[0], u[4]);
    v[4] = _mm_sub_epi32(u[0], u[4]);
    v[1] = _mm_add_epi32(u[1], u[5]);
    v[5] = _mm_sub_epi32(u[1], u[5]);
    v[2] = _mm_add_epi32(u[2], u[6]);
    v[6] = _mm_sub_epi32(u[2], u[6]);
    v[3] = _mm_add_epi32(u[3], u[7]);
    v[7] = _mm_sub_epi32(u[3], u[7]);

    // stage 4
    u[0] = v[0];
    u[1] = v[1];
    u[2] = v[2];
    u[3] = v[3];

    u[4] = _mm_mullo_epi32(v[4], cospi16);
    x = _mm_mullo_epi32(v[5], cospi48);
    u[4] = _mm_add_epi32(u[4], x);
    u[4] = _mm_add_epi32(u[4], rnding);
    u[4] = _mm_srai_epi32(u[4], bit);

    u[5] = _mm_mullo_epi32(v[4], cospi48);
    x = _mm_mullo_epi32(v[5], cospi16);
    u[5] = _mm_sub_epi32(u[5], x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    u[6] = _mm_mullo_epi32(v[6], cospim48);
    x = _mm_mullo_epi32(v[7], cospi16);
    u[6] = _mm_add_epi32(u[6], x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = _mm_mullo_epi32(v[6], cospi16);
    x = _mm_mullo_epi32(v[7], cospim48);
    u[7] = _mm_sub_epi32(u[7], x);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    // stage 5
    v[0] = _mm_add_epi32(u[0], u[2]);
    v[2] = _mm_sub_epi32(u[0], u[2]);
    v[1] = _mm_add_epi32(u[1], u[3]);
    v[3] = _mm_sub_epi32(u[1], u[3]);
    v[4] = _mm_add_epi32(u[4], u[6]);
    v[6] = _mm_sub_epi32(u[4], u[6]);
    v[5] = _mm_add_epi32(u[5], u[7]);
    v[7] = _mm_sub_epi32(u[5], u[7]);

    // stage 6
    u[0] = v[0];
    u[1] = v[1];
    u[4] = v[4];
    u[5] = v[5];

    v[0] = _mm_mullo_epi32(v[2], cospi32);
    x = _mm_mullo_epi32(v[3], cospi32);
    u[2] = _mm_add_epi32(v[0], x);
    u[2] = _mm_add_epi32(u[2], rnding);
    u[2] = _mm_srai_epi32(u[2], bit);

    u[3] = _mm_sub_epi32(v[0], x);
    u[3] = _mm_add_epi32(u[3], rnding);
    u[3] = _mm_srai_epi32(u[3], bit);

    v[0] = _mm_mullo_epi32(v[6], cospi32);
    x = _mm_mullo_epi32(v[7], cospi32);
    u[6] = _mm_add_epi32(v[0], x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = _mm_sub_epi32(v[0], x);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    // stage 7
    out[1] = u[0];
    out[3] = _mm_sub_epi32(kZero, u[4]);
    out[5] = u[6];
    out[7] = _mm_sub_epi32(kZero, u[2]);
    out[9] = u[3];
    out[11] = _mm_sub_epi32(kZero, u[7]);
    out[13] = u[5];
    out[15] = _mm_sub_epi32(kZero, u[1]);
}

static void round_shift_8x8(__m128i *in, int32_t shift) {
    round_shift_4x4(&in[0], shift);
    round_shift_4x4(&in[4], shift);
    round_shift_4x4(&in[8], shift);
    round_shift_4x4(&in[12], shift);
}

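// Reconstruct one 8-pixel row: widen the 16-bit prediction to 32 bits, add
// the two residual halves (swapped and reversed when fliplr is set), then
// pack and clamp back to the bit depth.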
static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
    int32_t fliplr, int32_t bd) {
    __m128i x0, x1;
    const __m128i zero = _mm_setzero_si128();

    x0 = _mm_unpacklo_epi16(pred, zero);
    x1 = _mm_unpackhi_epi16(pred, zero);

    if (fliplr) {
        res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
        res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
        x0 = _mm_add_epi32(res_hi, x0);
        x1 = _mm_add_epi32(res_lo, x1);
    }
    else {
        x0 = _mm_add_epi32(res_lo, x0);
        x1 = _mm_add_epi32(res_hi, x1);
    }

    x0 = _mm_packus_epi32(x0, x1);
    return highbd_clamp_epi16(x0, bd);
}

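// Round-shift the 8x8 residual, then reconstruct row by row against the
// prediction in output_r and store to output_w; flipud reverses the row
// order and fliplr is forwarded to get_recon_8x8.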
static void write_buffer_8x8(__m128i *in,
    uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    int32_t fliplr, int32_t flipud, int32_t shift, int32_t bd) {
    __m128i u0, u1, u2, u3, u4, u5, u6, u7;
    __m128i v0, v1, v2, v3, v4, v5, v6, v7;

    round_shift_8x8(in, shift);

    v0 = _mm_load_si128((__m128i const *)(output_r + 0 * stride_r));
    v1 = _mm_load_si128((__m128i const *)(output_r + 1 * stride_r));
    v2 = _mm_load_si128((__m128i const *)(output_r + 2 * stride_r));
    v3 = _mm_load_si128((__m128i const *)(output_r + 3 * stride_r));
    v4 = _mm_load_si128((__m128i const *)(output_r + 4 * stride_r));
    v5 = _mm_load_si128((__m128i const *)(output_r + 5 * stride_r));
    v6 = _mm_load_si128((__m128i const *)(output_r + 6 * stride_r));
    v7 = _mm_load_si128((__m128i const *)(output_r + 7 * stride_r));

    if (flipud) {
        u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
        u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
        u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
        u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
        u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
        u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
        u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
        u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
    }
    else {
        u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
        u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
        u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
        u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
        u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
        u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
        u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
        u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
    }

    _mm_store_si128((__m128i *)(output_w + 0 * stride_w), u0);
    _mm_store_si128((__m128i *)(output_w + 1 * stride_w), u1);
    _mm_store_si128((__m128i *)(output_w + 2 * stride_w), u2);
    _mm_store_si128((__m128i *)(output_w + 3 * stride_w), u3);
    _mm_store_si128((__m128i *)(output_w + 4 * stride_w), u4);
    _mm_store_si128((__m128i *)(output_w + 5 * stride_w), u5);
    _mm_store_si128((__m128i *)(output_w + 6 * stride_w), u6);
    _mm_store_si128((__m128i *)(output_w + 7 * stride_w), u7);
}

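// 2D inverse transform plus reconstruction for an 8x8 block. Unlike the 4x4
// kernels, idct8x8/iadst8x8 do not transpose internally, so each pass is
// bracketed by an explicit transpose_8x8, and the intermediate rounding
// shift (-shift[0]) is applied between the row and column passes.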
void eb_av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input,
    uint16_t *output_r, int32_t stride_r,
    uint16_t *output_w, int32_t stride_w,
    TxType tx_type, int32_t bd) {
    __m128i in[16], out[16];
    const int8_t *shift = eb_inv_txfm_shift_ls[TX_8X8];
    const int32_t txw_idx = get_txw_idx(TX_8X8);
    const int32_t txh_idx = get_txh_idx(TX_8X8);

    switch (tx_type) {
    case DCT_DCT:
        load_buffer_8x8(input, in);
        transpose_8x8(in, out);
        idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
        transpose_8x8(in, out);
        round_shift_8x8(out, -shift[0]);
        idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
        write_buffer_8x8(in, output_r, stride_r, output_w, stride_w,
            0, 0, -shift[1], bd);
        break;
    case DCT_ADST:
        load_buffer_8x8(input, in);
        transpose_8x8(in, out);
        iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
        transpose_8x8(in, out);
        round_shift_8x8(out, -shift[0]);
        idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
        write_buffer_8x8(in, output_r, stride_r, output_w, stride_w,
            0, 0, -shift[1], bd);
        break;
    case ADST_DCT:
        load_buffer_8x8(input, in);
        transpose_8x8(in, out);
        idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
        transpose_8x8(in, out);
        round_shift_8x8(out, -shift[0]);
        iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
        write_buffer_8x8(in, output_r, stride_r, output_w, stride_w,
            0, 0, -shift[1], bd);
        break;
    case ADST_ADST:
        load_buffer_8x8(input, in);
        transpose_8x8(in, out);
        iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
        transpose_8x8(in, out);
        round_shift_8x8(out, -shift[0]);
        iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
        write_buffer_8x8(in, output_r, stride_r, output_w, stride_w,
            0, 0, -shift[1], bd);
        break;
    case FLIPADST_DCT:
        load_buffer_8x8(input, in);
        transpose_8x8(in, out);
        idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
        transpose_8x8(in, out);
        round_shift_8x8(out, -shift[0]);
        iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
        write_buffer_8x8(in, output_r, stride_r, output_w, stride_w,
            0, 1, -shift[1], bd);
        break;
    case DCT_FLIPADST:
        load_buffer_8x8(input, in);
        transpose_8x8(in, out);
        iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
        transpose_8x8(in, out);
        round_shift_8x8(out, -shift[0]);
        idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
        write_buffer_8x8(in, output_r, stride_r, output_w, stride_w,
            1, 0, -shift[1], bd);
        break;
    case ADST_FLIPADST:
        load_buffer_8x8(input, in);
        transpose_8x8(in, out);
        iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
        transpose_8x8(in, out);
        round_shift_8x8(out, -shift[0]);
        iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
        write_buffer_8x8(in, output_r, stride_r, output_w, stride_w,
            1, 0, -shift[1], bd);
        break;
    case FLIPADST_FLIPADST:
        load_buffer_8x8(input, in);
        transpose_8x8(in, out);
        iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
        transpose_8x8(in, out);
        round_shift_8x8(out, -shift[0]);
        iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
        write_buffer_8x8(in, output_r, stride_r, output_w, stride_w,
            1, 1, -shift[1], bd);
        break;
    case FLIPADST_ADST:
        load_buffer_8x8(input, in);
        transpose_8x8(in, out);
        iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
        transpose_8x8(in, out);
        round_shift_8x8(out, -shift[0]);
        iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
        write_buffer_8x8(in, output_r, stride_r, output_w, stride_w,
            0, 1, -shift[1], bd);
        break;
    default: assert(0);
    }
}

// 16x16
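// A 16x16 tile of 32-bit coefficients fills 64 __m128i registers, four per
// row. assign_8x8_input_from_16x16 gathers one 8x8 quadrant of that layout:
// two consecutive registers per row, stepping one full 16-wide row (four
// registers) at a time.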
static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
    int32_t i;
    for (i = 0; i < 64; ++i)
        in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2)));
}

static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
    int32_t col) {
    int32_t i;
    for (i = 0; i < 16; i += 2) {
        in8x8[i] = in[col];
        in8x8[i + 1] = in[col + 1];
        col += 4;
    }
}

static void swap_addr(uint16_t **output1, uint16_t **output2) {
    uint16_t *tmp;
    tmp = *output1;
    *output1 = *output2;
    *output2 = tmp;
}

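// Write a 16x16 reconstruction as four 8x8 quadrants via write_buffer_8x8.
// fliplr/flipud swap the quadrant base pointers here; the mirroring inside
// each quadrant is then handled by write_buffer_8x8 itself.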
static void write_buffer_16x16(__m128i *in,
    uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    int32_t fliplr, int32_t flipud, int32_t shift, int32_t bd) {
    __m128i in8x8[16];
    uint16_t *leftUp_r = &output_r[0];
    uint16_t *rightUp_r = &output_r[8];
    uint16_t *leftDown_r = &output_r[8 * stride_r];
    uint16_t *rightDown_r = &output_r[8 * stride_r + 8];
    uint16_t *leftUp_w = &output_w[0];
    uint16_t *rightUp_w = &output_w[8];
    uint16_t *leftDown_w = &output_w[8 * stride_w];
    uint16_t *rightDown_w = &output_w[8 * stride_w + 8];

    if (fliplr) {
        swap_addr(&leftUp_r, &rightUp_r);
        swap_addr(&leftDown_r, &rightDown_r);
        swap_addr(&leftUp_w, &rightUp_w);
        swap_addr(&leftDown_w, &rightDown_w);
    }

    if (flipud) {
        swap_addr(&leftUp_r, &leftDown_r);
        swap_addr(&rightUp_r, &rightDown_r);
        swap_addr(&leftUp_w, &leftDown_w);
        swap_addr(&rightUp_w, &rightDown_w);
    }

    // Left-up quarter
    assign_8x8_input_from_16x16(in, in8x8, 0);
    write_buffer_8x8(in8x8,
        leftUp_r, stride_r, leftUp_w, stride_w,
        fliplr, flipud, shift, bd);

    // Right-up quarter
    assign_8x8_input_from_16x16(in, in8x8, 2);
    write_buffer_8x8(in8x8,
        rightUp_r, stride_r, rightUp_w, stride_w,
        fliplr, flipud, shift, bd);

    // Left-down quarter
    assign_8x8_input_from_16x16(in, in8x8, 32);
    write_buffer_8x8(in8x8,
        leftDown_r, stride_r, leftDown_w, stride_w,
        fliplr, flipud, shift, bd);

    // Right-down quarter
    assign_8x8_input_from_16x16(in, in8x8, 34);
    write_buffer_8x8(in8x8,
        rightDown_r, stride_r, rightDown_w, stride_w,
        fliplr, flipud, shift, bd);
}

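// 16-point inverse DCT over a 16x16 tile, four pixel columns (one __m128i
// per coefficient row) per loop iteration. Stages 2-4 mostly use the
// half_btf_sse4_1 rotation helper; the cospi32 rotations are written out
// inline.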
static void idct16x16_sse4_1(__m128i *in, __m128i *out, int32_t bit) {
    const int32_t *cospi = cospi_arr(bit);
    const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
    const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
    const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
    const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
    const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
    const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
    const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
    const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
    const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
    const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
    const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
    const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
    const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
    const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
    const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
    const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
    const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
    const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
    const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
    const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
    const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
    const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
    const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
    const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    __m128i u[16], v[16], x, y;
    int32_t col;

    for (col = 0; col < 4; ++col) {
        // stage 0
        // stage 1
        u[0] = in[0 * 4 + col];
        u[1] = in[8 * 4 + col];
        u[2] = in[4 * 4 + col];
        u[3] = in[12 * 4 + col];
        u[4] = in[2 * 4 + col];
        u[5] = in[10 * 4 + col];
        u[6] = in[6 * 4 + col];
        u[7] = in[14 * 4 + col];
        u[8] = in[1 * 4 + col];
        u[9] = in[9 * 4 + col];
        u[10] = in[5 * 4 + col];
        u[11] = in[13 * 4 + col];
        u[12] = in[3 * 4 + col];
        u[13] = in[11 * 4 + col];
        u[14] = in[7 * 4 + col];
        u[15] = in[15 * 4 + col];

        // stage 2
        v[0] = u[0];
        v[1] = u[1];
        v[2] = u[2];
        v[3] = u[3];
        v[4] = u[4];
        v[5] = u[5];
        v[6] = u[6];
        v[7] = u[7];

        v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
        v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
        v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
        v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
        v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
        v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
        v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
        v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);

        // stage 3
        u[0] = v[0];
        u[1] = v[1];
        u[2] = v[2];
        u[3] = v[3];
        u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
        u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
        u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
        u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
        u[8] = _mm_add_epi32(v[8], v[9]);
        u[9] = _mm_sub_epi32(v[8], v[9]);
        u[10] = _mm_sub_epi32(v[11], v[10]);
        u[11] = _mm_add_epi32(v[10], v[11]);
        u[12] = _mm_add_epi32(v[12], v[13]);
        u[13] = _mm_sub_epi32(v[12], v[13]);
        u[14] = _mm_sub_epi32(v[15], v[14]);
        u[15] = _mm_add_epi32(v[14], v[15]);

        // stage 4
        x = _mm_mullo_epi32(u[0], cospi32);
        y = _mm_mullo_epi32(u[1], cospi32);
        v[0] = _mm_add_epi32(x, y);
        v[0] = _mm_add_epi32(v[0], rnding);
        v[0] = _mm_srai_epi32(v[0], bit);

        v[1] = _mm_sub_epi32(x, y);
        v[1] = _mm_add_epi32(v[1], rnding);
        v[1] = _mm_srai_epi32(v[1], bit);

        v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
        v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
        v[4] = _mm_add_epi32(u[4], u[5]);
        v[5] = _mm_sub_epi32(u[4], u[5]);
        v[6] = _mm_sub_epi32(u[7], u[6]);
        v[7] = _mm_add_epi32(u[6], u[7]);
        v[8] = u[8];
        v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
        v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
        v[11] = u[11];
        v[12] = u[12];
        v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
        v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
        v[15] = u[15];

        // stage 5
        u[0] = _mm_add_epi32(v[0], v[3]);
        u[1] = _mm_add_epi32(v[1], v[2]);
        u[2] = _mm_sub_epi32(v[1], v[2]);
        u[3] = _mm_sub_epi32(v[0], v[3]);
        u[4] = v[4];

        x = _mm_mullo_epi32(v[5], cospi32);
        y = _mm_mullo_epi32(v[6], cospi32);
        u[5] = _mm_sub_epi32(y, x);
        u[5] = _mm_add_epi32(u[5], rnding);
        u[5] = _mm_srai_epi32(u[5], bit);

        u[6] = _mm_add_epi32(y, x);
        u[6] = _mm_add_epi32(u[6], rnding);
        u[6] = _mm_srai_epi32(u[6], bit);

        u[7] = v[7];
        u[8] = _mm_add_epi32(v[8], v[11]);
        u[9] = _mm_add_epi32(v[9], v[10]);
        u[10] = _mm_sub_epi32(v[9], v[10]);
        u[11] = _mm_sub_epi32(v[8], v[11]);
        u[12] = _mm_sub_epi32(v[15], v[12]);
        u[13] = _mm_sub_epi32(v[14], v[13]);
        u[14] = _mm_add_epi32(v[13], v[14]);
        u[15] = _mm_add_epi32(v[12], v[15]);

        // stage 6
        v[0] = _mm_add_epi32(u[0], u[7]);
        v[1] = _mm_add_epi32(u[1], u[6]);
        v[2] = _mm_add_epi32(u[2], u[5]);
        v[3] = _mm_add_epi32(u[3], u[4]);
        v[4] = _mm_sub_epi32(u[3], u[4]);
        v[5] = _mm_sub_epi32(u[2], u[5]);
        v[6] = _mm_sub_epi32(u[1], u[6]);
        v[7] = _mm_sub_epi32(u[0], u[7]);
        v[8] = u[8];
        v[9] = u[9];

        x = _mm_mullo_epi32(u[10], cospi32);
        y = _mm_mullo_epi32(u[13], cospi32);
        v[10] = _mm_sub_epi32(y, x);
        v[10] = _mm_add_epi32(v[10], rnding);
        v[10] = _mm_srai_epi32(v[10], bit);

        v[13] = _mm_add_epi32(x, y);
        v[13] = _mm_add_epi32(v[13], rnding);
        v[13] = _mm_srai_epi32(v[13], bit);

        x = _mm_mullo_epi32(u[11], cospi32);
        y = _mm_mullo_epi32(u[12], cospi32);
        v[11] = _mm_sub_epi32(y, x);
        v[11] = _mm_add_epi32(v[11], rnding);
        v[11] = _mm_srai_epi32(v[11], bit);

        v[12] = _mm_add_epi32(x, y);
        v[12] = _mm_add_epi32(v[12], rnding);
        v[12] = _mm_srai_epi32(v[12], bit);

        v[14] = u[14];
        v[15] = u[15];

        // stage 7
        out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]);
        out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]);
        out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]);
        out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]);
        out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]);
        out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]);
        out[6 * 4 + col] = _mm_add_epi32(v[6], v[9]);
        out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]);
        out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]);
        out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]);
        out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]);
        out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]);
        out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]);
        out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]);
        out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]);
        out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]);
    }
}

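// 16-point inverse ADST over a 16x16 tile, processed col_num = 4 pixel
// columns at a time. Even-numbered stages are rounded cospi rotations
// written out inline; odd-numbered stages are add/sub butterflies.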
1267 0 : static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int32_t bit) {
1268 0 : const int32_t *cospi = cospi_arr(bit);
1269 0 : const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
1270 0 : const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
1271 0 : const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
1272 0 : const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
1273 0 : const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
1274 0 : const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
1275 0 : const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
1276 0 : const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
1277 0 : const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
1278 0 : const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
1279 0 : const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
1280 0 : const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
1281 0 : const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
1282 0 : const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
1283 0 : const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
1284 0 : const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
1285 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1286 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1287 0 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1288 0 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1289 0 : const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
1290 0 : const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
1291 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1292 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1293 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1294 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1295 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1296 : __m128i u[16], v[16], x, y;
1297 0 : const int32_t col_num = 4;
1298 : int32_t col;
1299 :
1300 : // Calculate the column 0, 1, 2, 3
1301 0 : for (col = 0; col < col_num; ++col) {
1302 : // stage 0
1303 : // stage 1
1304 : // stage 2
1305 0 : v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
1306 0 : x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
1307 0 : v[0] = _mm_add_epi32(v[0], x);
1308 0 : v[0] = _mm_add_epi32(v[0], rnding);
1309 0 : v[0] = _mm_srai_epi32(v[0], bit);
1310 :
1311 0 : v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
1312 0 : x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
1313 0 : v[1] = _mm_sub_epi32(v[1], x);
1314 0 : v[1] = _mm_add_epi32(v[1], rnding);
1315 0 : v[1] = _mm_srai_epi32(v[1], bit);
1316 :
1317 0 : v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
1318 0 : x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
1319 0 : v[2] = _mm_add_epi32(v[2], x);
1320 0 : v[2] = _mm_add_epi32(v[2], rnding);
1321 0 : v[2] = _mm_srai_epi32(v[2], bit);
1322 :
1323 0 : v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
1324 0 : x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
1325 0 : v[3] = _mm_sub_epi32(v[3], x);
1326 0 : v[3] = _mm_add_epi32(v[3], rnding);
1327 0 : v[3] = _mm_srai_epi32(v[3], bit);
1328 :
1329 0 : v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
1330 0 : x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
1331 0 : v[4] = _mm_add_epi32(v[4], x);
1332 0 : v[4] = _mm_add_epi32(v[4], rnding);
1333 0 : v[4] = _mm_srai_epi32(v[4], bit);
1334 :
1335 0 : v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
1336 0 : x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
1337 0 : v[5] = _mm_sub_epi32(v[5], x);
1338 0 : v[5] = _mm_add_epi32(v[5], rnding);
1339 0 : v[5] = _mm_srai_epi32(v[5], bit);
1340 :
1341 0 : v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
1342 0 : x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
1343 0 : v[6] = _mm_add_epi32(v[6], x);
1344 0 : v[6] = _mm_add_epi32(v[6], rnding);
1345 0 : v[6] = _mm_srai_epi32(v[6], bit);
1346 :
1347 0 : v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
1348 0 : x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
1349 0 : v[7] = _mm_sub_epi32(v[7], x);
1350 0 : v[7] = _mm_add_epi32(v[7], rnding);
1351 0 : v[7] = _mm_srai_epi32(v[7], bit);
1352 :
1353 0 : v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
1354 0 : x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
1355 0 : v[8] = _mm_add_epi32(v[8], x);
1356 0 : v[8] = _mm_add_epi32(v[8], rnding);
1357 0 : v[8] = _mm_srai_epi32(v[8], bit);
1358 :
1359 0 : v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
1360 0 : x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
1361 0 : v[9] = _mm_sub_epi32(v[9], x);
1362 0 : v[9] = _mm_add_epi32(v[9], rnding);
1363 0 : v[9] = _mm_srai_epi32(v[9], bit);
1364 :
1365 0 : v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
1366 0 : x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
1367 0 : v[10] = _mm_add_epi32(v[10], x);
1368 0 : v[10] = _mm_add_epi32(v[10], rnding);
1369 0 : v[10] = _mm_srai_epi32(v[10], bit);
1370 :
1371 0 : v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
1372 0 : x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
1373 0 : v[11] = _mm_sub_epi32(v[11], x);
1374 0 : v[11] = _mm_add_epi32(v[11], rnding);
1375 0 : v[11] = _mm_srai_epi32(v[11], bit);
1376 :
1377 0 : v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
1378 0 : x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
1379 0 : v[12] = _mm_add_epi32(v[12], x);
1380 0 : v[12] = _mm_add_epi32(v[12], rnding);
1381 0 : v[12] = _mm_srai_epi32(v[12], bit);
1382 :
1383 0 : v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
1384 0 : x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
1385 0 : v[13] = _mm_sub_epi32(v[13], x);
1386 0 : v[13] = _mm_add_epi32(v[13], rnding);
1387 0 : v[13] = _mm_srai_epi32(v[13], bit);
1388 :
1389 0 : v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
1390 0 : x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
1391 0 : v[14] = _mm_add_epi32(v[14], x);
1392 0 : v[14] = _mm_add_epi32(v[14], rnding);
1393 0 : v[14] = _mm_srai_epi32(v[14], bit);
1394 :
1395 0 : v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
1396 0 : x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
1397 0 : v[15] = _mm_sub_epi32(v[15], x);
1398 0 : v[15] = _mm_add_epi32(v[15], rnding);
1399 0 : v[15] = _mm_srai_epi32(v[15], bit);
1400 :
1401 : // stage 3
1402 0 : u[0] = _mm_add_epi32(v[0], v[8]);
1403 0 : u[8] = _mm_sub_epi32(v[0], v[8]);
1404 0 : u[1] = _mm_add_epi32(v[1], v[9]);
1405 0 : u[9] = _mm_sub_epi32(v[1], v[9]);
1406 0 : u[2] = _mm_add_epi32(v[2], v[10]);
1407 0 : u[10] = _mm_sub_epi32(v[2], v[10]);
1408 0 : u[3] = _mm_add_epi32(v[3], v[11]);
1409 0 : u[11] = _mm_sub_epi32(v[3], v[11]);
1410 0 : u[4] = _mm_add_epi32(v[4], v[12]);
1411 0 : u[12] = _mm_sub_epi32(v[4], v[12]);
1412 0 : u[5] = _mm_add_epi32(v[5], v[13]);
1413 0 : u[13] = _mm_sub_epi32(v[5], v[13]);
1414 0 : u[6] = _mm_add_epi32(v[6], v[14]);
1415 0 : u[14] = _mm_sub_epi32(v[6], v[14]);
1416 0 : u[7] = _mm_add_epi32(v[7], v[15]);
1417 0 : u[15] = _mm_sub_epi32(v[7], v[15]);
1418 :
1419 : // stage 4
1420 0 : v[0] = u[0];
1421 0 : v[1] = u[1];
1422 0 : v[2] = u[2];
1423 0 : v[3] = u[3];
1424 0 : v[4] = u[4];
1425 0 : v[5] = u[5];
1426 0 : v[6] = u[6];
1427 0 : v[7] = u[7];
1428 :
1429 0 : v[8] = _mm_mullo_epi32(u[8], cospi8);
1430 0 : x = _mm_mullo_epi32(u[9], cospi56);
1431 0 : v[8] = _mm_add_epi32(v[8], x);
1432 0 : v[8] = _mm_add_epi32(v[8], rnding);
1433 0 : v[8] = _mm_srai_epi32(v[8], bit);
1434 :
1435 0 : v[9] = _mm_mullo_epi32(u[8], cospi56);
1436 0 : x = _mm_mullo_epi32(u[9], cospi8);
1437 0 : v[9] = _mm_sub_epi32(v[9], x);
1438 0 : v[9] = _mm_add_epi32(v[9], rnding);
1439 0 : v[9] = _mm_srai_epi32(v[9], bit);
1440 :
1441 0 : v[10] = _mm_mullo_epi32(u[10], cospi40);
1442 0 : x = _mm_mullo_epi32(u[11], cospi24);
1443 0 : v[10] = _mm_add_epi32(v[10], x);
1444 0 : v[10] = _mm_add_epi32(v[10], rnding);
1445 0 : v[10] = _mm_srai_epi32(v[10], bit);
1446 :
1447 0 : v[11] = _mm_mullo_epi32(u[10], cospi24);
1448 0 : x = _mm_mullo_epi32(u[11], cospi40);
1449 0 : v[11] = _mm_sub_epi32(v[11], x);
1450 0 : v[11] = _mm_add_epi32(v[11], rnding);
1451 0 : v[11] = _mm_srai_epi32(v[11], bit);
1452 :
1453 0 : v[12] = _mm_mullo_epi32(u[12], cospim56);
1454 0 : x = _mm_mullo_epi32(u[13], cospi8);
1455 0 : v[12] = _mm_add_epi32(v[12], x);
1456 0 : v[12] = _mm_add_epi32(v[12], rnding);
1457 0 : v[12] = _mm_srai_epi32(v[12], bit);
1458 :
1459 0 : v[13] = _mm_mullo_epi32(u[12], cospi8);
1460 0 : x = _mm_mullo_epi32(u[13], cospim56);
1461 0 : v[13] = _mm_sub_epi32(v[13], x);
1462 0 : v[13] = _mm_add_epi32(v[13], rnding);
1463 0 : v[13] = _mm_srai_epi32(v[13], bit);
1464 :
1465 0 : v[14] = _mm_mullo_epi32(u[14], cospim24);
1466 0 : x = _mm_mullo_epi32(u[15], cospi40);
1467 0 : v[14] = _mm_add_epi32(v[14], x);
1468 0 : v[14] = _mm_add_epi32(v[14], rnding);
1469 0 : v[14] = _mm_srai_epi32(v[14], bit);
1470 :
1471 0 : v[15] = _mm_mullo_epi32(u[14], cospi40);
1472 0 : x = _mm_mullo_epi32(u[15], cospim24);
1473 0 : v[15] = _mm_sub_epi32(v[15], x);
1474 0 : v[15] = _mm_add_epi32(v[15], rnding);
1475 0 : v[15] = _mm_srai_epi32(v[15], bit);
1476 :
1477 : // stage 5
1478 0 : u[0] = _mm_add_epi32(v[0], v[4]);
1479 0 : u[4] = _mm_sub_epi32(v[0], v[4]);
1480 0 : u[1] = _mm_add_epi32(v[1], v[5]);
1481 0 : u[5] = _mm_sub_epi32(v[1], v[5]);
1482 0 : u[2] = _mm_add_epi32(v[2], v[6]);
1483 0 : u[6] = _mm_sub_epi32(v[2], v[6]);
1484 0 : u[3] = _mm_add_epi32(v[3], v[7]);
1485 0 : u[7] = _mm_sub_epi32(v[3], v[7]);
1486 0 : u[8] = _mm_add_epi32(v[8], v[12]);
1487 0 : u[12] = _mm_sub_epi32(v[8], v[12]);
1488 0 : u[9] = _mm_add_epi32(v[9], v[13]);
1489 0 : u[13] = _mm_sub_epi32(v[9], v[13]);
1490 0 : u[10] = _mm_add_epi32(v[10], v[14]);
1491 0 : u[14] = _mm_sub_epi32(v[10], v[14]);
1492 0 : u[11] = _mm_add_epi32(v[11], v[15]);
1493 0 : u[15] = _mm_sub_epi32(v[11], v[15]);
1494 :
1495 : // stage 6
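     : // u[0..3] and u[8..11] pass through; the remaining pairs are rotated with
     : // the cospi16/cospi48 twiddles.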
1496 0 : v[0] = u[0];
1497 0 : v[1] = u[1];
1498 0 : v[2] = u[2];
1499 0 : v[3] = u[3];
1500 :
1501 0 : v[4] = _mm_mullo_epi32(u[4], cospi16);
1502 0 : x = _mm_mullo_epi32(u[5], cospi48);
1503 0 : v[4] = _mm_add_epi32(v[4], x);
1504 0 : v[4] = _mm_add_epi32(v[4], rnding);
1505 0 : v[4] = _mm_srai_epi32(v[4], bit);
1506 :
1507 0 : v[5] = _mm_mullo_epi32(u[4], cospi48);
1508 0 : x = _mm_mullo_epi32(u[5], cospi16);
1509 0 : v[5] = _mm_sub_epi32(v[5], x);
1510 0 : v[5] = _mm_add_epi32(v[5], rnding);
1511 0 : v[5] = _mm_srai_epi32(v[5], bit);
1512 :
1513 0 : v[6] = _mm_mullo_epi32(u[6], cospim48);
1514 0 : x = _mm_mullo_epi32(u[7], cospi16);
1515 0 : v[6] = _mm_add_epi32(v[6], x);
1516 0 : v[6] = _mm_add_epi32(v[6], rnding);
1517 0 : v[6] = _mm_srai_epi32(v[6], bit);
1518 :
1519 0 : v[7] = _mm_mullo_epi32(u[6], cospi16);
1520 0 : x = _mm_mullo_epi32(u[7], cospim48);
1521 0 : v[7] = _mm_sub_epi32(v[7], x);
1522 0 : v[7] = _mm_add_epi32(v[7], rnding);
1523 0 : v[7] = _mm_srai_epi32(v[7], bit);
1524 :
1525 0 : v[8] = u[8];
1526 0 : v[9] = u[9];
1527 0 : v[10] = u[10];
1528 0 : v[11] = u[11];
1529 :
1530 0 : v[12] = _mm_mullo_epi32(u[12], cospi16);
1531 0 : x = _mm_mullo_epi32(u[13], cospi48);
1532 0 : v[12] = _mm_add_epi32(v[12], x);
1533 0 : v[12] = _mm_add_epi32(v[12], rnding);
1534 0 : v[12] = _mm_srai_epi32(v[12], bit);
1535 :
1536 0 : v[13] = _mm_mullo_epi32(u[12], cospi48);
1537 0 : x = _mm_mullo_epi32(u[13], cospi16);
1538 0 : v[13] = _mm_sub_epi32(v[13], x);
1539 0 : v[13] = _mm_add_epi32(v[13], rnding);
1540 0 : v[13] = _mm_srai_epi32(v[13], bit);
1541 :
1542 0 : v[14] = _mm_mullo_epi32(u[14], cospim48);
1543 0 : x = _mm_mullo_epi32(u[15], cospi16);
1544 0 : v[14] = _mm_add_epi32(v[14], x);
1545 0 : v[14] = _mm_add_epi32(v[14], rnding);
1546 0 : v[14] = _mm_srai_epi32(v[14], bit);
1547 :
1548 0 : v[15] = _mm_mullo_epi32(u[14], cospi16);
1549 0 : x = _mm_mullo_epi32(u[15], cospim48);
1550 0 : v[15] = _mm_sub_epi32(v[15], x);
1551 0 : v[15] = _mm_add_epi32(v[15], rnding);
1552 0 : v[15] = _mm_srai_epi32(v[15], bit);
1553 :
1554 : // stage 7
1555 0 : u[0] = _mm_add_epi32(v[0], v[2]);
1556 0 : u[2] = _mm_sub_epi32(v[0], v[2]);
1557 0 : u[1] = _mm_add_epi32(v[1], v[3]);
1558 0 : u[3] = _mm_sub_epi32(v[1], v[3]);
1559 0 : u[4] = _mm_add_epi32(v[4], v[6]);
1560 0 : u[6] = _mm_sub_epi32(v[4], v[6]);
1561 0 : u[5] = _mm_add_epi32(v[5], v[7]);
1562 0 : u[7] = _mm_sub_epi32(v[5], v[7]);
1563 0 : u[8] = _mm_add_epi32(v[8], v[10]);
1564 0 : u[10] = _mm_sub_epi32(v[8], v[10]);
1565 0 : u[9] = _mm_add_epi32(v[9], v[11]);
1566 0 : u[11] = _mm_sub_epi32(v[9], v[11]);
1567 0 : u[12] = _mm_add_epi32(v[12], v[14]);
1568 0 : u[14] = _mm_sub_epi32(v[12], v[14]);
1569 0 : u[13] = _mm_add_epi32(v[13], v[15]);
1570 0 : u[15] = _mm_sub_epi32(v[13], v[15]);
1571 :
1572 : // stage 8
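     : // cospi32 butterflies, i.e. 45-degree rotations:
     : // v = ((x + y) * cospi32 + rnding) >> bit and ((x - y) * cospi32 + rnding) >> bit.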
1573 0 : v[0] = u[0];
1574 0 : v[1] = u[1];
1575 :
1576 0 : y = _mm_mullo_epi32(u[2], cospi32);
1577 0 : x = _mm_mullo_epi32(u[3], cospi32);
1578 0 : v[2] = _mm_add_epi32(y, x);
1579 0 : v[2] = _mm_add_epi32(v[2], rnding);
1580 0 : v[2] = _mm_srai_epi32(v[2], bit);
1581 :
1582 0 : v[3] = _mm_sub_epi32(y, x);
1583 0 : v[3] = _mm_add_epi32(v[3], rnding);
1584 0 : v[3] = _mm_srai_epi32(v[3], bit);
1585 :
1586 0 : v[4] = u[4];
1587 0 : v[5] = u[5];
1588 :
1589 0 : y = _mm_mullo_epi32(u[6], cospi32);
1590 0 : x = _mm_mullo_epi32(u[7], cospi32);
1591 0 : v[6] = _mm_add_epi32(y, x);
1592 0 : v[6] = _mm_add_epi32(v[6], rnding);
1593 0 : v[6] = _mm_srai_epi32(v[6], bit);
1594 :
1595 0 : v[7] = _mm_sub_epi32(y, x);
1596 0 : v[7] = _mm_add_epi32(v[7], rnding);
1597 0 : v[7] = _mm_srai_epi32(v[7], bit);
1598 :
1599 0 : v[8] = u[8];
1600 0 : v[9] = u[9];
1601 :
1602 0 : y = _mm_mullo_epi32(u[10], cospi32);
1603 0 : x = _mm_mullo_epi32(u[11], cospi32);
1604 0 : v[10] = _mm_add_epi32(y, x);
1605 0 : v[10] = _mm_add_epi32(v[10], rnding);
1606 0 : v[10] = _mm_srai_epi32(v[10], bit);
1607 :
1608 0 : v[11] = _mm_sub_epi32(y, x);
1609 0 : v[11] = _mm_add_epi32(v[11], rnding);
1610 0 : v[11] = _mm_srai_epi32(v[11], bit);
1611 :
1612 0 : v[12] = u[12];
1613 0 : v[13] = u[13];
1614 :
1615 0 : y = _mm_mullo_epi32(u[14], cospi32);
1616 0 : x = _mm_mullo_epi32(u[15], cospi32);
1617 0 : v[14] = _mm_add_epi32(y, x);
1618 0 : v[14] = _mm_add_epi32(v[14], rnding);
1619 0 : v[14] = _mm_srai_epi32(v[14], bit);
1620 :
1621 0 : v[15] = _mm_sub_epi32(y, x);
1622 0 : v[15] = _mm_add_epi32(v[15], rnding);
1623 0 : v[15] = _mm_srai_epi32(v[15], bit);
1624 :
1625 : // stage 9
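     : // Final IADST16 output permutation; every odd output row is negated (0 - v[i]).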
1626 0 : out[0 * col_num + col] = v[0];
1627 0 : out[1 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
1628 0 : out[2 * col_num + col] = v[12];
1629 0 : out[3 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
1630 0 : out[4 * col_num + col] = v[6];
1631 0 : out[5 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
1632 0 : out[6 * col_num + col] = v[10];
1633 0 : out[7 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
1634 0 : out[8 * col_num + col] = v[3];
1635 0 : out[9 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
1636 0 : out[10 * col_num + col] = v[15];
1637 0 : out[11 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
1638 0 : out[12 * col_num + col] = v[5];
1639 0 : out[13 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
1640 0 : out[14 * col_num + col] = v[9];
1641 0 : out[15 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
1642 : }
1643 0 : }
1644 :
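     : // A 16x16 block of int32 spans 64 __m128i registers; round-shift them in
     : // four groups of 16 (each group is one 8x8 tile's worth of values).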
1645 0 : static void round_shift_16x16(__m128i *in, int32_t shift) {
1646 0 : round_shift_8x8(&in[0], shift);
1647 0 : round_shift_8x8(&in[16], shift);
1648 0 : round_shift_8x8(&in[32], shift);
1649 0 : round_shift_8x8(&in[48], shift);
1650 0 : }
1651 :
1652 0 : void eb_av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *input,
1653 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
1654 : TxType tx_type, int32_t bd) {
1655 : __m128i in[64], out[64];
1656 0 : const int8_t *shift = eb_inv_txfm_shift_ls[TX_16X16];
1657 0 : const int32_t txw_idx = get_txw_idx(TX_16X16);
1658 0 : const int32_t txh_idx = get_txh_idx(TX_16X16);
1659 :
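     : // Every case follows the same flow: transpose, row transform with the row
     : // cos-bit, intermediate round-shift by -shift[0], transpose back, column
     : // transform, then write out with the FLIPADST flip flags and -shift[1].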
1660 0 : switch (tx_type) {
1661 0 : case DCT_DCT:
1662 0 : load_buffer_16x16(input, in);
1663 0 : transpose_16x16(in, out);
1664 0 : idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1665 0 : round_shift_16x16(in, -shift[0]);
1666 0 : transpose_16x16(in, out);
1667 0 : idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
1668 0 : write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
1669 0 : 0, 0, -shift[1], bd);
1670 0 : break;
1671 0 : case DCT_ADST:
1672 0 : load_buffer_16x16(input, in);
1673 0 : transpose_16x16(in, out);
1674 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1675 0 : round_shift_16x16(in, -shift[0]);
1676 0 : transpose_16x16(in, out);
1677 0 : idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
1678 0 : write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
1679 0 : 0, 0, -shift[1], bd);
1680 0 : break;
1681 0 : case ADST_DCT:
1682 0 : load_buffer_16x16(input, in);
1683 0 : transpose_16x16(in, out);
1684 0 : idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1685 0 : round_shift_16x16(in, -shift[0]);
1686 0 : transpose_16x16(in, out);
1687 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
1688 0 : write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
1689 0 : 0, 0, -shift[1], bd);
1690 0 : break;
1691 0 : case ADST_ADST:
1692 0 : load_buffer_16x16(input, in);
1693 0 : transpose_16x16(in, out);
1694 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1695 0 : round_shift_16x16(in, -shift[0]);
1696 0 : transpose_16x16(in, out);
1697 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
1698 0 : write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
1699 0 : 0, 0, -shift[1], bd);
1700 0 : break;
1701 0 : case FLIPADST_DCT:
1702 0 : load_buffer_16x16(input, in);
1703 0 : transpose_16x16(in, out);
1704 0 : idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1705 0 : round_shift_16x16(in, -shift[0]);
1706 0 : transpose_16x16(in, out);
1707 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
1708 0 : write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
1709 0 : 0, 1, -shift[1], bd);
1710 0 : break;
1711 0 : case DCT_FLIPADST:
1712 0 : load_buffer_16x16(input, in);
1713 0 : transpose_16x16(in, out);
1714 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1715 0 : round_shift_16x16(in, -shift[0]);
1716 0 : transpose_16x16(in, out);
1717 0 : idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
1718 0 : write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
1719 0 : 1, 0, -shift[1], bd);
1720 0 : break;
1721 0 : case ADST_FLIPADST:
1722 0 : load_buffer_16x16(input, in);
1723 0 : transpose_16x16(in, out);
1724 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1725 0 : round_shift_16x16(in, -shift[0]);
1726 0 : transpose_16x16(in, out);
1727 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
1728 0 : write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
1729 0 : 1, 0, -shift[1], bd);
1730 0 : break;
1731 0 : case FLIPADST_FLIPADST:
1732 0 : load_buffer_16x16(input, in);
1733 0 : transpose_16x16(in, out);
1734 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1735 0 : round_shift_16x16(in, -shift[0]);
1736 0 : transpose_16x16(in, out);
1737 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
1738 0 : write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
1739 0 : 1, 1, -shift[1], bd);
1740 0 : break;
1741 0 : case FLIPADST_ADST:
1742 0 : load_buffer_16x16(input, in);
1743 0 : transpose_16x16(in, out);
1744 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1745 0 : round_shift_16x16(in, -shift[0]);
1746 0 : transpose_16x16(in, out);
1747 0 : iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
1748 0 : write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
1749 0 : 0, 1, -shift[1], bd);
1750 0 : break;
1751 0 : default: assert(0);
1752 : }
1753 0 : }
1754 :
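     : // For 64-point transforms only the top-left 32x32 coefficients can be
     : // nonzero, so load that quadrant and zero the rest of the 64x64 buffer.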
1755 0 : static void load_buffer_64x64_lower_32x32(const int32_t *coeff, __m128i *in) {
1756 : int32_t i, j;
1757 :
1758 0 : __m128i zero = _mm_setzero_si128();
1759 :
1760 0 : for (i = 0; i < 32; ++i) {
1761 0 : for (j = 0; j < 8; ++j) {
1762 0 : in[16 * i + j] =
1763 0 : _mm_loadu_si128((const __m128i *)(coeff + 32 * i + 4 * j));
1764 0 : in[16 * i + j + 8] = zero;
1765 : }
1766 : }
1767 :
1768 0 : for (i = 0; i < 512; ++i) in[512 + i] = zero;
1769 0 : }
1770 :
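     : // Transpose in 4x4 tiles; with do_cols == 0 only the top-left 32x32 (the
     : // nonzero region) is transposed, otherwise all 64 rows of the 32 live columns.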
1771 0 : static void transpose_64x64(__m128i *in, __m128i *out, int32_t do_cols) {
1772 : int32_t i, j;
1773 0 : for (i = 0; i < (do_cols ? 16 : 8); ++i) {
1774 0 : for (j = 0; j < 8; ++j) {
1775 0 : TRANSPOSE_4X4(in[(4 * i + 0) * 16 + j], in[(4 * i + 1) * 16 + j],
1776 : in[(4 * i + 2) * 16 + j], in[(4 * i + 3) * 16 + j],
1777 : out[(4 * j + 0) * 16 + i], out[(4 * j + 1) * 16 + i],
1778 : out[(4 * j + 2) * 16 + i], out[(4 * j + 3) * 16 + i]);
1779 : }
1780 : }
1781 0 : }
1782 :
1783 0 : static void round_shift_32x32(__m128i *in, int32_t shift) {
1784 0 : round_shift_16x16(&in[0], shift);
1785 0 : round_shift_16x16(&in[64], shift);
1786 0 : round_shift_16x16(&in[128], shift);
1787 0 : round_shift_16x16(&in[192], shift);
1788 0 : }
1789 :
1790 0 : static void round_shift_64x64(__m128i *in, int32_t shift) {
1791 0 : round_shift_32x32(&in[0], shift);
1792 0 : round_shift_32x32(&in[256], shift);
1793 0 : }
1794 :
1795 0 : static void assign_16x16_input_from_32x32(const __m128i *in, __m128i *in16x16,
1796 : int32_t col) {
1797 : int32_t i;
1798 0 : for (i = 0; i < 16 * 16 / 4; i += 4) {
1799 0 : in16x16[i] = in[col];
1800 0 : in16x16[i + 1] = in[col + 1];
1801 0 : in16x16[i + 2] = in[col + 2];
1802 0 : in16x16[i + 3] = in[col + 3];
1803 0 : col += 8;
1804 : }
1805 0 : }
1806 :
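     : // Write a 32x32 result as four 16x16 quarters; flips swap the quarter base
     : // pointers first, and write_buffer_16x16 then flips within each quarter.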
1807 0 : static void write_buffer_32x32(__m128i *in,
1808 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
1809 : int32_t fliplr, int32_t flipud, int32_t shift, int32_t bd) {
1810 : __m128i in16x16[16 * 16 / 4];
1811 0 : uint16_t *leftUp_r = &output_r[0];
1812 0 : uint16_t *rightUp_r = &output_r[16];
1813 0 : uint16_t *leftDown_r = &output_r[16 * stride_r];
1814 0 : uint16_t *rightDown_r = &output_r[16 * stride_r + 16];
1815 0 : uint16_t *leftUp_w = &output_w[0];
1816 0 : uint16_t *rightUp_w = &output_w[16];
1817 0 : uint16_t *leftDown_w = &output_w[16 * stride_w];
1818 0 : uint16_t *rightDown_w = &output_w[16 * stride_w + 16];
1819 :
1820 0 : if (fliplr) {
1821 0 : swap_addr(&leftUp_r, &rightUp_r);
1822 0 : swap_addr(&leftDown_r, &rightDown_r);
1823 0 : swap_addr(&leftUp_w, &rightUp_w);
1824 0 : swap_addr(&leftDown_w, &rightDown_w);
1825 : }
1826 :
1827 0 : if (flipud) {
1828 0 : swap_addr(&leftUp_r, &leftDown_r);
1829 0 : swap_addr(&rightUp_r, &rightDown_r);
1830 0 : swap_addr(&leftUp_w, &leftDown_w);
1831 0 : swap_addr(&rightUp_w, &rightDown_w);
1832 : }
1833 :
1834 : // Left-up quarter
1835 0 : assign_16x16_input_from_32x32(in, in16x16, 0);
1836 0 : write_buffer_16x16(in16x16, leftUp_r, stride_r, leftUp_w, stride_w,
1837 : fliplr, flipud, shift, bd);
1838 :
1839 : // Right-up quarter
1840 0 : assign_16x16_input_from_32x32(in, in16x16, 32 / 2 / 4);
1841 0 : write_buffer_16x16(in16x16, rightUp_r, stride_r, rightUp_w, stride_w,
1842 : fliplr, flipud, shift, bd);
1843 :
1844 : // Left-down quarter
1845 0 : assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4);
1846 0 : write_buffer_16x16(in16x16, leftDown_r, stride_r, leftDown_w, stride_w,
1847 : fliplr, flipud, shift, bd);
1848 :
1849 : // Right-down quarter
1850 0 : assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4 + 32 / 2 / 4);
1851 0 : write_buffer_16x16(in16x16, rightDown_r, stride_r, rightDown_w, stride_w,
1852 : fliplr, flipud, shift, bd);
1853 0 : }
1854 :
1855 0 : static void assign_32x32_input_from_64x64(const __m128i *in, __m128i *in32x32,
1856 : int32_t col) {
1857 : int32_t i;
1858 0 : for (i = 0; i < 32 * 32 / 4; i += 8) {
1859 0 : in32x32[i] = in[col];
1860 0 : in32x32[i + 1] = in[col + 1];
1861 0 : in32x32[i + 2] = in[col + 2];
1862 0 : in32x32[i + 3] = in[col + 3];
1863 0 : in32x32[i + 4] = in[col + 4];
1864 0 : in32x32[i + 5] = in[col + 5];
1865 0 : in32x32[i + 6] = in[col + 6];
1866 0 : in32x32[i + 7] = in[col + 7];
1867 0 : col += 16;
1868 : }
1869 0 : }
1870 :
1871 0 : static void write_buffer_64x64(__m128i *in,
1872 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
1873 : int32_t fliplr, int32_t flipud, int32_t shift, int32_t bd) {
1874 : __m128i in32x32[32 * 32 / 4];
1875 0 : uint16_t *leftUp_r = &output_r[0];
1876 0 : uint16_t *rightUp_r = &output_r[32];
1877 0 : uint16_t *leftDown_r = &output_r[32 * stride_r];
1878 0 : uint16_t *rightDown_r = &output_r[32 * stride_r + 32];
1879 0 : uint16_t *leftUp_w = &output_w[0];
1880 0 : uint16_t *rightUp_w = &output_w[32];
1881 0 : uint16_t *leftDown_w = &output_w[32 * stride_w];
1882 0 : uint16_t *rightDown_w = &output_w[32 * stride_w + 32];
1883 :
1884 0 : if (fliplr) {
1885 0 : swap_addr(&leftUp_r, &rightUp_r);
1886 0 : swap_addr(&leftDown_r, &rightDown_r);
1887 0 : swap_addr(&leftUp_w, &rightUp_w);
1888 0 : swap_addr(&leftDown_w, &rightDown_w);
1889 : }
1890 :
1891 0 : if (flipud) {
1892 0 : swap_addr(&leftUp_r, &leftDown_r);
1893 0 : swap_addr(&rightUp_r, &rightDown_r);
1894 0 : swap_addr(&leftUp_w, &leftDown_w);
1895 0 : swap_addr(&rightUp_w, &rightDown_w);
1896 : }
1897 :
1898 : // Left-up quarter
1899 0 : assign_32x32_input_from_64x64(in, in32x32, 0);
1900 0 : write_buffer_32x32(in32x32, leftUp_r, stride_r, leftUp_w, stride_w,
1901 : fliplr, flipud, shift, bd);
1902 :
1903 : // Right-up quarter
1904 0 : assign_32x32_input_from_64x64(in, in32x32, 64 / 2 / 4);
1905 0 : write_buffer_32x32(in32x32, rightUp_r, stride_r, rightUp_w, stride_w,
1906 : fliplr, flipud, shift, bd);
1907 :
1908 : // Left-down quarter
1909 0 : assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4);
1910 0 : write_buffer_32x32(in32x32, leftDown_r, stride_r, leftDown_w, stride_w,
1911 : fliplr, flipud, shift, bd);
1912 :
1913 : // Right-down quarter
1914 0 : assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4 + 64 / 2 / 4);
1915 0 : write_buffer_32x32(in32x32, rightDown_r, stride_r, rightDown_w, stride_w,
1916 : fliplr, flipud, shift, bd);
1917 0 : }
1918 :
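     : // 64-point IDCT, stages 1-11, over groups of 4 columns: 8 groups for the
     : // row pass (only 32 columns are nonzero), 16 for the column pass.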
1919 0 : static void idct64x64_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
1920 : int32_t bd) {
1921 : int32_t i, j;
1922 0 : const int32_t *cospi = cospi_arr(bit);
1923 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1924 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1925 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1926 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1927 : int32_t col;
1928 :
1929 0 : const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
1930 0 : const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
1931 0 : const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
1932 0 : const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1933 0 : const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
1934 0 : const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
1935 0 : const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
1936 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1937 0 : const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
1938 0 : const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
1939 0 : const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
1940 0 : const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1941 0 : const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
1942 0 : const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
1943 0 : const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
1944 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1945 0 : const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
1946 0 : const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
1947 0 : const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
1948 0 : const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1949 0 : const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
1950 0 : const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
1951 0 : const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
1952 0 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1953 0 : const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
1954 0 : const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
1955 0 : const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
1956 0 : const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1957 0 : const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
1958 0 : const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
1959 0 : const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
1960 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1961 0 : const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
1962 0 : const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
1963 0 : const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
1964 0 : const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
1965 0 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1966 0 : const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
1967 0 : const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1968 0 : const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
1969 0 : const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
1970 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1971 0 : const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
1972 0 : const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
1973 0 : const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
1974 0 : const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
1975 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1976 0 : const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
1977 0 : const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1978 0 : const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
1979 0 : const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
1980 :
1981 0 : const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
1982 0 : const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
1983 0 : const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
1984 0 : const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1985 0 : const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
1986 0 : const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
1987 0 : const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
1988 0 : const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
1989 0 : const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
1990 0 : const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
1991 0 : const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
1992 0 : const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
1993 0 : const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1994 0 : const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
1995 0 : const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
1996 0 : const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
1997 0 : const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
1998 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1999 0 : const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
2000 0 : const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
2001 0 : const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
2002 0 : const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
2003 0 : const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2004 0 : const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
2005 0 : const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
2006 0 : const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
2007 0 : const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
2008 :
2009 0 : for (col = 0; col < (do_cols ? 64 / 4 : 32 / 4); ++col) {
2010 : __m128i u[64], v[64];
2011 :
2012 : // stage 1
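     : // Gather the nonzero input rows in the interleaved order the butterfly
     : // network expects.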
2013 0 : u[32] = in[1 * 16 + col];
2014 0 : u[34] = in[17 * 16 + col];
2015 0 : u[36] = in[9 * 16 + col];
2016 0 : u[38] = in[25 * 16 + col];
2017 0 : u[40] = in[5 * 16 + col];
2018 0 : u[42] = in[21 * 16 + col];
2019 0 : u[44] = in[13 * 16 + col];
2020 0 : u[46] = in[29 * 16 + col];
2021 0 : u[48] = in[3 * 16 + col];
2022 0 : u[50] = in[19 * 16 + col];
2023 0 : u[52] = in[11 * 16 + col];
2024 0 : u[54] = in[27 * 16 + col];
2025 0 : u[56] = in[7 * 16 + col];
2026 0 : u[58] = in[23 * 16 + col];
2027 0 : u[60] = in[15 * 16 + col];
2028 0 : u[62] = in[31 * 16 + col];
2029 :
2030 0 : v[16] = in[2 * 16 + col];
2031 0 : v[18] = in[18 * 16 + col];
2032 0 : v[20] = in[10 * 16 + col];
2033 0 : v[22] = in[26 * 16 + col];
2034 0 : v[24] = in[6 * 16 + col];
2035 0 : v[26] = in[22 * 16 + col];
2036 0 : v[28] = in[14 * 16 + col];
2037 0 : v[30] = in[30 * 16 + col];
2038 :
2039 0 : u[8] = in[4 * 16 + col];
2040 0 : u[10] = in[20 * 16 + col];
2041 0 : u[12] = in[12 * 16 + col];
2042 0 : u[14] = in[28 * 16 + col];
2043 :
2044 0 : v[4] = in[8 * 16 + col];
2045 0 : v[6] = in[24 * 16 + col];
2046 :
2047 0 : u[0] = in[0 * 16 + col];
2048 0 : u[2] = in[16 * 16 + col];
2049 :
2050 : // stage 2
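     : // half_btf_0_sse4_1(&c, &x, ...) computes (c * x + rnding) >> bit, a
     : // one-sided butterfly seeding the odd half.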
2051 0 : v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
2052 0 : v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
2053 0 : v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
2054 0 : v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
2055 0 : v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
2056 0 : v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
2057 0 : v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
2058 0 : v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
2059 0 : v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
2060 0 : v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
2061 0 : v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
2062 0 : v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
2063 0 : v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
2064 0 : v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
2065 0 : v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
2066 0 : v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
2067 0 : v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
2068 0 : v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
2069 0 : v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
2070 0 : v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
2071 0 : v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
2072 0 : v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
2073 0 : v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
2074 0 : v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
2075 0 : v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
2076 0 : v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
2077 0 : v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
2078 0 : v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
2079 0 : v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
2080 0 : v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
2081 0 : v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
2082 0 : v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
2083 :
2084 : // stage 3
2085 0 : u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
2086 0 : u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
2087 0 : u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
2088 0 : u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
2089 0 : u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
2090 0 : u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
2091 0 : u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
2092 0 : u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
2093 0 : u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
2094 0 : u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
2095 0 : u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
2096 0 : u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
2097 0 : u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
2098 0 : u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
2099 0 : u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
2100 0 : u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
2101 :
2102 0 : for (i = 32; i < 64; i += 4) {
2103 0 : addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
2104 : &clamp_hi);
2105 0 : addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
2106 : &clamp_hi);
2107 : }
2108 :
2109 : // stage 4
2110 0 : v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
2111 0 : v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
2112 0 : v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
2113 0 : v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
2114 0 : v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
2115 0 : v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
2116 0 : v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
2117 0 : v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
2118 :
2119 0 : for (i = 16; i < 32; i += 4) {
2120 0 : addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
2121 : &clamp_hi);
2122 0 : addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
2123 : &clamp_hi);
2124 : }
2125 :
2126 0 : for (i = 32; i < 64; i += 4) {
2127 0 : v[i + 0] = u[i + 0];
2128 0 : v[i + 3] = u[i + 3];
2129 : }
2130 :
2131 0 : v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
2132 0 : v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
2133 0 : v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
2134 0 : v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
2135 0 : v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
2136 0 : v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
2137 0 : v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
2138 0 : v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
2139 0 : v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
2140 0 : v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
2141 0 : v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
2142 0 : v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
2143 0 : v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
2144 0 : v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
2145 0 : v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
2146 0 : v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
2147 :
2148 : // stage 5
2149 0 : u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
2150 0 : u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
2151 0 : u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
2152 0 : u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
2153 :
2154 0 : for (i = 8; i < 16; i += 4) {
2155 0 : addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
2156 : &clamp_hi);
2157 0 : addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
2158 : &clamp_hi);
2159 : }
2160 :
2161 0 : for (i = 16; i < 32; i += 4) {
2162 0 : u[i + 0] = v[i + 0];
2163 0 : u[i + 3] = v[i + 3];
2164 : }
2165 :
2166 0 : u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
2167 0 : u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
2168 0 : u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
2169 0 : u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
2170 0 : u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
2171 0 : u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
2172 0 : u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
2173 0 : u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
2174 :
2175 0 : for (i = 32; i < 64; i += 8) {
2176 0 : addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
2177 : &clamp_hi);
2178 0 : addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
2179 : &clamp_hi);
2180 :
2181 0 : addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
2182 : &clamp_hi);
2183 0 : addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
2184 : &clamp_hi);
2185 : }
2186 :
2187 : // stage 6
2188 0 : v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
2189 0 : v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
2190 0 : v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
2191 0 : v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
2192 :
2193 0 : addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
2194 0 : addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
2195 :
2196 0 : for (i = 8; i < 16; i += 4) {
2197 0 : v[i + 0] = u[i + 0];
2198 0 : v[i + 3] = u[i + 3];
2199 : }
2200 :
2201 0 : v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
2202 0 : v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
2203 0 : v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
2204 0 : v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
2205 :
2206 0 : for (i = 16; i < 32; i += 8) {
2207 0 : addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
2208 : &clamp_hi);
2209 0 : addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
2210 : &clamp_hi);
2211 :
2212 0 : addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
2213 : &clamp_hi);
2214 0 : addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
2215 : &clamp_hi);
2216 : }
2217 :
2218 0 : for (i = 32; i < 64; i += 8) {
2219 0 : v[i + 0] = u[i + 0];
2220 0 : v[i + 1] = u[i + 1];
2221 0 : v[i + 6] = u[i + 6];
2222 0 : v[i + 7] = u[i + 7];
2223 : }
2224 :
2225 0 : v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
2226 0 : v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
2227 0 : v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
2228 0 : v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
2229 0 : v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
2230 0 : v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
2231 0 : v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
2232 0 : v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
2233 0 : v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
2234 0 : v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
2235 0 : v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
2236 0 : v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
2237 0 : v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
2238 0 : v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
2239 0 : v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
2240 0 : v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
2241 :
2242 : // stage 7
2243 0 : addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2244 0 : addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2245 :
2246 0 : u[4] = v[4];
2247 0 : u[7] = v[7];
2248 0 : u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
2249 0 : u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
2250 :
2251 0 : addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2252 0 : addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2253 0 : addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2254 0 : addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2255 :
2256 0 : for (i = 16; i < 32; i += 8) {
2257 0 : u[i + 0] = v[i + 0];
2258 0 : u[i + 1] = v[i + 1];
2259 0 : u[i + 6] = v[i + 6];
2260 0 : u[i + 7] = v[i + 7];
2261 : }
2262 :
2263 0 : u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
2264 0 : u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
2265 0 : u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
2266 0 : u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
2267 0 : u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
2268 0 : u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
2269 0 : u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
2270 0 : u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
2271 :
2272 0 : for (i = 32; i < 64; i += 16) {
2273 0 : for (j = i; j < i + 4; j++) {
2274 0 : addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
2275 0 : addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
2276 : &clamp_hi);
2277 : }
2278 : }
2279 :
2280 : // stage 8
2281 0 : for (i = 0; i < 4; ++i)
2282 0 : addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
2283 0 : v[8] = u[8];
2284 0 : v[9] = u[9];
2285 0 : v[14] = u[14];
2286 0 : v[15] = u[15];
2287 :
2288 0 : v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
2289 0 : v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
2290 0 : v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
2291 0 : v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
2292 :
2293 0 : for (i = 16; i < 20; ++i) {
2294 0 : addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
2295 0 : addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
2296 : &clamp_hi);
2297 : }
2298 :
2299 0 : for (i = 32; i < 36; ++i) {
2300 0 : v[i] = u[i];
2301 0 : v[i + 12] = u[i + 12];
2302 0 : v[i + 16] = u[i + 16];
2303 0 : v[i + 28] = u[i + 28];
2304 : }
2305 :
2306 0 : v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
2307 0 : v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
2308 0 : v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
2309 0 : v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
2310 0 : v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
2311 0 : v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
2312 0 : v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
2313 0 : v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
2314 0 : v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
2315 0 : v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
2316 0 : v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
2317 0 : v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
2318 0 : v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
2319 0 : v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
2320 0 : v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
2321 0 : v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
2322 :
2323 : // stage 9
2324 0 : for (i = 0; i < 8; ++i)
2325 0 : addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
2326 0 : for (i = 16; i < 20; ++i) {
2327 0 : u[i] = v[i];
2328 0 : u[i + 12] = v[i + 12];
2329 : }
2330 :
2331 0 : u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
2332 0 : u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
2333 0 : u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
2334 0 : u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
2335 0 : u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
2336 0 : u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
2337 0 : u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
2338 0 : u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
2339 :
2340 0 : for (i = 32; i < 40; i++)
2341 0 : addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
2342 0 : for (i = 48; i < 56; i++)
2343 0 : addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
2344 : // stage 10
2345 0 : for (i = 0; i < 16; i++)
2346 0 : addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
2347 0 : for (i = 32; i < 40; i++) v[i] = u[i];
2348 :
2349 0 : v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
2350 0 : v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
2351 0 : v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
2352 0 : v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
2353 0 : v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
2354 0 : v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
2355 0 : v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
2356 0 : v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
2357 0 : v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
2358 0 : v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
2359 0 : v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
2360 0 : v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
2361 0 : v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
2362 0 : v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
2363 0 : v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
2364 0 : v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
2365 :
2366 0 : for (i = 56; i < 64; i++) v[i] = u[i];
2367 :
2368 : // stage 11
2369 0 : for (i = 0; i < 32; i++) {
2370 0 :             addsub_sse4_1(v[i], v[63 - i], &out[16 * i + col],
2371 0 : &out[16 * (63 - i) + col], &clamp_lo, &clamp_hi);
2372 : }
2373 : }
2374 0 : }
2375 :
2376 0 : void eb_av1_inv_txfm2d_add_64x64_sse4_1(const int32_t *input,
2377 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
2378 : TxType tx_type, int32_t bd) {
2379 : __m128i in[64 * 64 / 4], out[64 * 64 / 4];
2380 0 : const int8_t *shift = eb_inv_txfm_shift_ls[TX_64X64];
2381 0 : const int32_t txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
2382 0 : const int32_t txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
2383 :
2384 0 : switch (tx_type) {
2385 0 : case DCT_DCT:
2386 0 : load_buffer_64x64_lower_32x32(input, in);
2387 0 : transpose_64x64(in, out, 0);
2388 0 : idct64x64_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
2389 :             // Transpose before the shift so the shift can be applied to 512 contiguous values
2390 0 : transpose_64x64(in, out, 1);
2391 0 : round_shift_64x64(out, -shift[0]);
2392 0 : idct64x64_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
2393 0 : write_buffer_64x64(in, output_r, stride_r, output_w, stride_w,
2394 0 : 0, 0, -shift[1], bd);
2395 0 : break;
2396 :
2397 0 : default:
2398 0 : eb_av1_inv_txfm2d_add_64x64_c(input, output_r, stride_r, output_w, stride_w,
2399 : tx_type, bd);
2400 0 : break;
2401 : }
2402 0 : }
2403 :
2404 : // 4x8
2405 0 : static INLINE void load_buffer_32bit_input(const int32_t *in, int32_t stride,
2406 : __m128i *out, int32_t out_size) {
2407 0 : for (int32_t i = 0; i < out_size; ++i)
2408 0 : out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
2409 0 : }
2410 :
2411 0 : static void highbd_clamp_epi32_sse4_1(const __m128i *in, __m128i *out,
2412 : const __m128i *clamp_lo,
2413 : const __m128i *clamp_hi, int32_t size) {
2414 : __m128i a0, a1;
2415 0 : for (int32_t i = 0; i < size; i += 4) {
2416 0 : a0 = _mm_max_epi32(in[i], *clamp_lo);
2417 0 : out[i] = _mm_min_epi32(a0, *clamp_hi);
2418 :
2419 0 : a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
2420 0 : out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
2421 :
2422 0 : a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
2423 0 : out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
2424 :
2425 0 : a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
2426 0 : out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
2427 : }
2428 0 : }
2429 :
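     : // Bias in0 by (1 << shift) >> 1 once so both the sum and the difference
     : // are rounded before the arithmetic shift and clamp.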
2430 0 : static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
2431 : __m128i *out0, __m128i *out1,
2432 : const __m128i *clamp_lo,
2433 : const __m128i *clamp_hi, int32_t shift) {
2434 0 : __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
2435 0 : __m128i in0_w_offset = _mm_add_epi32(in0, offset);
2436 0 : __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
2437 0 : __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);
2438 :
2439 0 : a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
2440 0 : a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
2441 :
2442 0 : a0 = _mm_max_epi32(a0, *clamp_lo);
2443 0 : a0 = _mm_min_epi32(a0, *clamp_hi);
2444 0 : a1 = _mm_max_epi32(a1, *clamp_lo);
2445 0 : a1 = _mm_min_epi32(a1, *clamp_hi);
2446 :
2447 0 : *out0 = a0;
2448 0 : *out1 = a1;
2449 0 : }
2450 :
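     : // Like addsub_shift, but outputs round_shift(in0) and round_shift(-in1).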
2451 0 : static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
2452 : __m128i *out0, __m128i *out1,
2453 : const __m128i *clamp_lo, const __m128i *clamp_hi,
2454 : int32_t shift) {
2455 0 : __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
2456 0 : __m128i a0 = _mm_add_epi32(offset, in0);
2457 0 : __m128i a1 = _mm_sub_epi32(offset, in1);
2458 :
2459 0 : a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
2460 0 : a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
2461 :
2462 0 : a0 = _mm_max_epi32(a0, *clamp_lo);
2463 0 : a0 = _mm_min_epi32(a0, *clamp_hi);
2464 0 : a1 = _mm_max_epi32(a1, *clamp_lo);
2465 0 : a1 = _mm_min_epi32(a1, *clamp_hi);
2466 :
2467 0 : *out0 = a0;
2468 0 : *out1 = a1;
2469 0 : }
2470 :
2471 0 : static void shift_sse4_1(const __m128i *in, __m128i *out,
2472 : const __m128i *clamp_lo, const __m128i *clamp_hi,
2473 : int32_t shift, int32_t size) {
2474 0 : __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
2475 0 : __m128i shift_vec = _mm_cvtsi32_si128(shift);
2476 : __m128i a0, a1;
2477 0 : for (int32_t i = 0; i < size; i += 4) {
2478 0 : a0 = _mm_add_epi32(in[i], offset);
2479 0 : a1 = _mm_add_epi32(in[i + 1], offset);
2480 0 : a0 = _mm_sra_epi32(a0, shift_vec);
2481 0 : a1 = _mm_sra_epi32(a1, shift_vec);
2482 0 : a0 = _mm_max_epi32(a0, *clamp_lo);
2483 0 : a1 = _mm_max_epi32(a1, *clamp_lo);
2484 0 : out[i] = _mm_min_epi32(a0, *clamp_hi);
2485 0 : out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
2486 :
2487 0 : a0 = _mm_add_epi32(in[i + 2], offset);
2488 0 : a1 = _mm_add_epi32(in[i + 3], offset);
2489 0 : a0 = _mm_sra_epi32(a0, shift_vec);
2490 0 : a1 = _mm_sra_epi32(a1, shift_vec);
2491 0 : a0 = _mm_max_epi32(a0, *clamp_lo);
2492 0 : a1 = _mm_max_epi32(a1, *clamp_lo);
2493 0 : out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
2494 0 : out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
2495 : }
2496 0 : }
2497 :
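     : // round_shift(x, bit) = (x + (1 << (bit - 1))) >> bit, vectorized over 4 lanes.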
2498 0 : static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int32_t bit) {
2499 : __m128i tmp, round;
2500 0 : round = _mm_set1_epi32(1 << (bit - 1));
2501 0 : tmp = _mm_add_epi32(vec, round);
2502 0 : return _mm_srai_epi32(tmp, bit);
2503 : }
2504 :
2505 0 : static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
2506 : __m128i *output,
2507 : const int32_t size,
2508 : const int32_t bit) {
2509 0 : if (bit > 0) {
2510 : int32_t i;
2511 0 : for (i = 0; i < size; i++)
2512 0 : output[i] = av1_round_shift_32_sse4_1(input[i], bit);
2513 : }
2514 : else {
2515 : int32_t i;
2516 0 : for (i = 0; i < size; i++)
2517 0 : output[i] = _mm_slli_epi32(input[i], -bit);
2518 : }
2519 0 : }
2520 :
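     : // Rectangular-transform variant: after the shift, scale by val (a
     : // NewSqrt2Bits fixed-point factor such as NewSqrt2) and round-shift by
     : // NewSqrt2Bits.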
2521 0 : static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
2522 : __m128i *output,
2523 : const int32_t size,
2524 : const int32_t bit,
2525 : const int32_t val) {
2526 0 : const __m128i sqrt2 = _mm_set1_epi32(val);
2527 0 : if (bit > 0) {
2528 : int32_t i;
2529 0 : for (i = 0; i < size; i++) {
2530 0 : const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit);
2531 0 : const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
2532 0 : output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
2533 : }
2534 : }
2535 : else {
2536 : int32_t i;
2537 0 : for (i = 0; i < size; i++) {
2538 0 : const __m128i r0 = _mm_slli_epi32(input[i], -bit);
2539 0 : const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
2540 0 : output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
2541 : }
2542 : }
2543 0 : }
2544 :
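     : // IIDENTITY4 scales by sqrt(2):
     : // (in * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits,
     : // then transposes the 4x4 tile in-register.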
2545 0 : static void iidentity4_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
2546 : int32_t bd, int32_t out_shift) {
2547 : (void)bit;
2548 : (void)out_shift;
2549 : __m128i v[4];
2550 0 : __m128i fact = _mm_set1_epi32(NewSqrt2);
2551 0 : __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
2552 : __m128i a0, a1;
2553 :
2554 0 : a0 = _mm_mullo_epi32(in[0], fact);
2555 0 : a1 = _mm_mullo_epi32(in[1], fact);
2556 0 : a0 = _mm_add_epi32(a0, offset);
2557 0 : a1 = _mm_add_epi32(a1, offset);
2558 0 : out[0] = _mm_srai_epi32(a0, NewSqrt2Bits);
2559 0 : out[1] = _mm_srai_epi32(a1, NewSqrt2Bits);
2560 :
2561 0 : a0 = _mm_mullo_epi32(in[2], fact);
2562 0 : a1 = _mm_mullo_epi32(in[3], fact);
2563 0 : a0 = _mm_add_epi32(a0, offset);
2564 0 : a1 = _mm_add_epi32(a1, offset);
2565 0 : out[2] = _mm_srai_epi32(a0, NewSqrt2Bits);
2566 0 : out[3] = _mm_srai_epi32(a1, NewSqrt2Bits);
2567 :
2568 0 : if (!do_cols) {
2569 0 : const int32_t log_range = AOMMAX(16, bd + 6);
2570 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2571 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2572 :
2573 0 : highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
2574 : }
2575 :
2576 : // Transpose for 4x4
2577 0 : v[0] = _mm_unpacklo_epi32(out[0], out[1]);
2578 0 : v[1] = _mm_unpackhi_epi32(out[0], out[1]);
2579 0 : v[2] = _mm_unpacklo_epi32(out[2], out[3]);
2580 0 : v[3] = _mm_unpackhi_epi32(out[2], out[3]);
2581 :
2582 0 : out[0] = _mm_unpacklo_epi64(v[0], v[2]);
2583 0 : out[1] = _mm_unpackhi_epi64(v[0], v[2]);
2584 0 : out[2] = _mm_unpacklo_epi64(v[1], v[3]);
2585 0 : out[3] = _mm_unpackhi_epi64(v[1], v[3]);
2586 0 : }
2587 :
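     : // IIDENTITY8 doubles each input, then round-shifts (row pass) or clamps
     : // (column pass).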
2588 0 : static void iidentity8_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
2589 : int32_t bd, int32_t out_shift) {
2590 : (void)bit;
2591 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2592 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2593 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2594 : __m128i v[8];
2595 0 : v[0] = _mm_add_epi32(in[0], in[0]);
2596 0 : v[1] = _mm_add_epi32(in[1], in[1]);
2597 0 : v[2] = _mm_add_epi32(in[2], in[2]);
2598 0 : v[3] = _mm_add_epi32(in[3], in[3]);
2599 0 : v[4] = _mm_add_epi32(in[4], in[4]);
2600 0 : v[5] = _mm_add_epi32(in[5], in[5]);
2601 0 : v[6] = _mm_add_epi32(in[6], in[6]);
2602 0 : v[7] = _mm_add_epi32(in[7], in[7]);
2603 :
2604 0 : if (!do_cols) {
2605 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
2606 0 : const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
2607 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2608 0 : const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
2609 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2610 :
2611 0 : shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 8);
2612 : }
2613 : else
2614 0 : highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 8);
2615 0 : }
2616 :
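     : // IIDENTITY16 scales by 2 * sqrt(2) in NewSqrt2Bits fixed point, eight
     : // registers per loop iteration.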
2617 0 : static void iidentity16_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
2618 : int32_t bd, int32_t out_shift) {
2619 : (void)bit;
2620 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2621 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2622 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2623 : __m128i v[16];
2624 0 : __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
2625 0 : __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
2626 : __m128i a0, a1, a2, a3;
2627 :
2628 0 : for (int32_t i = 0; i < 16; i += 8) {
2629 0 : a0 = _mm_mullo_epi32(in[i], fact);
2630 0 : a1 = _mm_mullo_epi32(in[i + 1], fact);
2631 0 : a0 = _mm_add_epi32(a0, offset);
2632 0 : a1 = _mm_add_epi32(a1, offset);
2633 0 : v[i] = _mm_srai_epi32(a0, NewSqrt2Bits);
2634 0 : v[i + 1] = _mm_srai_epi32(a1, NewSqrt2Bits);
2635 :
2636 0 : a2 = _mm_mullo_epi32(in[i + 2], fact);
2637 0 : a3 = _mm_mullo_epi32(in[i + 3], fact);
2638 0 : a2 = _mm_add_epi32(a2, offset);
2639 0 : a3 = _mm_add_epi32(a3, offset);
2640 0 : v[i + 2] = _mm_srai_epi32(a2, NewSqrt2Bits);
2641 0 : v[i + 3] = _mm_srai_epi32(a3, NewSqrt2Bits);
2642 :
2643 0 : a0 = _mm_mullo_epi32(in[i + 4], fact);
2644 0 : a1 = _mm_mullo_epi32(in[i + 5], fact);
2645 0 : a0 = _mm_add_epi32(a0, offset);
2646 0 : a1 = _mm_add_epi32(a1, offset);
2647 0 : v[i + 4] = _mm_srai_epi32(a0, NewSqrt2Bits);
2648 0 : v[i + 5] = _mm_srai_epi32(a1, NewSqrt2Bits);
2649 :
2650 0 : a2 = _mm_mullo_epi32(in[i + 6], fact);
2651 0 : a3 = _mm_mullo_epi32(in[i + 7], fact);
2652 0 : a2 = _mm_add_epi32(a2, offset);
2653 0 : a3 = _mm_add_epi32(a3, offset);
2654 0 : v[i + 6] = _mm_srai_epi32(a2, NewSqrt2Bits);
2655 0 : v[i + 7] = _mm_srai_epi32(a3, NewSqrt2Bits);
2656 : }
2657 :
2658 0 : if (!do_cols) {
2659 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
2660 0 : const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
2661 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2662 0 : const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
2663 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2664 :
2665 0 : shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 16);
2666 : }
2667 : else
2668 0 : highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 16);
2669 0 : }
2670 :
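     : // Widen the 16-bit prediction to 32 bits, add the residual, pack with
     : // unsigned saturation and clamp to the bit depth.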
2671 0 : static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred,
2672 : __m128i res0, const int32_t bd) {
2673 0 : __m128i x0 = _mm_cvtepi16_epi32(pred);
2674 :
2675 0 : x0 = _mm_add_epi32(res0, x0);
2676 0 : x0 = _mm_packus_epi32(x0, x0);
2677 0 : x0 = highbd_clamp_epi16(x0, bd);
2678 0 : return x0;
2679 : }
2680 :
2681 0 : static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
2682 : int32_t bd, int32_t out_shift) {
2683 0 : const int32_t *cospi = cospi_arr(bit);
2684 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2685 0 : const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
2686 0 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2687 0 : const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
2688 0 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2689 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2690 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2691 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2692 0 : const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
2693 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2694 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2695 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2696 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2697 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2698 : __m128i u0, u1, u2, u3, u4, u5, u6, u7;
2699 : __m128i v0, v1, v2, v3, v4, v5, v6, v7;
2700 : __m128i x, y;
2701 :
2702 : // stage 0
2703 : // stage 1
2704 : // stage 2
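     : // Even inputs (rows 0, 4, 2, 6) pass straight to u0..u3; odd inputs get
     : // the cospi8/56 and cospi40/24 rotations.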
2705 0 : u0 = in[0];
2706 0 : u1 = in[4];
2707 0 : u2 = in[2];
2708 0 : u3 = in[6];
2709 :
2710 0 : x = _mm_mullo_epi32(in[1], cospi56);
2711 0 : y = _mm_mullo_epi32(in[7], cospim8);
2712 0 : u4 = _mm_add_epi32(x, y);
2713 0 : u4 = _mm_add_epi32(u4, rnding);
2714 0 : u4 = _mm_srai_epi32(u4, bit);
2715 :
2716 0 : x = _mm_mullo_epi32(in[1], cospi8);
2717 0 : y = _mm_mullo_epi32(in[7], cospi56);
2718 0 : u7 = _mm_add_epi32(x, y);
2719 0 : u7 = _mm_add_epi32(u7, rnding);
2720 0 : u7 = _mm_srai_epi32(u7, bit);
2721 :
2722 0 : x = _mm_mullo_epi32(in[5], cospi24);
2723 0 : y = _mm_mullo_epi32(in[3], cospim40);
2724 0 : u5 = _mm_add_epi32(x, y);
2725 0 : u5 = _mm_add_epi32(u5, rnding);
2726 0 : u5 = _mm_srai_epi32(u5, bit);
2727 :
2728 0 : x = _mm_mullo_epi32(in[5], cospi40);
2729 0 : y = _mm_mullo_epi32(in[3], cospi24);
2730 0 : u6 = _mm_add_epi32(x, y);
2731 0 : u6 = _mm_add_epi32(u6, rnding);
2732 0 : u6 = _mm_srai_epi32(u6, bit);
2733 :
2734 : // stage 3
2735 0 : x = _mm_mullo_epi32(u0, cospi32);
2736 0 : y = _mm_mullo_epi32(u1, cospi32);
2737 0 : v0 = _mm_add_epi32(x, y);
2738 0 : v0 = _mm_add_epi32(v0, rnding);
2739 0 : v0 = _mm_srai_epi32(v0, bit);
2740 :
2741 0 : v1 = _mm_sub_epi32(x, y);
2742 0 : v1 = _mm_add_epi32(v1, rnding);
2743 0 : v1 = _mm_srai_epi32(v1, bit);
2744 :
2745 0 : x = _mm_mullo_epi32(u2, cospi48);
2746 0 : y = _mm_mullo_epi32(u3, cospim16);
2747 0 : v2 = _mm_add_epi32(x, y);
2748 0 : v2 = _mm_add_epi32(v2, rnding);
2749 0 : v2 = _mm_srai_epi32(v2, bit);
2750 :
2751 0 : x = _mm_mullo_epi32(u2, cospi16);
2752 0 : y = _mm_mullo_epi32(u3, cospi48);
2753 0 : v3 = _mm_add_epi32(x, y);
2754 0 : v3 = _mm_add_epi32(v3, rnding);
2755 0 : v3 = _mm_srai_epi32(v3, bit);
2756 :
2757 0 : addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
2758 0 : addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
2759 :
2760 : // stage 4
2761 0 : addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
2762 0 : addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
2763 0 : u4 = v4;
2764 0 : u7 = v7;
2765 :
2766 0 : x = _mm_mullo_epi32(v5, cospi32);
2767 0 : y = _mm_mullo_epi32(v6, cospi32);
2768 0 : u6 = _mm_add_epi32(y, x);
2769 0 : u6 = _mm_add_epi32(u6, rnding);
2770 0 : u6 = _mm_srai_epi32(u6, bit);
2771 :
2772 0 : u5 = _mm_sub_epi32(y, x);
2773 0 : u5 = _mm_add_epi32(u5, rnding);
2774 0 : u5 = _mm_srai_epi32(u5, bit);
2775 :
2776 : // stage 5
2777 0 : if (do_cols) {
2778 0 : addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
2779 0 : addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
2780 0 : addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
2781 0 : addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
2782 : }
2783 : else {
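            // Intermediate (row) pass: round-shift each output by out_shift
            // and clamp to the tighter of the internal working range and the
            // bd + 6 bit range expected by the column pass.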
2784 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
2785 0 : const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
2786 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2787 0 : const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
2788 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2789 0 : addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
2790 : out_shift);
2791 0 : addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
2792 : out_shift);
2793 0 : addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
2794 : out_shift);
2795 0 : addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
2796 : out_shift);
2797 : }
2798 0 : }
2799 :
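// Shortcut for an 8-point inverse DCT whose only nonzero input is the DC
// coefficient: the transform collapses to one fixed-point scale of in[0]
// by cospi[32] (cos(pi/4)), broadcast to all eight outputs.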
2800 0 : static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
2801 : int32_t bd, int32_t out_shift) {
2802 0 : const int32_t *cospi = cospi_arr(bit);
2803 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2804 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2805 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2806 : __m128i x;
2807 :
2808 : // stage 0
2809 : // stage 1
2810 : // stage 2
2811 : // stage 3
2812 0 : x = _mm_mullo_epi32(in[0], cospi32);
2813 0 : x = _mm_add_epi32(x, rnding);
2814 0 : x = _mm_srai_epi32(x, bit);
2815 :
2816 : // stage 4
2817 : // stage 5
2818 0 : if (!do_cols) {
2819 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
2820 0 : const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
2821 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2822 0 : const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
2823 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2824 :
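        // (1 << out_shift) >> 1 is the rounding bias: adding it before the
        // arithmetic shift makes the shift round to nearest rather than
        // truncate toward negative infinity.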
2825 0 : __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
2826 0 : x = _mm_add_epi32(x, offset);
2827 0 : x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2828 0 : x = _mm_max_epi32(x, clamp_lo_out);
2829 0 : x = _mm_min_epi32(x, clamp_hi_out);
2830 : }
2831 :
2832 0 : out[0] = x;
2833 0 : out[1] = x;
2834 0 : out[2] = x;
2835 0 : out[3] = x;
2836 0 : out[4] = x;
2837 0 : out[5] = x;
2838 0 : out[6] = x;
2839 0 : out[7] = x;
2840 0 : }
2841 :
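// Full 8-point inverse ADST. Each stage mirrors the scalar AV1 reference:
// fixed-point butterflies rounded by 'rnding' and shifted right by 'bit',
// with intermediate clamping to the working range.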
2842 0 : static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
2843 : int32_t bd, int32_t out_shift) {
2844 0 : const int32_t *cospi = cospi_arr(bit);
2845 0 : const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
2846 0 : const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
2847 0 : const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
2848 0 : const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
2849 0 : const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
2850 0 : const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
2851 0 : const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
2852 0 : const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
2853 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2854 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2855 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2856 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2857 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2858 0 : const __m128i kZero = _mm_setzero_si128();
2859 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2860 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2861 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2862 : __m128i u[8], v[8], x;
2863 :
2864 : // stage 0
2865 : // stage 1
2866 : // stage 2
2867 :
2868 0 : u[0] = _mm_mullo_epi32(in[7], cospi4);
2869 0 : x = _mm_mullo_epi32(in[0], cospi60);
2870 0 : u[0] = _mm_add_epi32(u[0], x);
2871 0 : u[0] = _mm_add_epi32(u[0], rnding);
2872 0 : u[0] = _mm_srai_epi32(u[0], bit);
2873 :
2874 0 : u[1] = _mm_mullo_epi32(in[7], cospi60);
2875 0 : x = _mm_mullo_epi32(in[0], cospi4);
2876 0 : u[1] = _mm_sub_epi32(u[1], x);
2877 0 : u[1] = _mm_add_epi32(u[1], rnding);
2878 0 : u[1] = _mm_srai_epi32(u[1], bit);
2879 :
2880 : // (2) in[5], in[2] -> u[2], u[3]
2881 0 : u[2] = _mm_mullo_epi32(in[5], cospi20);
2882 0 : x = _mm_mullo_epi32(in[2], cospi44);
2883 0 : u[2] = _mm_add_epi32(u[2], x);
2884 0 : u[2] = _mm_add_epi32(u[2], rnding);
2885 0 : u[2] = _mm_srai_epi32(u[2], bit);
2886 :
2887 0 : u[3] = _mm_mullo_epi32(in[5], cospi44);
2888 0 : x = _mm_mullo_epi32(in[2], cospi20);
2889 0 : u[3] = _mm_sub_epi32(u[3], x);
2890 0 : u[3] = _mm_add_epi32(u[3], rnding);
2891 0 : u[3] = _mm_srai_epi32(u[3], bit);
2892 :
2893 : // (3) in[3], in[4] -> u[4], u[5]
2894 0 : u[4] = _mm_mullo_epi32(in[3], cospi36);
2895 0 : x = _mm_mullo_epi32(in[4], cospi28);
2896 0 : u[4] = _mm_add_epi32(u[4], x);
2897 0 : u[4] = _mm_add_epi32(u[4], rnding);
2898 0 : u[4] = _mm_srai_epi32(u[4], bit);
2899 :
2900 0 : u[5] = _mm_mullo_epi32(in[3], cospi28);
2901 0 : x = _mm_mullo_epi32(in[4], cospi36);
2902 0 : u[5] = _mm_sub_epi32(u[5], x);
2903 0 : u[5] = _mm_add_epi32(u[5], rnding);
2904 0 : u[5] = _mm_srai_epi32(u[5], bit);
2905 :
2906 : // (4) in[1], in[6] -> u[6], u[7]
2907 0 : u[6] = _mm_mullo_epi32(in[1], cospi52);
2908 0 : x = _mm_mullo_epi32(in[6], cospi12);
2909 0 : u[6] = _mm_add_epi32(u[6], x);
2910 0 : u[6] = _mm_add_epi32(u[6], rnding);
2911 0 : u[6] = _mm_srai_epi32(u[6], bit);
2912 :
2913 0 : u[7] = _mm_mullo_epi32(in[1], cospi12);
2914 0 : x = _mm_mullo_epi32(in[6], cospi52);
2915 0 : u[7] = _mm_sub_epi32(u[7], x);
2916 0 : u[7] = _mm_add_epi32(u[7], rnding);
2917 0 : u[7] = _mm_srai_epi32(u[7], bit);
2918 :
2919 : // stage 3
2920 0 : addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
2921 0 : addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
2922 0 : addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
2923 0 : addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
2924 :
2925 : // stage 4
2926 0 : u[0] = v[0];
2927 0 : u[1] = v[1];
2928 0 : u[2] = v[2];
2929 0 : u[3] = v[3];
2930 :
2931 0 : u[4] = _mm_mullo_epi32(v[4], cospi16);
2932 0 : x = _mm_mullo_epi32(v[5], cospi48);
2933 0 : u[4] = _mm_add_epi32(u[4], x);
2934 0 : u[4] = _mm_add_epi32(u[4], rnding);
2935 0 : u[4] = _mm_srai_epi32(u[4], bit);
2936 :
2937 0 : u[5] = _mm_mullo_epi32(v[4], cospi48);
2938 0 : x = _mm_mullo_epi32(v[5], cospi16);
2939 0 : u[5] = _mm_sub_epi32(u[5], x);
2940 0 : u[5] = _mm_add_epi32(u[5], rnding);
2941 0 : u[5] = _mm_srai_epi32(u[5], bit);
2942 :
2943 0 : u[6] = _mm_mullo_epi32(v[6], cospim48);
2944 0 : x = _mm_mullo_epi32(v[7], cospi16);
2945 0 : u[6] = _mm_add_epi32(u[6], x);
2946 0 : u[6] = _mm_add_epi32(u[6], rnding);
2947 0 : u[6] = _mm_srai_epi32(u[6], bit);
2948 :
2949 0 : u[7] = _mm_mullo_epi32(v[6], cospi16);
2950 0 : x = _mm_mullo_epi32(v[7], cospim48);
2951 0 : u[7] = _mm_sub_epi32(u[7], x);
2952 0 : u[7] = _mm_add_epi32(u[7], rnding);
2953 0 : u[7] = _mm_srai_epi32(u[7], bit);
2954 :
2955 : // stage 5
2956 0 : addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
2957 0 : addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
2958 0 : addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
2959 0 : addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
2960 :
2961 : // stage 6
2962 0 : u[0] = v[0];
2963 0 : u[1] = v[1];
2964 0 : u[4] = v[4];
2965 0 : u[5] = v[5];
2966 :
2967 0 : v[0] = _mm_mullo_epi32(v[2], cospi32);
2968 0 : x = _mm_mullo_epi32(v[3], cospi32);
2969 0 : u[2] = _mm_add_epi32(v[0], x);
2970 0 : u[2] = _mm_add_epi32(u[2], rnding);
2971 0 : u[2] = _mm_srai_epi32(u[2], bit);
2972 :
2973 0 : u[3] = _mm_sub_epi32(v[0], x);
2974 0 : u[3] = _mm_add_epi32(u[3], rnding);
2975 0 : u[3] = _mm_srai_epi32(u[3], bit);
2976 :
2977 0 : v[0] = _mm_mullo_epi32(v[6], cospi32);
2978 0 : x = _mm_mullo_epi32(v[7], cospi32);
2979 0 : u[6] = _mm_add_epi32(v[0], x);
2980 0 : u[6] = _mm_add_epi32(u[6], rnding);
2981 0 : u[6] = _mm_srai_epi32(u[6], bit);
2982 :
2983 0 : u[7] = _mm_sub_epi32(v[0], x);
2984 0 : u[7] = _mm_add_epi32(u[7], rnding);
2985 0 : u[7] = _mm_srai_epi32(u[7], bit);
2986 :
2987 : // stage 7
2988 0 : if (do_cols) {
2989 0 : out[0] = u[0];
2990 0 : out[1] = _mm_sub_epi32(kZero, u[4]);
2991 0 : out[2] = u[6];
2992 0 : out[3] = _mm_sub_epi32(kZero, u[2]);
2993 0 : out[4] = u[3];
2994 0 : out[5] = _mm_sub_epi32(kZero, u[7]);
2995 0 : out[6] = u[5];
2996 0 : out[7] = _mm_sub_epi32(kZero, u[1]);
2997 : }
2998 : else {
2999 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
3000 0 : const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3001 0 : const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3002 :
3003 0 : neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
3004 : out_shift);
3005 0 : neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
3006 : out_shift);
3007 0 : neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
3008 : out_shift);
3009 0 : neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
3010 : out_shift);
3011 : }
3012 0 : }
3013 :
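// 8-point inverse ADST when only in[0] is nonzero: all zero-input
// butterflies fold away, leaving the stage-2 scale, one stage-4 rotation
// and the final cospi[32] butterflies before the output permutation.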
3014 0 : static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int32_t bit,
3015 : int32_t do_cols, int32_t bd, int32_t out_shift) {
3016 0 : const int32_t *cospi = cospi_arr(bit);
3017 0 : const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3018 0 : const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3019 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3020 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3021 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3022 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3023 0 : const __m128i kZero = _mm_setzero_si128();
3024 : __m128i u[8], x;
3025 :
3026 : // stage 0
3027 : // stage 1
3028 : // stage 2
3029 :
3030 0 : x = _mm_mullo_epi32(in[0], cospi60);
3031 0 : u[0] = _mm_add_epi32(x, rnding);
3032 0 : u[0] = _mm_srai_epi32(u[0], bit);
3033 :
3034 0 : x = _mm_mullo_epi32(in[0], cospi4);
3035 0 : u[1] = _mm_sub_epi32(kZero, x);
3036 0 : u[1] = _mm_add_epi32(u[1], rnding);
3037 0 : u[1] = _mm_srai_epi32(u[1], bit);
3038 :
3039 : // stage 3
3040 : // stage 4
3041 : __m128i temp1, temp2;
3042 0 : temp1 = _mm_mullo_epi32(u[0], cospi16);
3043 0 : x = _mm_mullo_epi32(u[1], cospi48);
3044 0 : temp1 = _mm_add_epi32(temp1, x);
3045 0 : temp1 = _mm_add_epi32(temp1, rnding);
3046 0 : temp1 = _mm_srai_epi32(temp1, bit);
3047 0 : u[4] = temp1;
3048 :
3049 0 : temp2 = _mm_mullo_epi32(u[0], cospi48);
3050 0 : x = _mm_mullo_epi32(u[1], cospi16);
3051 0 : u[5] = _mm_sub_epi32(temp2, x);
3052 0 : u[5] = _mm_add_epi32(u[5], rnding);
3053 0 : u[5] = _mm_srai_epi32(u[5], bit);
3054 :
3055 : // stage 5
3056 : // stage 6
3057 0 : temp1 = _mm_mullo_epi32(u[0], cospi32);
3058 0 : x = _mm_mullo_epi32(u[1], cospi32);
3059 0 : u[2] = _mm_add_epi32(temp1, x);
3060 0 : u[2] = _mm_add_epi32(u[2], rnding);
3061 0 : u[2] = _mm_srai_epi32(u[2], bit);
3062 :
3063 0 : u[3] = _mm_sub_epi32(temp1, x);
3064 0 : u[3] = _mm_add_epi32(u[3], rnding);
3065 0 : u[3] = _mm_srai_epi32(u[3], bit);
3066 :
3067 0 : temp1 = _mm_mullo_epi32(u[4], cospi32);
3068 0 : x = _mm_mullo_epi32(u[5], cospi32);
3069 0 : u[6] = _mm_add_epi32(temp1, x);
3070 0 : u[6] = _mm_add_epi32(u[6], rnding);
3071 0 : u[6] = _mm_srai_epi32(u[6], bit);
3072 :
3073 0 : u[7] = _mm_sub_epi32(temp1, x);
3074 0 : u[7] = _mm_add_epi32(u[7], rnding);
3075 0 : u[7] = _mm_srai_epi32(u[7], bit);
3076 :
3077 : // stage 7
3078 0 : if (do_cols) {
3079 0 : out[0] = u[0];
3080 0 : out[1] = _mm_sub_epi32(kZero, u[4]);
3081 0 : out[2] = u[6];
3082 0 : out[3] = _mm_sub_epi32(kZero, u[2]);
3083 0 : out[4] = u[3];
3084 0 : out[5] = _mm_sub_epi32(kZero, u[7]);
3085 0 : out[6] = u[5];
3086 0 : out[7] = _mm_sub_epi32(kZero, u[1]);
3087 : }
3088 : else {
3089 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
3090 0 : const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3091 0 : const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3092 :
3093 0 : neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
3094 : out_shift);
3095 0 : neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
3096 : out_shift);
3097 0 : neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
3098 : out_shift);
3099 0 : neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
3100 : out_shift);
3101 : }
3102 0 : }
3103 :
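// 16-point inverse DCT, DC-only input: the same collapse as
// idct8x8_low1_sse4_1 but broadcast to sixteen outputs, with an explicit
// clamp applied in the column pass as well.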
3104 0 : static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int32_t bit,
3105 : int32_t do_cols, int32_t bd, int32_t out_shift) {
3106 0 : const int32_t *cospi = cospi_arr(bit);
3107 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3108 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3109 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3110 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3111 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3112 :
3113 : {
3114 : // stage 0
3115 : // stage 1
3116 : // stage 2
3117 : // stage 3
3118 : // stage 4
3119 0 : in[0] = _mm_mullo_epi32(in[0], cospi32);
3120 0 : in[0] = _mm_add_epi32(in[0], rnding);
3121 0 : in[0] = _mm_srai_epi32(in[0], bit);
3122 :
3123 : // stage 5
3124 : // stage 6
3125 : // stage 7
3126 0 : if (do_cols) {
3127 0 : in[0] = _mm_max_epi32(in[0], clamp_lo);
3128 0 : in[0] = _mm_min_epi32(in[0], clamp_hi);
3129 : }
3130 : else {
3131 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
3132 0 : const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
3133 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3134 0 : const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
3135 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3136 0 : __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
3137 0 : in[0] = _mm_add_epi32(in[0], offset);
3138 0 : in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
3139 0 : in[0] = _mm_max_epi32(in[0], clamp_lo_out);
3140 0 : in[0] = _mm_min_epi32(in[0], clamp_hi_out);
3141 : }
3142 :
3143 0 : out[0] = in[0];
3144 0 : out[1] = in[0];
3145 0 : out[2] = in[0];
3146 0 : out[3] = in[0];
3147 0 : out[4] = in[0];
3148 0 : out[5] = in[0];
3149 0 : out[6] = in[0];
3150 0 : out[7] = in[0];
3151 0 : out[8] = in[0];
3152 0 : out[9] = in[0];
3153 0 : out[10] = in[0];
3154 0 : out[11] = in[0];
3155 0 : out[12] = in[0];
3156 0 : out[13] = in[0];
3157 0 : out[14] = in[0];
3158 0 : out[15] = in[0];
3159 : }
3160 0 : }
3161 :
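// 16-point inverse DCT when only the first eight coefficients are nonzero:
// the early stages use the single-input butterfly half_btf_0_sse4_1 where
// the second operand of the pair is known to be zero.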
3162 0 : static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int32_t bit,
3163 : int32_t do_cols, int32_t bd, int32_t out_shift) {
3164 0 : const int32_t *cospi = cospi_arr(bit);
3165 0 : const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3166 0 : const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3167 0 : const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3168 0 : const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3169 0 : const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3170 0 : const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3171 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3172 0 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3173 0 : const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3174 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3175 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3176 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3177 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3178 0 : const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3179 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3180 0 : const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3181 0 : const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3182 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3183 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3184 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3185 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3186 : __m128i u[16], x, y;
3187 :
3188 : {
3189 : // stage 0
3190 : // stage 1
3191 0 : u[0] = in[0];
3192 0 : u[2] = in[4];
3193 0 : u[4] = in[2];
3194 0 : u[6] = in[6];
3195 0 : u[8] = in[1];
3196 0 : u[10] = in[5];
3197 0 : u[12] = in[3];
3198 0 : u[14] = in[7];
3199 :
3200 : // stage 2
3201 0 : u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
3202 0 : u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
3203 :
3204 0 : u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
3205 0 : u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
3206 :
3207 0 : u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
3208 0 : u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
3209 :
3210 0 : u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
3211 0 : u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
3212 :
3213 : // stage 3
3214 0 : u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
3215 0 : u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
3216 0 : u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
3217 0 : u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
3218 :
3219 0 : addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
3220 0 : addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
3221 0 : addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
3222 0 : addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
3223 :
3224 : // stage 4
3225 0 : x = _mm_mullo_epi32(u[0], cospi32);
3226 0 : u[0] = _mm_add_epi32(x, rnding);
3227 0 : u[0] = _mm_srai_epi32(u[0], bit);
3228 0 : u[1] = u[0];
3229 :
3230 0 : u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
3231 0 : u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
3232 :
3233 0 : addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
3234 0 : addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
3235 :
3236 0 : x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3237 0 : u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3238 0 : u[9] = x;
3239 0 : y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3240 0 : u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3241 0 : u[10] = y;
3242 :
3243 : // stage 5
3244 0 : addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
3245 0 : addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
3246 :
3247 0 : x = _mm_mullo_epi32(u[5], cospi32);
3248 0 : y = _mm_mullo_epi32(u[6], cospi32);
3249 0 : u[5] = _mm_sub_epi32(y, x);
3250 0 : u[5] = _mm_add_epi32(u[5], rnding);
3251 0 : u[5] = _mm_srai_epi32(u[5], bit);
3252 :
3253 0 : u[6] = _mm_add_epi32(y, x);
3254 0 : u[6] = _mm_add_epi32(u[6], rnding);
3255 0 : u[6] = _mm_srai_epi32(u[6], bit);
3256 :
3257 0 : addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3258 0 : addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3259 0 : addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3260 0 : addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3261 :
3262 : // stage 6
3263 0 : addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
3264 0 : addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
3265 0 : addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
3266 0 : addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
3267 :
3268 0 : x = _mm_mullo_epi32(u[10], cospi32);
3269 0 : y = _mm_mullo_epi32(u[13], cospi32);
3270 0 : u[10] = _mm_sub_epi32(y, x);
3271 0 : u[10] = _mm_add_epi32(u[10], rnding);
3272 0 : u[10] = _mm_srai_epi32(u[10], bit);
3273 :
3274 0 : u[13] = _mm_add_epi32(x, y);
3275 0 : u[13] = _mm_add_epi32(u[13], rnding);
3276 0 : u[13] = _mm_srai_epi32(u[13], bit);
3277 :
3278 0 : x = _mm_mullo_epi32(u[11], cospi32);
3279 0 : y = _mm_mullo_epi32(u[12], cospi32);
3280 0 : u[11] = _mm_sub_epi32(y, x);
3281 0 : u[11] = _mm_add_epi32(u[11], rnding);
3282 0 : u[11] = _mm_srai_epi32(u[11], bit);
3283 :
3284 0 : u[12] = _mm_add_epi32(x, y);
3285 0 : u[12] = _mm_add_epi32(u[12], rnding);
3286 0 : u[12] = _mm_srai_epi32(u[12], bit);
3287 : // stage 7
3288 0 : if (do_cols) {
3289 0 : addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
3290 0 : addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
3291 0 : addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
3292 0 : addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
3293 0 : addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
3294 0 : addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
3295 0 : addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
3296 0 : addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
3297 : }
3298 : else {
3299 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
3300 0 : const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
3301 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3302 0 : const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
3303 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3304 :
3305 0 : addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
3306 : &clamp_hi_out, out_shift);
3307 0 : addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
3308 : &clamp_hi_out, out_shift);
3309 0 : addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
3310 : &clamp_hi_out, out_shift);
3311 0 : addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
3312 : &clamp_hi_out, out_shift);
3313 0 : addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
3314 : &clamp_hi_out, out_shift);
3315 0 : addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
3316 : &clamp_hi_out, out_shift);
3317 0 : addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
3318 : &clamp_hi_out, out_shift);
3319 0 : addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
3320 : &clamp_hi_out, out_shift);
3321 : }
3322 : }
3323 0 : }
3324 :
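// Full 16-point inverse DCT over one 4-sample column group. The u[] and
// v[] arrays ping-pong between stages; clamping keeps all intermediates
// inside the bd-dependent working range.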
3325 0 : static void idct16x16_new_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
3326 : int32_t bd, int32_t out_shift) {
3327 0 : const int32_t *cospi = cospi_arr(bit);
3328 0 : const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3329 0 : const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3330 0 : const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3331 0 : const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3332 0 : const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3333 0 : const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3334 0 : const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3335 0 : const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3336 0 : const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3337 0 : const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
3338 0 : const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
3339 0 : const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3340 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3341 0 : const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3342 0 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3343 0 : const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3344 0 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3345 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3346 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3347 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3348 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3349 0 : const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3350 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3351 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3352 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3353 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3354 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3355 : __m128i u[16], v[16], x, y;
3356 :
3357 : {
3358 : // stage 0
3359 : // stage 1
3360 0 : u[0] = in[0];
3361 0 : u[1] = in[8];
3362 0 : u[2] = in[4];
3363 0 : u[3] = in[12];
3364 0 : u[4] = in[2];
3365 0 : u[5] = in[10];
3366 0 : u[6] = in[6];
3367 0 : u[7] = in[14];
3368 0 : u[8] = in[1];
3369 0 : u[9] = in[9];
3370 0 : u[10] = in[5];
3371 0 : u[11] = in[13];
3372 0 : u[12] = in[3];
3373 0 : u[13] = in[11];
3374 0 : u[14] = in[7];
3375 0 : u[15] = in[15];
3376 :
3377 : // stage 2
3378 0 : v[0] = u[0];
3379 0 : v[1] = u[1];
3380 0 : v[2] = u[2];
3381 0 : v[3] = u[3];
3382 0 : v[4] = u[4];
3383 0 : v[5] = u[5];
3384 0 : v[6] = u[6];
3385 0 : v[7] = u[7];
3386 :
3387 0 : v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
3388 0 : v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
3389 0 : v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
3390 0 : v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
3391 0 : v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
3392 0 : v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
3393 0 : v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
3394 0 : v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
3395 :
3396 : // stage 3
3397 0 : u[0] = v[0];
3398 0 : u[1] = v[1];
3399 0 : u[2] = v[2];
3400 0 : u[3] = v[3];
3401 0 : u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
3402 0 : u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
3403 0 : u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
3404 0 : u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
3405 0 : addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
3406 0 : addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
3407 0 : addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
3408 0 : addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
3409 :
3410 : // stage 4
3411 0 : x = _mm_mullo_epi32(u[0], cospi32);
3412 0 : y = _mm_mullo_epi32(u[1], cospi32);
3413 0 : v[0] = _mm_add_epi32(x, y);
3414 0 : v[0] = _mm_add_epi32(v[0], rnding);
3415 0 : v[0] = _mm_srai_epi32(v[0], bit);
3416 :
3417 0 : v[1] = _mm_sub_epi32(x, y);
3418 0 : v[1] = _mm_add_epi32(v[1], rnding);
3419 0 : v[1] = _mm_srai_epi32(v[1], bit);
3420 :
3421 0 : v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
3422 0 : v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
3423 0 : addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
3424 0 : addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
3425 0 : v[8] = u[8];
3426 0 : v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3427 0 : v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3428 0 : v[11] = u[11];
3429 0 : v[12] = u[12];
3430 0 : v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3431 0 : v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3432 0 : v[15] = u[15];
3433 :
3434 : // stage 5
3435 0 : addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
3436 0 : addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
3437 0 : u[4] = v[4];
3438 :
3439 0 : x = _mm_mullo_epi32(v[5], cospi32);
3440 0 : y = _mm_mullo_epi32(v[6], cospi32);
3441 0 : u[5] = _mm_sub_epi32(y, x);
3442 0 : u[5] = _mm_add_epi32(u[5], rnding);
3443 0 : u[5] = _mm_srai_epi32(u[5], bit);
3444 :
3445 0 : u[6] = _mm_add_epi32(y, x);
3446 0 : u[6] = _mm_add_epi32(u[6], rnding);
3447 0 : u[6] = _mm_srai_epi32(u[6], bit);
3448 :
3449 0 : u[7] = v[7];
3450 0 : addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3451 0 : addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3452 0 : addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3453 0 : addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3454 :
3455 : // stage 6
3456 0 : addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
3457 0 : addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
3458 0 : addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
3459 0 : addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
3460 0 : v[8] = u[8];
3461 0 : v[9] = u[9];
3462 :
3463 0 : x = _mm_mullo_epi32(u[10], cospi32);
3464 0 : y = _mm_mullo_epi32(u[13], cospi32);
3465 0 : v[10] = _mm_sub_epi32(y, x);
3466 0 : v[10] = _mm_add_epi32(v[10], rnding);
3467 0 : v[10] = _mm_srai_epi32(v[10], bit);
3468 :
3469 0 : v[13] = _mm_add_epi32(x, y);
3470 0 : v[13] = _mm_add_epi32(v[13], rnding);
3471 0 : v[13] = _mm_srai_epi32(v[13], bit);
3472 :
3473 0 : x = _mm_mullo_epi32(u[11], cospi32);
3474 0 : y = _mm_mullo_epi32(u[12], cospi32);
3475 0 : v[11] = _mm_sub_epi32(y, x);
3476 0 : v[11] = _mm_add_epi32(v[11], rnding);
3477 0 : v[11] = _mm_srai_epi32(v[11], bit);
3478 :
3479 0 : v[12] = _mm_add_epi32(x, y);
3480 0 : v[12] = _mm_add_epi32(v[12], rnding);
3481 0 : v[12] = _mm_srai_epi32(v[12], bit);
3482 :
3483 0 : v[14] = u[14];
3484 0 : v[15] = u[15];
3485 :
3486 : // stage 7
3487 0 : if (do_cols) {
3488 0 : addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
3489 0 : addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
3490 0 : addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
3491 0 : addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
3492 0 : addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
3493 0 : addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
3494 0 : addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
3495 0 : addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
3496 : }
3497 : else {
3498 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
3499 0 : const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
3500 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3501 0 : const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
3502 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3503 :
3504 0 : addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
3505 : &clamp_hi_out, out_shift);
3506 0 : addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
3507 : &clamp_hi_out, out_shift);
3508 0 : addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
3509 : &clamp_hi_out, out_shift);
3510 0 : addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
3511 : &clamp_hi_out, out_shift);
3512 0 : addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
3513 : &clamp_hi_out, out_shift);
3514 0 : addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
3515 : &clamp_hi_out, out_shift);
3516 0 : addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
3517 : &clamp_hi_out, out_shift);
3518 0 : addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
3519 : &clamp_hi_out, out_shift);
3520 : }
3521 : }
3522 0 : }
3523 :
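// Full 16-point inverse ADST. Stage 9 applies the alternating sign pattern
// of the ADST output permutation, either written out directly (column
// pass) or folded into the round-shift via neg_shift_sse4_1 (row pass).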
3524 0 : static void iadst16x16_new_sse4_1(__m128i *in, __m128i *out, int32_t bit, int32_t do_cols,
3525 : int32_t bd, int32_t out_shift) {
3526 0 : const int32_t *cospi = cospi_arr(bit);
3527 0 : const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3528 0 : const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3529 0 : const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
3530 0 : const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
3531 0 : const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
3532 0 : const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
3533 0 : const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
3534 0 : const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
3535 0 : const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
3536 0 : const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
3537 0 : const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
3538 0 : const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
3539 0 : const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
3540 0 : const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
3541 0 : const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
3542 0 : const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3543 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3544 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3545 0 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3546 0 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3547 0 : const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3548 0 : const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3549 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3550 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3551 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3552 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3553 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3554 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3555 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3556 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3557 : __m128i u[16], v[16], x, y;
3558 :
3559 : // Calculate columns 0, 1, 2, 3
3560 : {
3561 : // stage 0
3562 : // stage 1
3563 : // stage 2
3564 0 : v[0] = _mm_mullo_epi32(in[15], cospi2);
3565 0 : x = _mm_mullo_epi32(in[0], cospi62);
3566 0 : v[0] = _mm_add_epi32(v[0], x);
3567 0 : v[0] = _mm_add_epi32(v[0], rnding);
3568 0 : v[0] = _mm_srai_epi32(v[0], bit);
3569 :
3570 0 : v[1] = _mm_mullo_epi32(in[15], cospi62);
3571 0 : x = _mm_mullo_epi32(in[0], cospi2);
3572 0 : v[1] = _mm_sub_epi32(v[1], x);
3573 0 : v[1] = _mm_add_epi32(v[1], rnding);
3574 0 : v[1] = _mm_srai_epi32(v[1], bit);
3575 :
3576 0 : v[2] = _mm_mullo_epi32(in[13], cospi10);
3577 0 : x = _mm_mullo_epi32(in[2], cospi54);
3578 0 : v[2] = _mm_add_epi32(v[2], x);
3579 0 : v[2] = _mm_add_epi32(v[2], rnding);
3580 0 : v[2] = _mm_srai_epi32(v[2], bit);
3581 :
3582 0 : v[3] = _mm_mullo_epi32(in[13], cospi54);
3583 0 : x = _mm_mullo_epi32(in[2], cospi10);
3584 0 : v[3] = _mm_sub_epi32(v[3], x);
3585 0 : v[3] = _mm_add_epi32(v[3], rnding);
3586 0 : v[3] = _mm_srai_epi32(v[3], bit);
3587 :
3588 0 : v[4] = _mm_mullo_epi32(in[11], cospi18);
3589 0 : x = _mm_mullo_epi32(in[4], cospi46);
3590 0 : v[4] = _mm_add_epi32(v[4], x);
3591 0 : v[4] = _mm_add_epi32(v[4], rnding);
3592 0 : v[4] = _mm_srai_epi32(v[4], bit);
3593 :
3594 0 : v[5] = _mm_mullo_epi32(in[11], cospi46);
3595 0 : x = _mm_mullo_epi32(in[4], cospi18);
3596 0 : v[5] = _mm_sub_epi32(v[5], x);
3597 0 : v[5] = _mm_add_epi32(v[5], rnding);
3598 0 : v[5] = _mm_srai_epi32(v[5], bit);
3599 :
3600 0 : v[6] = _mm_mullo_epi32(in[9], cospi26);
3601 0 : x = _mm_mullo_epi32(in[6], cospi38);
3602 0 : v[6] = _mm_add_epi32(v[6], x);
3603 0 : v[6] = _mm_add_epi32(v[6], rnding);
3604 0 : v[6] = _mm_srai_epi32(v[6], bit);
3605 :
3606 0 : v[7] = _mm_mullo_epi32(in[9], cospi38);
3607 0 : x = _mm_mullo_epi32(in[6], cospi26);
3608 0 : v[7] = _mm_sub_epi32(v[7], x);
3609 0 : v[7] = _mm_add_epi32(v[7], rnding);
3610 0 : v[7] = _mm_srai_epi32(v[7], bit);
3611 :
3612 0 : v[8] = _mm_mullo_epi32(in[7], cospi34);
3613 0 : x = _mm_mullo_epi32(in[8], cospi30);
3614 0 : v[8] = _mm_add_epi32(v[8], x);
3615 0 : v[8] = _mm_add_epi32(v[8], rnding);
3616 0 : v[8] = _mm_srai_epi32(v[8], bit);
3617 :
3618 0 : v[9] = _mm_mullo_epi32(in[7], cospi30);
3619 0 : x = _mm_mullo_epi32(in[8], cospi34);
3620 0 : v[9] = _mm_sub_epi32(v[9], x);
3621 0 : v[9] = _mm_add_epi32(v[9], rnding);
3622 0 : v[9] = _mm_srai_epi32(v[9], bit);
3623 :
3624 0 : v[10] = _mm_mullo_epi32(in[5], cospi42);
3625 0 : x = _mm_mullo_epi32(in[10], cospi22);
3626 0 : v[10] = _mm_add_epi32(v[10], x);
3627 0 : v[10] = _mm_add_epi32(v[10], rnding);
3628 0 : v[10] = _mm_srai_epi32(v[10], bit);
3629 :
3630 0 : v[11] = _mm_mullo_epi32(in[5], cospi22);
3631 0 : x = _mm_mullo_epi32(in[10], cospi42);
3632 0 : v[11] = _mm_sub_epi32(v[11], x);
3633 0 : v[11] = _mm_add_epi32(v[11], rnding);
3634 0 : v[11] = _mm_srai_epi32(v[11], bit);
3635 :
3636 0 : v[12] = _mm_mullo_epi32(in[3], cospi50);
3637 0 : x = _mm_mullo_epi32(in[12], cospi14);
3638 0 : v[12] = _mm_add_epi32(v[12], x);
3639 0 : v[12] = _mm_add_epi32(v[12], rnding);
3640 0 : v[12] = _mm_srai_epi32(v[12], bit);
3641 :
3642 0 : v[13] = _mm_mullo_epi32(in[3], cospi14);
3643 0 : x = _mm_mullo_epi32(in[12], cospi50);
3644 0 : v[13] = _mm_sub_epi32(v[13], x);
3645 0 : v[13] = _mm_add_epi32(v[13], rnding);
3646 0 : v[13] = _mm_srai_epi32(v[13], bit);
3647 :
3648 0 : v[14] = _mm_mullo_epi32(in[1], cospi58);
3649 0 : x = _mm_mullo_epi32(in[14], cospi6);
3650 0 : v[14] = _mm_add_epi32(v[14], x);
3651 0 : v[14] = _mm_add_epi32(v[14], rnding);
3652 0 : v[14] = _mm_srai_epi32(v[14], bit);
3653 :
3654 0 : v[15] = _mm_mullo_epi32(in[1], cospi6);
3655 0 : x = _mm_mullo_epi32(in[14], cospi58);
3656 0 : v[15] = _mm_sub_epi32(v[15], x);
3657 0 : v[15] = _mm_add_epi32(v[15], rnding);
3658 0 : v[15] = _mm_srai_epi32(v[15], bit);
3659 :
3660 : // stage 3
3661 0 : addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
3662 0 : addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
3663 0 : addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
3664 0 : addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
3665 0 : addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
3666 0 : addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
3667 0 : addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
3668 0 : addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
3669 :
3670 : // stage 4
3671 0 : v[0] = u[0];
3672 0 : v[1] = u[1];
3673 0 : v[2] = u[2];
3674 0 : v[3] = u[3];
3675 0 : v[4] = u[4];
3676 0 : v[5] = u[5];
3677 0 : v[6] = u[6];
3678 0 : v[7] = u[7];
3679 :
3680 0 : v[8] = _mm_mullo_epi32(u[8], cospi8);
3681 0 : x = _mm_mullo_epi32(u[9], cospi56);
3682 0 : v[8] = _mm_add_epi32(v[8], x);
3683 0 : v[8] = _mm_add_epi32(v[8], rnding);
3684 0 : v[8] = _mm_srai_epi32(v[8], bit);
3685 :
3686 0 : v[9] = _mm_mullo_epi32(u[8], cospi56);
3687 0 : x = _mm_mullo_epi32(u[9], cospi8);
3688 0 : v[9] = _mm_sub_epi32(v[9], x);
3689 0 : v[9] = _mm_add_epi32(v[9], rnding);
3690 0 : v[9] = _mm_srai_epi32(v[9], bit);
3691 :
3692 0 : v[10] = _mm_mullo_epi32(u[10], cospi40);
3693 0 : x = _mm_mullo_epi32(u[11], cospi24);
3694 0 : v[10] = _mm_add_epi32(v[10], x);
3695 0 : v[10] = _mm_add_epi32(v[10], rnding);
3696 0 : v[10] = _mm_srai_epi32(v[10], bit);
3697 :
3698 0 : v[11] = _mm_mullo_epi32(u[10], cospi24);
3699 0 : x = _mm_mullo_epi32(u[11], cospi40);
3700 0 : v[11] = _mm_sub_epi32(v[11], x);
3701 0 : v[11] = _mm_add_epi32(v[11], rnding);
3702 0 : v[11] = _mm_srai_epi32(v[11], bit);
3703 :
3704 0 : v[12] = _mm_mullo_epi32(u[12], cospim56);
3705 0 : x = _mm_mullo_epi32(u[13], cospi8);
3706 0 : v[12] = _mm_add_epi32(v[12], x);
3707 0 : v[12] = _mm_add_epi32(v[12], rnding);
3708 0 : v[12] = _mm_srai_epi32(v[12], bit);
3709 :
3710 0 : v[13] = _mm_mullo_epi32(u[12], cospi8);
3711 0 : x = _mm_mullo_epi32(u[13], cospim56);
3712 0 : v[13] = _mm_sub_epi32(v[13], x);
3713 0 : v[13] = _mm_add_epi32(v[13], rnding);
3714 0 : v[13] = _mm_srai_epi32(v[13], bit);
3715 :
3716 0 : v[14] = _mm_mullo_epi32(u[14], cospim24);
3717 0 : x = _mm_mullo_epi32(u[15], cospi40);
3718 0 : v[14] = _mm_add_epi32(v[14], x);
3719 0 : v[14] = _mm_add_epi32(v[14], rnding);
3720 0 : v[14] = _mm_srai_epi32(v[14], bit);
3721 :
3722 0 : v[15] = _mm_mullo_epi32(u[14], cospi40);
3723 0 : x = _mm_mullo_epi32(u[15], cospim24);
3724 0 : v[15] = _mm_sub_epi32(v[15], x);
3725 0 : v[15] = _mm_add_epi32(v[15], rnding);
3726 0 : v[15] = _mm_srai_epi32(v[15], bit);
3727 :
3728 : // stage 5
3729 0 : addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
3730 0 : addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
3731 0 : addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
3732 0 : addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
3733 0 : addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
3734 0 : addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
3735 0 : addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
3736 0 : addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
3737 :
3738 : // stage 6
3739 0 : v[0] = u[0];
3740 0 : v[1] = u[1];
3741 0 : v[2] = u[2];
3742 0 : v[3] = u[3];
3743 :
3744 0 : v[4] = _mm_mullo_epi32(u[4], cospi16);
3745 0 : x = _mm_mullo_epi32(u[5], cospi48);
3746 0 : v[4] = _mm_add_epi32(v[4], x);
3747 0 : v[4] = _mm_add_epi32(v[4], rnding);
3748 0 : v[4] = _mm_srai_epi32(v[4], bit);
3749 :
3750 0 : v[5] = _mm_mullo_epi32(u[4], cospi48);
3751 0 : x = _mm_mullo_epi32(u[5], cospi16);
3752 0 : v[5] = _mm_sub_epi32(v[5], x);
3753 0 : v[5] = _mm_add_epi32(v[5], rnding);
3754 0 : v[5] = _mm_srai_epi32(v[5], bit);
3755 :
3756 0 : v[6] = _mm_mullo_epi32(u[6], cospim48);
3757 0 : x = _mm_mullo_epi32(u[7], cospi16);
3758 0 : v[6] = _mm_add_epi32(v[6], x);
3759 0 : v[6] = _mm_add_epi32(v[6], rnding);
3760 0 : v[6] = _mm_srai_epi32(v[6], bit);
3761 :
3762 0 : v[7] = _mm_mullo_epi32(u[6], cospi16);
3763 0 : x = _mm_mullo_epi32(u[7], cospim48);
3764 0 : v[7] = _mm_sub_epi32(v[7], x);
3765 0 : v[7] = _mm_add_epi32(v[7], rnding);
3766 0 : v[7] = _mm_srai_epi32(v[7], bit);
3767 :
3768 0 : v[8] = u[8];
3769 0 : v[9] = u[9];
3770 0 : v[10] = u[10];
3771 0 : v[11] = u[11];
3772 :
3773 0 : v[12] = _mm_mullo_epi32(u[12], cospi16);
3774 0 : x = _mm_mullo_epi32(u[13], cospi48);
3775 0 : v[12] = _mm_add_epi32(v[12], x);
3776 0 : v[12] = _mm_add_epi32(v[12], rnding);
3777 0 : v[12] = _mm_srai_epi32(v[12], bit);
3778 :
3779 0 : v[13] = _mm_mullo_epi32(u[12], cospi48);
3780 0 : x = _mm_mullo_epi32(u[13], cospi16);
3781 0 : v[13] = _mm_sub_epi32(v[13], x);
3782 0 : v[13] = _mm_add_epi32(v[13], rnding);
3783 0 : v[13] = _mm_srai_epi32(v[13], bit);
3784 :
3785 0 : v[14] = _mm_mullo_epi32(u[14], cospim48);
3786 0 : x = _mm_mullo_epi32(u[15], cospi16);
3787 0 : v[14] = _mm_add_epi32(v[14], x);
3788 0 : v[14] = _mm_add_epi32(v[14], rnding);
3789 0 : v[14] = _mm_srai_epi32(v[14], bit);
3790 :
3791 0 : v[15] = _mm_mullo_epi32(u[14], cospi16);
3792 0 : x = _mm_mullo_epi32(u[15], cospim48);
3793 0 : v[15] = _mm_sub_epi32(v[15], x);
3794 0 : v[15] = _mm_add_epi32(v[15], rnding);
3795 0 : v[15] = _mm_srai_epi32(v[15], bit);
3796 :
3797 : // stage 7
3798 0 : addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
3799 0 : addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
3800 0 : addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
3801 0 : addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
3802 0 : addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
3803 0 : addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
3804 0 : addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
3805 0 : addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
3806 :
3807 : // stage 8
3808 0 : v[0] = u[0];
3809 0 : v[1] = u[1];
3810 :
3811 0 : y = _mm_mullo_epi32(u[2], cospi32);
3812 0 : x = _mm_mullo_epi32(u[3], cospi32);
3813 0 : v[2] = _mm_add_epi32(y, x);
3814 0 : v[2] = _mm_add_epi32(v[2], rnding);
3815 0 : v[2] = _mm_srai_epi32(v[2], bit);
3816 :
3817 0 : v[3] = _mm_sub_epi32(y, x);
3818 0 : v[3] = _mm_add_epi32(v[3], rnding);
3819 0 : v[3] = _mm_srai_epi32(v[3], bit);
3820 :
3821 0 : v[4] = u[4];
3822 0 : v[5] = u[5];
3823 :
3824 0 : y = _mm_mullo_epi32(u[6], cospi32);
3825 0 : x = _mm_mullo_epi32(u[7], cospi32);
3826 0 : v[6] = _mm_add_epi32(y, x);
3827 0 : v[6] = _mm_add_epi32(v[6], rnding);
3828 0 : v[6] = _mm_srai_epi32(v[6], bit);
3829 :
3830 0 : v[7] = _mm_sub_epi32(y, x);
3831 0 : v[7] = _mm_add_epi32(v[7], rnding);
3832 0 : v[7] = _mm_srai_epi32(v[7], bit);
3833 :
3834 0 : v[8] = u[8];
3835 0 : v[9] = u[9];
3836 :
3837 0 : y = _mm_mullo_epi32(u[10], cospi32);
3838 0 : x = _mm_mullo_epi32(u[11], cospi32);
3839 0 : v[10] = _mm_add_epi32(y, x);
3840 0 : v[10] = _mm_add_epi32(v[10], rnding);
3841 0 : v[10] = _mm_srai_epi32(v[10], bit);
3842 :
3843 0 : v[11] = _mm_sub_epi32(y, x);
3844 0 : v[11] = _mm_add_epi32(v[11], rnding);
3845 0 : v[11] = _mm_srai_epi32(v[11], bit);
3846 :
3847 0 : v[12] = u[12];
3848 0 : v[13] = u[13];
3849 :
3850 0 : y = _mm_mullo_epi32(u[14], cospi32);
3851 0 : x = _mm_mullo_epi32(u[15], cospi32);
3852 0 : v[14] = _mm_add_epi32(y, x);
3853 0 : v[14] = _mm_add_epi32(v[14], rnding);
3854 0 : v[14] = _mm_srai_epi32(v[14], bit);
3855 :
3856 0 : v[15] = _mm_sub_epi32(y, x);
3857 0 : v[15] = _mm_add_epi32(v[15], rnding);
3858 0 : v[15] = _mm_srai_epi32(v[15], bit);
3859 :
3860 : // stage 9
3861 0 : if (do_cols) {
3862 0 : out[0] = v[0];
3863 0 : out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
3864 0 : out[2] = v[12];
3865 0 : out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
3866 0 : out[4] = v[6];
3867 0 : out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
3868 0 : out[6] = v[10];
3869 0 : out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
3870 0 : out[8] = v[3];
3871 0 : out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
3872 0 : out[10] = v[15];
3873 0 : out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
3874 0 : out[12] = v[5];
3875 0 : out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
3876 0 : out[14] = v[9];
3877 0 : out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
3878 : }
3879 : else {
3880 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
3881 0 : const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3882 0 : const __m128i clamp_hi_out =
3883 0 : _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3884 :
3885 0 : neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
3886 : &clamp_hi_out, out_shift);
3887 0 : neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
3888 : &clamp_hi_out, out_shift);
3889 0 : neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
3890 : &clamp_hi_out, out_shift);
3891 0 : neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
3892 : &clamp_hi_out, out_shift);
3893 0 : neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
3894 : &clamp_hi_out, out_shift);
3895 0 : neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
3896 : &clamp_hi_out, out_shift);
3897 0 : neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
3898 : &clamp_hi_out, out_shift);
3899 0 : neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
3900 : &clamp_hi_out, out_shift);
3901 : }
3902 : }
3903 0 : }
3904 :
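// 16-point inverse ADST when only in[0] is nonzero: zero-input butterflies
// reduce the add/sub stages to copies, so only the stage-2 scale, the
// rotations in stages 4 and 6, and the stage-8 cospi[32] butterflies
// remain before the output permutation.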
3905 0 : static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int32_t bit,
3906 : int32_t do_cols, int32_t bd, int32_t out_shift) {
3907 0 : const int32_t *cospi = cospi_arr(bit);
3908 0 : const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3909 0 : const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3910 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3911 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3912 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3913 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3914 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3915 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3916 0 : const __m128i zero = _mm_setzero_si128();
3917 : __m128i v[16], x, y, temp1, temp2;
3918 :
3919 : // Calculate columns 0, 1, 2, 3
3920 : {
3921 : // stage 0
3922 : // stage 1
3923 : // stage 2
3924 0 : x = _mm_mullo_epi32(in[0], cospi62);
3925 0 : v[0] = _mm_add_epi32(x, rnding);
3926 0 : v[0] = _mm_srai_epi32(v[0], bit);
3927 :
3928 0 : x = _mm_mullo_epi32(in[0], cospi2);
3929 0 : v[1] = _mm_sub_epi32(zero, x);
3930 0 : v[1] = _mm_add_epi32(v[1], rnding);
3931 0 : v[1] = _mm_srai_epi32(v[1], bit);
3932 :
3933 : // stage 3
3934 0 : v[8] = v[0];
3935 0 : v[9] = v[1];
3936 :
3937 : // stage 4
3938 0 : temp1 = _mm_mullo_epi32(v[8], cospi8);
3939 0 : x = _mm_mullo_epi32(v[9], cospi56);
3940 0 : temp1 = _mm_add_epi32(temp1, x);
3941 0 : temp1 = _mm_add_epi32(temp1, rnding);
3942 0 : temp1 = _mm_srai_epi32(temp1, bit);
3943 :
3944 0 : temp2 = _mm_mullo_epi32(v[8], cospi56);
3945 0 : x = _mm_mullo_epi32(v[9], cospi8);
3946 0 : temp2 = _mm_sub_epi32(temp2, x);
3947 0 : temp2 = _mm_add_epi32(temp2, rnding);
3948 0 : temp2 = _mm_srai_epi32(temp2, bit);
3949 0 : v[8] = temp1;
3950 0 : v[9] = temp2;
3951 :
3952 : // stage 5
3953 0 : v[4] = v[0];
3954 0 : v[5] = v[1];
3955 0 : v[12] = v[8];
3956 0 : v[13] = v[9];
3957 :
3958 : // stage 6
3959 0 : temp1 = _mm_mullo_epi32(v[4], cospi16);
3960 0 : x = _mm_mullo_epi32(v[5], cospi48);
3961 0 : temp1 = _mm_add_epi32(temp1, x);
3962 0 : temp1 = _mm_add_epi32(temp1, rnding);
3963 0 : temp1 = _mm_srai_epi32(temp1, bit);
3964 :
3965 0 : temp2 = _mm_mullo_epi32(v[4], cospi48);
3966 0 : x = _mm_mullo_epi32(v[5], cospi16);
3967 0 : temp2 = _mm_sub_epi32(temp2, x);
3968 0 : temp2 = _mm_add_epi32(temp2, rnding);
3969 0 : temp2 = _mm_srai_epi32(temp2, bit);
3970 0 : v[4] = temp1;
3971 0 : v[5] = temp2;
3972 :
3973 0 : temp1 = _mm_mullo_epi32(v[12], cospi16);
3974 0 : x = _mm_mullo_epi32(v[13], cospi48);
3975 0 : temp1 = _mm_add_epi32(temp1, x);
3976 0 : temp1 = _mm_add_epi32(temp1, rnding);
3977 0 : temp1 = _mm_srai_epi32(temp1, bit);
3978 :
3979 0 : temp2 = _mm_mullo_epi32(v[12], cospi48);
3980 0 : x = _mm_mullo_epi32(v[13], cospi16);
3981 0 : temp2 = _mm_sub_epi32(temp2, x);
3982 0 : temp2 = _mm_add_epi32(temp2, rnding);
3983 0 : temp2 = _mm_srai_epi32(temp2, bit);
3984 0 : v[12] = temp1;
3985 0 : v[13] = temp2;
3986 :
3987 : // stage 7
3988 0 : v[2] = v[0];
3989 0 : v[3] = v[1];
3990 0 : v[6] = v[4];
3991 0 : v[7] = v[5];
3992 0 : v[10] = v[8];
3993 0 : v[11] = v[9];
3994 0 : v[14] = v[12];
3995 0 : v[15] = v[13];
3996 :
3997 : // stage 8
3998 0 : y = _mm_mullo_epi32(v[2], cospi32);
3999 0 : x = _mm_mullo_epi32(v[3], cospi32);
4000 0 : v[2] = _mm_add_epi32(y, x);
4001 0 : v[2] = _mm_add_epi32(v[2], rnding);
4002 0 : v[2] = _mm_srai_epi32(v[2], bit);
4003 :
4004 0 : v[3] = _mm_sub_epi32(y, x);
4005 0 : v[3] = _mm_add_epi32(v[3], rnding);
4006 0 : v[3] = _mm_srai_epi32(v[3], bit);
4007 :
4008 0 : y = _mm_mullo_epi32(v[6], cospi32);
4009 0 : x = _mm_mullo_epi32(v[7], cospi32);
4010 0 : v[6] = _mm_add_epi32(y, x);
4011 0 : v[6] = _mm_add_epi32(v[6], rnding);
4012 0 : v[6] = _mm_srai_epi32(v[6], bit);
4013 :
4014 0 : v[7] = _mm_sub_epi32(y, x);
4015 0 : v[7] = _mm_add_epi32(v[7], rnding);
4016 0 : v[7] = _mm_srai_epi32(v[7], bit);
4017 :
4018 0 : y = _mm_mullo_epi32(v[10], cospi32);
4019 0 : x = _mm_mullo_epi32(v[11], cospi32);
4020 0 : v[10] = _mm_add_epi32(y, x);
4021 0 : v[10] = _mm_add_epi32(v[10], rnding);
4022 0 : v[10] = _mm_srai_epi32(v[10], bit);
4023 :
4024 0 : v[11] = _mm_sub_epi32(y, x);
4025 0 : v[11] = _mm_add_epi32(v[11], rnding);
4026 0 : v[11] = _mm_srai_epi32(v[11], bit);
4027 :
4028 0 : y = _mm_mullo_epi32(v[14], cospi32);
4029 0 : x = _mm_mullo_epi32(v[15], cospi32);
4030 0 : v[14] = _mm_add_epi32(y, x);
4031 0 : v[14] = _mm_add_epi32(v[14], rnding);
4032 0 : v[14] = _mm_srai_epi32(v[14], bit);
4033 :
4034 0 : v[15] = _mm_sub_epi32(y, x);
4035 0 : v[15] = _mm_add_epi32(v[15], rnding);
4036 0 : v[15] = _mm_srai_epi32(v[15], bit);
4037 :
4038 : // stage 9
4039 0 : if (do_cols) {
4040 0 : out[0] = v[0];
4041 0 : out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
4042 0 : out[2] = v[12];
4043 0 : out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
4044 0 : out[4] = v[6];
4045 0 : out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
4046 0 : out[6] = v[10];
4047 0 : out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
4048 0 : out[8] = v[3];
4049 0 : out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
4050 0 : out[10] = v[15];
4051 0 : out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
4052 0 : out[12] = v[5];
4053 0 : out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
4054 0 : out[14] = v[9];
4055 0 : out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
4056 : }
4057 : else {
4058 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
4059 0 : const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
4060 0 : const __m128i clamp_hi_out =
4061 0 : _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
4062 :
4063 0 : neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
4064 : &clamp_hi_out, out_shift);
4065 0 : neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
4066 : &clamp_hi_out, out_shift);
4067 0 : neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
4068 : &clamp_hi_out, out_shift);
4069 0 : neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
4070 : &clamp_hi_out, out_shift);
4071 0 : neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
4072 : &clamp_hi_out, out_shift);
4073 0 : neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
4074 : &clamp_hi_out, out_shift);
4075 0 : neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
4076 : &clamp_hi_out, out_shift);
4077 0 : neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
4078 : &clamp_hi_out, out_shift);
4079 : }
4080 : }
4081 0 : }
4082 :
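// 16-point inverse ADST when only the first eight coefficients are
// nonzero: the stage-2 products with zero inputs are dropped, so u[8..15]
// start as single multiplies instead of full butterflies.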
4083 0 : static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int32_t bit,
4084 : int32_t do_cols, int32_t bd, int32_t out_shift) {
4085 0 : const int32_t *cospi = cospi_arr(bit);
4086 0 : const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4087 0 : const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4088 0 : const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4089 0 : const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4090 0 : const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
4091 0 : const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
4092 0 : const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
4093 0 : const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
4094 0 : const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
4095 0 : const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
4096 0 : const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
4097 0 : const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
4098 0 : const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
4099 0 : const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4100 0 : const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
4101 0 : const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4102 0 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4103 0 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4104 0 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4105 0 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4106 0 : const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4107 0 : const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4108 0 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4109 0 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4110 0 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4111 0 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4112 0 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
4113 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4114 0 : const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4115 0 : const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4116 : __m128i u[16], x, y;
4117 :
4118 : // Calculate columns 0, 1, 2, 3
4119 : {
4120 : // stage 0
4121 : // stage 1
4122 : // stage 2
4123 0 : __m128i zero = _mm_setzero_si128();
4124 0 : x = _mm_mullo_epi32(in[0], cospi62);
4125 0 : u[0] = _mm_add_epi32(x, rnding);
4126 0 : u[0] = _mm_srai_epi32(u[0], bit);
4127 :
4128 0 : x = _mm_mullo_epi32(in[0], cospi2);
4129 0 : u[1] = _mm_sub_epi32(zero, x);
4130 0 : u[1] = _mm_add_epi32(u[1], rnding);
4131 0 : u[1] = _mm_srai_epi32(u[1], bit);
4132 :
4133 0 : x = _mm_mullo_epi32(in[2], cospi54);
4134 0 : u[2] = _mm_add_epi32(x, rnding);
4135 0 : u[2] = _mm_srai_epi32(u[2], bit);
4136 :
4137 0 : x = _mm_mullo_epi32(in[2], cospi10);
4138 0 : u[3] = _mm_sub_epi32(zero, x);
4139 0 : u[3] = _mm_add_epi32(u[3], rnding);
4140 0 : u[3] = _mm_srai_epi32(u[3], bit);
4141 :
4142 0 : x = _mm_mullo_epi32(in[4], cospi46);
4143 0 : u[4] = _mm_add_epi32(x, rnding);
4144 0 : u[4] = _mm_srai_epi32(u[4], bit);
4145 :
4146 0 : x = _mm_mullo_epi32(in[4], cospi18);
4147 0 : u[5] = _mm_sub_epi32(zero, x);
4148 0 : u[5] = _mm_add_epi32(u[5], rnding);
4149 0 : u[5] = _mm_srai_epi32(u[5], bit);
4150 :
4151 0 : x = _mm_mullo_epi32(in[6], cospi38);
4152 0 : u[6] = _mm_add_epi32(x, rnding);
4153 0 : u[6] = _mm_srai_epi32(u[6], bit);
4154 :
4155 0 : x = _mm_mullo_epi32(in[6], cospi26);
4156 0 : u[7] = _mm_sub_epi32(zero, x);
4157 0 : u[7] = _mm_add_epi32(u[7], rnding);
4158 0 : u[7] = _mm_srai_epi32(u[7], bit);
4159 :
4160 0 : u[8] = _mm_mullo_epi32(in[7], cospi34);
4161 0 : u[8] = _mm_add_epi32(u[8], rnding);
4162 0 : u[8] = _mm_srai_epi32(u[8], bit);
4163 :
4164 0 : u[9] = _mm_mullo_epi32(in[7], cospi30);
4165 0 : u[9] = _mm_add_epi32(u[9], rnding);
4166 0 : u[9] = _mm_srai_epi32(u[9], bit);
4167 :
4168 0 : u[10] = _mm_mullo_epi32(in[5], cospi42);
4169 0 : u[10] = _mm_add_epi32(u[10], rnding);
4170 0 : u[10] = _mm_srai_epi32(u[10], bit);
4171 :
4172 0 : u[11] = _mm_mullo_epi32(in[5], cospi22);
4173 0 : u[11] = _mm_add_epi32(u[11], rnding);
4174 0 : u[11] = _mm_srai_epi32(u[11], bit);
4175 :
4176 0 : u[12] = _mm_mullo_epi32(in[3], cospi50);
4177 0 : u[12] = _mm_add_epi32(u[12], rnding);
4178 0 : u[12] = _mm_srai_epi32(u[12], bit);
4179 :
4180 0 : u[13] = _mm_mullo_epi32(in[3], cospi14);
4181 0 : u[13] = _mm_add_epi32(u[13], rnding);
4182 0 : u[13] = _mm_srai_epi32(u[13], bit);
4183 :
4184 0 : u[14] = _mm_mullo_epi32(in[1], cospi58);
4185 0 : u[14] = _mm_add_epi32(u[14], rnding);
4186 0 : u[14] = _mm_srai_epi32(u[14], bit);
4187 :
4188 0 : u[15] = _mm_mullo_epi32(in[1], cospi6);
4189 0 : u[15] = _mm_add_epi32(u[15], rnding);
4190 0 : u[15] = _mm_srai_epi32(u[15], bit);
4191 :
4192 : // stage 3
4193 0 : addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
4194 0 : addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
4195 0 : addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
4196 0 : addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
4197 0 : addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
4198 0 : addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
4199 0 : addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
4200 0 : addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
4201 :
4202 : // stage 4
4203 0 : y = _mm_mullo_epi32(u[8], cospi56);
4204 0 : x = _mm_mullo_epi32(u[9], cospi56);
4205 0 : u[8] = _mm_mullo_epi32(u[8], cospi8);
4206 0 : u[8] = _mm_add_epi32(u[8], x);
4207 0 : u[8] = _mm_add_epi32(u[8], rnding);
4208 0 : u[8] = _mm_srai_epi32(u[8], bit);
4209 :
4210 0 : x = _mm_mullo_epi32(u[9], cospi8);
4211 0 : u[9] = _mm_sub_epi32(y, x);
4212 0 : u[9] = _mm_add_epi32(u[9], rnding);
4213 0 : u[9] = _mm_srai_epi32(u[9], bit);
4214 :
4215 0 : x = _mm_mullo_epi32(u[11], cospi24);
4216 0 : y = _mm_mullo_epi32(u[10], cospi24);
4217 0 : u[10] = _mm_mullo_epi32(u[10], cospi40);
4218 0 : u[10] = _mm_add_epi32(u[10], x);
4219 0 : u[10] = _mm_add_epi32(u[10], rnding);
4220 0 : u[10] = _mm_srai_epi32(u[10], bit);
4221 :
4222 0 : x = _mm_mullo_epi32(u[11], cospi40);
4223 0 : u[11] = _mm_sub_epi32(y, x);
4224 0 : u[11] = _mm_add_epi32(u[11], rnding);
4225 0 : u[11] = _mm_srai_epi32(u[11], bit);
4226 :
4227 0 : x = _mm_mullo_epi32(u[13], cospi8);
4228 0 : y = _mm_mullo_epi32(u[12], cospi8);
4229 0 : u[12] = _mm_mullo_epi32(u[12], cospim56);
4230 0 : u[12] = _mm_add_epi32(u[12], x);
4231 0 : u[12] = _mm_add_epi32(u[12], rnding);
4232 0 : u[12] = _mm_srai_epi32(u[12], bit);
4233 :
4234 0 : x = _mm_mullo_epi32(u[13], cospim56);
4235 0 : u[13] = _mm_sub_epi32(y, x);
4236 0 : u[13] = _mm_add_epi32(u[13], rnding);
4237 0 : u[13] = _mm_srai_epi32(u[13], bit);
4238 :
4239 0 : x = _mm_mullo_epi32(u[15], cospi40);
4240 0 : y = _mm_mullo_epi32(u[14], cospi40);
4241 0 : u[14] = _mm_mullo_epi32(u[14], cospim24);
4242 0 : u[14] = _mm_add_epi32(u[14], x);
4243 0 : u[14] = _mm_add_epi32(u[14], rnding);
4244 0 : u[14] = _mm_srai_epi32(u[14], bit);
4245 :
4246 0 : x = _mm_mullo_epi32(u[15], cospim24);
4247 0 : u[15] = _mm_sub_epi32(y, x);
4248 0 : u[15] = _mm_add_epi32(u[15], rnding);
4249 0 : u[15] = _mm_srai_epi32(u[15], bit);
4250 :
4251 : // stage 5
4252 0 : addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
4253 0 : addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
4254 0 : addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
4255 0 : addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
4256 0 : addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
4257 0 : addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
4258 0 : addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
4259 0 : addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
4260 :
4261 : // stage 6
4262 0 : x = _mm_mullo_epi32(u[5], cospi48);
4263 0 : y = _mm_mullo_epi32(u[4], cospi48);
4264 0 : u[4] = _mm_mullo_epi32(u[4], cospi16);
4265 0 : u[4] = _mm_add_epi32(u[4], x);
4266 0 : u[4] = _mm_add_epi32(u[4], rnding);
4267 0 : u[4] = _mm_srai_epi32(u[4], bit);
4268 :
4269 0 : x = _mm_mullo_epi32(u[5], cospi16);
4270 0 : u[5] = _mm_sub_epi32(y, x);
4271 0 : u[5] = _mm_add_epi32(u[5], rnding);
4272 0 : u[5] = _mm_srai_epi32(u[5], bit);
4273 :
4274 0 : x = _mm_mullo_epi32(u[7], cospi16);
4275 0 : y = _mm_mullo_epi32(u[6], cospi16);
4276 0 : u[6] = _mm_mullo_epi32(u[6], cospim48);
4277 0 : u[6] = _mm_add_epi32(u[6], x);
4278 0 : u[6] = _mm_add_epi32(u[6], rnding);
4279 0 : u[6] = _mm_srai_epi32(u[6], bit);
4280 :
4281 0 : x = _mm_mullo_epi32(u[7], cospim48);
4282 0 : u[7] = _mm_sub_epi32(y, x);
4283 0 : u[7] = _mm_add_epi32(u[7], rnding);
4284 0 : u[7] = _mm_srai_epi32(u[7], bit);
4285 :
4286 0 : x = _mm_mullo_epi32(u[13], cospi48);
4287 0 : y = _mm_mullo_epi32(u[12], cospi48);
4288 0 : u[12] = _mm_mullo_epi32(u[12], cospi16);
4289 0 : u[12] = _mm_add_epi32(u[12], x);
4290 0 : u[12] = _mm_add_epi32(u[12], rnding);
4291 0 : u[12] = _mm_srai_epi32(u[12], bit);
4292 :
4293 0 : x = _mm_mullo_epi32(u[13], cospi16);
4294 0 : u[13] = _mm_sub_epi32(y, x);
4295 0 : u[13] = _mm_add_epi32(u[13], rnding);
4296 0 : u[13] = _mm_srai_epi32(u[13], bit);
4297 :
4298 0 : x = _mm_mullo_epi32(u[15], cospi16);
4299 0 : y = _mm_mullo_epi32(u[14], cospi16);
4300 0 : u[14] = _mm_mullo_epi32(u[14], cospim48);
4301 0 : u[14] = _mm_add_epi32(u[14], x);
4302 0 : u[14] = _mm_add_epi32(u[14], rnding);
4303 0 : u[14] = _mm_srai_epi32(u[14], bit);
4304 :
4305 0 : x = _mm_mullo_epi32(u[15], cospim48);
4306 0 : u[15] = _mm_sub_epi32(y, x);
4307 0 : u[15] = _mm_add_epi32(u[15], rnding);
4308 0 : u[15] = _mm_srai_epi32(u[15], bit);
4309 :
4310 : // stage 7
4311 0 : addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
4312 0 : addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
4313 0 : addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
4314 0 : addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
4315 0 : addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
4316 0 : addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
4317 0 : addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
4318 0 : addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
4319 :
4320 : // stage 8
4321 0 : y = _mm_mullo_epi32(u[2], cospi32);
4322 0 : x = _mm_mullo_epi32(u[3], cospi32);
4323 0 : u[2] = _mm_add_epi32(y, x);
4324 0 : u[2] = _mm_add_epi32(u[2], rnding);
4325 0 : u[2] = _mm_srai_epi32(u[2], bit);
4326 :
4327 0 : u[3] = _mm_sub_epi32(y, x);
4328 0 : u[3] = _mm_add_epi32(u[3], rnding);
4329 0 : u[3] = _mm_srai_epi32(u[3], bit);
4330 0 : y = _mm_mullo_epi32(u[6], cospi32);
4331 0 : x = _mm_mullo_epi32(u[7], cospi32);
4332 0 : u[6] = _mm_add_epi32(y, x);
4333 0 : u[6] = _mm_add_epi32(u[6], rnding);
4334 0 : u[6] = _mm_srai_epi32(u[6], bit);
4335 :
4336 0 : u[7] = _mm_sub_epi32(y, x);
4337 0 : u[7] = _mm_add_epi32(u[7], rnding);
4338 0 : u[7] = _mm_srai_epi32(u[7], bit);
4339 :
4340 0 : y = _mm_mullo_epi32(u[10], cospi32);
4341 0 : x = _mm_mullo_epi32(u[11], cospi32);
4342 0 : u[10] = _mm_add_epi32(y, x);
4343 0 : u[10] = _mm_add_epi32(u[10], rnding);
4344 0 : u[10] = _mm_srai_epi32(u[10], bit);
4345 :
4346 0 : u[11] = _mm_sub_epi32(y, x);
4347 0 : u[11] = _mm_add_epi32(u[11], rnding);
4348 0 : u[11] = _mm_srai_epi32(u[11], bit);
4349 :
4350 0 : y = _mm_mullo_epi32(u[14], cospi32);
4351 0 : x = _mm_mullo_epi32(u[15], cospi32);
4352 0 : u[14] = _mm_add_epi32(y, x);
4353 0 : u[14] = _mm_add_epi32(u[14], rnding);
4354 0 : u[14] = _mm_srai_epi32(u[14], bit);
4355 :
4356 0 : u[15] = _mm_sub_epi32(y, x);
4357 0 : u[15] = _mm_add_epi32(u[15], rnding);
4358 0 : u[15] = _mm_srai_epi32(u[15], bit);
4359 :
4360 : // stage 9
4361 0 : if (do_cols) {
4362 0 : out[0] = u[0];
4363 0 : out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
4364 0 : out[2] = u[12];
4365 0 : out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
4366 0 : out[4] = u[6];
4367 0 : out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
4368 0 : out[6] = u[10];
4369 0 : out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
4370 0 : out[8] = u[3];
4371 0 : out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
4372 0 : out[10] = u[15];
4373 0 : out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
4374 0 : out[12] = u[5];
4375 0 : out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
4376 0 : out[14] = u[9];
4377 0 : out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
4378 : }
4379 : else {
4380 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
4381 0 : const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
4382 0 : const __m128i clamp_hi_out =
4383 0 : _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
4384 :
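        // neg_shift_sse4_1 (defined earlier in this file) applies the
        // rounding shift to its first argument and to the negation of its
        // second, clamping both; this realizes the alternating signs of the
        // final ADST outputs.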
4385 0 : neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
4386 : &clamp_hi_out, out_shift);
4387 0 : neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
4388 : &clamp_hi_out, out_shift);
4389 0 : neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
4390 : &clamp_hi_out, out_shift);
4391 0 : neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
4392 : &clamp_hi_out, out_shift);
4393 0 : neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
4394 : &clamp_hi_out, out_shift);
4395 0 : neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
4396 : &clamp_hi_out, out_shift);
4397 0 : neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
4398 : &clamp_hi_out, out_shift);
4399 0 : neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
4400 : &clamp_hi_out, out_shift);
4401 : }
4402 : }
4403 0 : }
4404 :
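// Add the 4-wide residual rows in `in` to the prediction read from output_r
// and store the bd-clamped result to output_w; flipud walks the residual
// rows bottom-up.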
4405 0 : static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in,
4406 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
4407 : int32_t flipud, int32_t height, const int32_t bd) {
4408 0 : int32_t j = flipud ? (height - 1) : 0;
4409 0 : const int32_t step = flipud ? -1 : 1;
4410 0 : for (int32_t i = 0; i < height; ++i, j += step) {
4411 0 : __m128i v = _mm_loadl_epi64((__m128i const *)(output_r + i * stride_r));
4412 0 : __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);
4413 :
4414 0 : _mm_storel_epi64((__m128i *)(output_w + i * stride_w), u);
4415 : }
4416 0 : }
4417 :
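// 1-D inverse transform dispatch table, indexed as
// [tx_size][1-D transform type][variant]. The variant index selects a
// cheaper partial-input kernel when only the first few coefficients can be
// nonzero (e.g. TX_16X16: [0] = low1, [1] = low8, [2] = full).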
4418 : static const transform_1d_sse4_1
4419 : highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
4420 : {
4421 : { idct4x4_sse4_1, NULL, NULL, NULL },
4422 : { iadst4x4_sse4_1, NULL, NULL, NULL },
4423 : { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
4424 : },
4425 : { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
4426 : { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
4427 : { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
4428 : {
4429 : { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_new_sse4_1,
4430 : NULL },
4431 : { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_new_sse4_1,
4432 : NULL },
4433 : { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
4434 : },
4435 :       { { NULL, NULL, NULL, NULL },
4436 :         { NULL, NULL, NULL, NULL },
4437 :         { NULL, NULL, NULL, NULL } },
4438 :       { { NULL, NULL, NULL, NULL },
4439 :         { NULL, NULL, NULL, NULL },
4440 :         { NULL, NULL, NULL, NULL } }
4443 : };
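// The two all-NULL rows are the TX_32 and TX_64 sizes: this file provides
// no 32- or 64-point 1-D kernels, so those block sizes are presumably
// covered by other code paths.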
4444 :
4445 0 : void eb_av1_inv_txfm2d_add_4x8_sse4_1(const int32_t *input,
4446 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
4447 : TxType tx_type, TxSize tx_size, int32_t bd) {
4448 : __m128i buf1[8];
4449 0 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
4450 0 : const int32_t txw_idx = get_txw_idx(tx_size);
4451 0 : const int32_t txh_idx = get_txh_idx(tx_size);
4452 0 : const int32_t txfm_size_col = tx_size_wide[tx_size];
4453 0 : const int32_t txfm_size_row = tx_size_high[tx_size];
4454 0 : const transform_1d_sse4_1 row_txfm =
4455 0 : highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
4456 0 : const transform_1d_sse4_1 col_txfm =
4457 0 : highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
4458 0 : const int32_t input_stride = AOMMIN(32, txfm_size_col);
4459 :
4460 0 : assert(col_txfm != NULL);
4461 0 : assert(row_txfm != NULL);
4462 : int32_t ud_flip, lr_flip;
4463 0 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4464 :
4465 :     // 1st stage: row transform
4466 : __m128i buf0[8];
4467 0 : const int32_t *input_row = input;
4468 0 : __m128i *buf0_cur = buf0;
4469 0 : load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
4470 0 : av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
4471 : NewInvSqrt2);
4472 0 : row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
4473 0 : row_txfm(buf0 + 4, buf0 + 4, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
4474 0 : -shift[0]);
4475 :
4476 0 : if (lr_flip) {
4477 0 : TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
4478 : buf1[3]);
4479 :
4480 0 : TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
4481 : buf1[7]);
4482 : }
4483 : else {
4484 0 : TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
4485 : buf1[3]);
4486 :
4487 0 : TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
4488 : buf1[7]);
4489 : }
4490 :
4491 : // 2nd stage: column transform
4492 0 : col_txfm(buf1, buf1, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
4493 :
4494 0 : av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
4495 :
4496 : // write to buffer
4497 0 : highbd_write_buffer_4xn_sse4_1(buf1, output_r, stride_r, output_w, stride_w,
4498 : ud_flip, txfm_size_row, bd);
4499 0 : }
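// Usage sketch (illustrative only: buffer names, sizes and the 10-bit depth
// are assumptions, not part of this file). The call inverts a 4x8 DCT_DCT
// block and adds the residual to the prediction already stored in `recon`:
//
//   int32_t coeffs[4 * 8] = { 0 };  // dequantized transform coefficients
//   uint16_t recon[8 * 64];         // 10-bit reconstruction, stride 64
//   eb_av1_inv_txfm2d_add_4x8_sse4_1(coeffs, recon, 64, recon, 64,
//                                    DCT_DCT, TX_4X8, 10);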
4500 :
4501 : //8x4
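// Reverse the order of `size` __m128i rows; used below to realize lr_flip
// after the row transform.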
4502 0 : static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int32_t size) {
4503 0 : for (int32_t i = 0; i < size; ++i)
4504 0 : out[size - i - 1] = in[i];
4505 0 : }
4506 :
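// Widen eight 16-bit prediction pixels to 32 bits, add the residuals, pack
// back with unsigned saturation and clamp to [0, (1 << bd) - 1].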
4507 0 : static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
4508 : __m128i res0, __m128i res1,
4509 : const int32_t bd) {
4510 0 : __m128i x0 = _mm_cvtepi16_epi32(pred);
4511 0 : __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
4512 :
4513 0 : x0 = _mm_add_epi32(res0, x0);
4514 0 : x1 = _mm_add_epi32(res1, x1);
4515 0 : x0 = _mm_packus_epi32(x0, x1);
4516 0 : x0 = highbd_clamp_epi16(x0, bd);
4517 0 : return x0;
4518 : }
4519 :
4520 0 : static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in,
4521 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
4522 : int32_t flipud, int32_t height, const int32_t bd) {
4523 0 : int32_t j = flipud ? (height - 1) : 0;
4524 0 : const int32_t step = flipud ? -1 : 1;
4525 0 : for (int32_t i = 0; i < height; ++i, j += step) {
4526 0 : __m128i v = _mm_loadu_si128((__m128i const *)(output_r + i * stride_r));
4527 0 : __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);
4528 :
4529 0 : _mm_storeu_si128((__m128i *)(output_w + i * stride_w), u);
4530 : }
4531 0 : }
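
// 8x4: each row spans two __m128i, so the input is transposed up front and
// the rectangular 1/sqrt(2) scaling (NewInvSqrt2) is applied before the
// 8-point row transform.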
4532 0 : void eb_av1_inv_txfm2d_add_8x4_sse4_1(const int32_t *input,
4533 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
4534 : TxType tx_type, TxSize tx_size, int32_t bd) {
4535 : __m128i buf1[8];
4536 0 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
4537 0 : const int32_t txw_idx = get_txw_idx(tx_size);
4538 0 : const int32_t txh_idx = get_txh_idx(tx_size);
4539 0 : const int32_t txfm_size_col = tx_size_wide[tx_size];
4540 0 : const int32_t txfm_size_row = tx_size_high[tx_size];
4541 0 : const transform_1d_sse4_1 row_txfm =
4542 0 : highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
4543 0 : const transform_1d_sse4_1 col_txfm =
4544 0 : highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
4545 :
4546 0 : assert(col_txfm != NULL);
4547 0 : assert(row_txfm != NULL);
4548 : int32_t ud_flip, lr_flip;
4549 0 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4550 :
4551 :     // 1st stage: row transform
4552 : __m128i buf0[8];
4553 0 : const int32_t *input_row = input;
4554 0 : load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
4555 :
4556 0 : TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2],
4557 : buf1[3]);
4558 0 : TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6],
4559 : buf1[7]);
4560 :
4561 0 : av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0,
4562 : NewInvSqrt2);
4563 0 : row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
4564 :
4565 : __m128i *buf1_ptr;
4566 0 : if (lr_flip) {
4567 0 : flip_buf_sse2(buf0, buf1, txfm_size_col);
4568 0 : buf1_ptr = buf1;
4569 : }
4570 : else
4571 0 : buf1_ptr = buf0;
4572 : // 2nd stage: column transform
4573 0 : for (int32_t i = 0; i < 2; i++) {
4574 0 : col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
4575 0 : inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
4576 : }
4577 0 : av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
4578 : // write to buffer
4579 0 : highbd_write_buffer_8xn_sse4_1(buf1_ptr,
4580 : output_r, stride_r, output_w, stride_w,
4581 : ud_flip, txfm_size_row, bd);
4582 0 : }
4583 :
4584 : //4x16
4585 0 : void eb_av1_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
4586 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
4587 : TxType tx_type, TxSize tx_size, int32_t bd) {
4588 : __m128i buf1[16];
4589 0 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
4590 0 : const int32_t txw_idx = get_txw_idx(tx_size);
4591 0 : const int32_t txh_idx = get_txh_idx(tx_size);
4592 0 : const int32_t txfm_size_col = tx_size_wide[tx_size];
4593 0 : const int32_t txfm_size_row = tx_size_high[tx_size];
4594 0 :     const int32_t buf_size_h_div4 = txfm_size_row >> 2;
4595 0 : const transform_1d_sse4_1 row_txfm =
4596 0 : highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
4597 0 : const transform_1d_sse4_1 col_txfm =
4598 0 : highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
4599 0 : const int32_t input_stride = AOMMIN(32, txfm_size_col);
4600 :
4601 0 : assert(col_txfm != NULL);
4602 0 : assert(row_txfm != NULL);
4603 : int32_t ud_flip, lr_flip;
4604 0 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4605 :
4606 :     // 1st stage: row transform
4607 : __m128i buf0[16];
4608 0 : const int32_t *input_row = input;
4609 0 : __m128i *buf0_cur = buf0;
4610 0 : load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
4611 0 : for (int32_t i = 0; i < (txfm_size_row >> 2); i++) {
4612 0 : row_txfm(buf0 + (i << 2), buf0 + (i << 2),
4613 0 : inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
4614 : }
4615 :
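    // The 4-point row kernels ignore their out_shift argument (see
    // idct4x4_sse4_1 earlier in this file), so the row shift -shift[0] is
    // applied explicitly here.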
4616 0 : av1_round_shift_array_32_sse4_1(buf0, buf0, txfm_size_row, -shift[0]);
4617 :
4618 0 : if (lr_flip) {
4619 0 :         for (int32_t j = 0; j < buf_size_h_div4; ++j) {
4620 0 : TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
4621 : buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
4622 : buf1[4 * j + 3]);
4623 : }
4624 : }
4625 : else {
4626 0 :         for (int32_t j = 0; j < buf_size_h_div4; ++j) {
4627 0 : TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
4628 : buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
4629 : buf1[4 * j + 2], buf1[4 * j + 3]);
4630 : }
4631 : }
4632 :
4633 : // 2nd stage: column transform
4634 0 : col_txfm(buf1, buf1, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
4635 :
4636 0 : av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
4637 :
4638 : // write to buffer
4639 0 : highbd_write_buffer_4xn_sse4_1(buf1,
4640 : output_r, stride_r, output_w, stride_w,
4641 : ud_flip, txfm_size_row, bd);
4642 0 : }
4643 :
4644 : //16x4
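// 16x4: the full 16-point row transform runs once over the transposed
// input; the result is written back in two 8-wide halves (final loop).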
4645 0 : void eb_av1_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
4646 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
4647 : TxType tx_type, TxSize tx_size, int32_t bd) {
4648 : __m128i buf1[16];
4649 0 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
4650 0 : const int32_t txw_idx = get_txw_idx(tx_size);
4651 0 : const int32_t txh_idx = get_txh_idx(tx_size);
4652 0 : const int32_t txfm_size_col = tx_size_wide[tx_size];
4653 0 : const int32_t txfm_size_row = tx_size_high[tx_size];
4654 0 :     const int32_t buf_size_w_div4 = txfm_size_col >> 2;
4655 0 : const transform_1d_sse4_1 row_txfm =
4656 0 : highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
4657 0 : const transform_1d_sse4_1 col_txfm =
4658 0 : highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
4659 :
4660 0 : assert(col_txfm != NULL);
4661 0 : assert(row_txfm != NULL);
4662 : int32_t ud_flip, lr_flip;
4663 0 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4664 :
4665 :     // 1st stage: row transform
4666 : __m128i buf0[16];
4667 0 : const int32_t *input_row = input;
4668 0 : load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
4669 :
4670 0 :     for (int32_t j = 0; j < buf_size_w_div4; j++) {
4671 0 : TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
4672 : buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
4673 : }
4674 0 : row_txfm(buf1, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
4675 :
4676 : __m128i *buf1_ptr;
4677 0 : if (lr_flip) {
4678 0 : flip_buf_sse2(buf0, buf1, txfm_size_col);
4679 0 : buf1_ptr = buf1;
4680 : }
4681 : else
4682 0 : buf1_ptr = buf0;
4683 : // 2nd stage: column transform
4684 0 :     for (int32_t i = 0; i < buf_size_w_div4; i++) {
4685 0 : col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
4686 0 : inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
4687 : }
4688 0 : av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
4689 :
4690 : // write to buffer
4691 0 : for (int32_t i = 0; i < (txfm_size_col >> 3); i++) {
4692 0 : highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
4693 0 : output_r + 8 * i, stride_r,
4694 0 : output_w + 8 * i, stride_w,
4695 : ud_flip, txfm_size_row, bd);
4696 : }
4697 0 : }
|