Line data Source code
1 : /*
2 : * Copyright(c) 2019 Intel Corporation
3 : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
4 : */
5 :
6 : /*
7 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
8 : *
9 : * This source code is subject to the terms of the BSD 2 Clause License and
10 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
11 : * was not distributed with this source code in the LICENSE file, you can
12 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
13 : * Media Patent License 1.0 was not distributed with this source code in the
14 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
15 : */
16 :
17 : #include <assert.h>
18 : #include "EbDefinitions.h"
19 : #include "aom_dsp_rtcd.h"
20 : #include "EbTransforms.h"
21 : #include <immintrin.h>
22 : #include "txfm_common_avx2.h"
23 :
24 : void Av1TransformConfig(
25 : TxType tx_type,
26 : TxSize tx_size,
27 : Txfm2DFlipCfg *cfg);
28 :
29 : typedef void(*fwd_transform_1d_avx2)(const __m256i *in, __m256i *out, int8_t bit,
30 : const int32_t num_cols);
31 :
32 : #define TRANSPOSE_4X4_AVX2(x0, x1, x2, x3, y0, y1, y2, y3) \
33 : do { \
34 : __m256i u0, u1, u2, u3; \
35 : u0 = _mm256_unpacklo_epi32(x0, x1); \
36 : u1 = _mm256_unpackhi_epi32(x0, x1); \
37 : u2 = _mm256_unpacklo_epi32(x2, x3); \
38 : u3 = _mm256_unpackhi_epi32(x2, x3); \
39 : y0 = _mm256_unpacklo_epi64(u0, u2); \
40 : y1 = _mm256_unpackhi_epi64(u0, u2); \
41 : y2 = _mm256_unpacklo_epi64(u1, u3); \
42 : y3 = _mm256_unpackhi_epi64(u1, u3); \
43 : } while (0)
44 :
45 98739800 : static INLINE void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
46 : __m256i out1[8];
47 789919000 : TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out1[0], out1[1], out1[4], out1[5]);
48 789919000 : TRANSPOSE_4X4_AVX2(in[4], in[5], in[6], in[7], out1[2], out1[3], out1[6], out1[7]);
49 98739800 : out[0] = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
50 98739800 : out[1] = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
51 98739800 : out[2] = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
52 98739800 : out[3] = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
53 98739800 : out[4] = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
54 98739800 : out[5] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
55 98739800 : out[6] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
56 98739800 : out[7] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
57 98739800 : }
58 :
59 13216200 : static INLINE void transpose_16x16_avx2(const __m256i *in, __m256i *out) {
60 : __m256i temp[32];
61 105730000 : TRANSPOSE_4X4_AVX2(in[0], in[2], in[4], in[6], temp[0], temp[2], temp[4], temp[6]);
62 105730000 : TRANSPOSE_4X4_AVX2(in[8], in[10], in[12], in[14], temp[17], temp[19], temp[21], temp[23]);
63 105730000 : TRANSPOSE_4X4_AVX2(in[1], in[3], in[5], in[7], temp[16], temp[18], temp[20], temp[22]);
64 105730000 : TRANSPOSE_4X4_AVX2(in[9], in[11], in[13], in[15], temp[25], temp[27], temp[29], temp[31]);
65 105730000 : TRANSPOSE_4X4_AVX2(in[16], in[18], in[20], in[22], temp[1], temp[3], temp[5], temp[7]);
66 105730000 : TRANSPOSE_4X4_AVX2(in[24], in[26], in[28], in[30], temp[9], temp[11], temp[13], temp[15]);
67 105730000 : TRANSPOSE_4X4_AVX2(in[17], in[19], in[21], in[23], temp[8], temp[10], temp[12], temp[14]);
68 105730000 : TRANSPOSE_4X4_AVX2(in[25], in[27], in[29], in[31], temp[24], temp[26], temp[28], temp[30]);
69 :
70 13216200 : out[0] = _mm256_permute2x128_si256(temp[0], temp[17], 0x20);
71 13216200 : out[1] = _mm256_permute2x128_si256(temp[1], temp[9], 0x20);
72 13216200 : out[2] = _mm256_permute2x128_si256(temp[2], temp[19], 0x20);
73 13216200 : out[3] = _mm256_permute2x128_si256(temp[3], temp[11], 0x20);
74 13216200 : out[4] = _mm256_permute2x128_si256(temp[4], temp[21], 0x20);
75 13216200 : out[5] = _mm256_permute2x128_si256(temp[5], temp[13], 0x20);
76 13216200 : out[6] = _mm256_permute2x128_si256(temp[6], temp[23], 0x20);
77 13216200 : out[7] = _mm256_permute2x128_si256(temp[7], temp[15], 0x20);
78 13216200 : out[8] = _mm256_permute2x128_si256(temp[0], temp[17], 0x31);
79 13216200 : out[9] = _mm256_permute2x128_si256(temp[1], temp[9], 0x31);
80 13216200 : out[10] = _mm256_permute2x128_si256(temp[2], temp[19], 0x31);
81 13216200 : out[11] = _mm256_permute2x128_si256(temp[3], temp[11], 0x31);
82 13216200 : out[12] = _mm256_permute2x128_si256(temp[4], temp[21], 0x31);
83 13216200 : out[13] = _mm256_permute2x128_si256(temp[5], temp[13], 0x31);
84 13216200 : out[14] = _mm256_permute2x128_si256(temp[6], temp[23], 0x31);
85 13216200 : out[15] = _mm256_permute2x128_si256(temp[7], temp[15], 0x31);
86 13216200 : out[16] = _mm256_permute2x128_si256(temp[16], temp[25], 0x20);
87 13216200 : out[17] = _mm256_permute2x128_si256(temp[8], temp[24], 0x20);
88 13216200 : out[18] = _mm256_permute2x128_si256(temp[18], temp[27], 0x20);
89 13216200 : out[19] = _mm256_permute2x128_si256(temp[10], temp[26], 0x20);
90 13216200 : out[20] = _mm256_permute2x128_si256(temp[20], temp[29], 0x20);
91 13216200 : out[21] = _mm256_permute2x128_si256(temp[12], temp[28], 0x20);
92 13216200 : out[22] = _mm256_permute2x128_si256(temp[22], temp[31], 0x20);
93 13216200 : out[23] = _mm256_permute2x128_si256(temp[14], temp[30], 0x20);
94 13216200 : out[24] = _mm256_permute2x128_si256(temp[16], temp[25], 0x31);
95 13216200 : out[25] = _mm256_permute2x128_si256(temp[8], temp[24], 0x31);
96 13216200 : out[26] = _mm256_permute2x128_si256(temp[18], temp[27], 0x31);
97 13216200 : out[27] = _mm256_permute2x128_si256(temp[10], temp[26], 0x31);
98 13216200 : out[28] = _mm256_permute2x128_si256(temp[20], temp[29], 0x31);
99 13216200 : out[29] = _mm256_permute2x128_si256(temp[12], temp[28], 0x31);
100 13216200 : out[30] = _mm256_permute2x128_si256(temp[22], temp[31], 0x31);
101 13216200 : out[31] = _mm256_permute2x128_si256(temp[14], temp[30], 0x31);
102 13216200 : }
103 :
104 54789800 : static INLINE void transpose_32_8x8_avx2(int32_t stride, const __m256i *in,
105 : __m256i *out) {
106 : __m256i out1[8];
107 54789800 : __m256i temp0 = _mm256_unpacklo_epi32(in[0 * stride], in[2 * stride]);
108 54789800 : __m256i temp1 = _mm256_unpackhi_epi32(in[0 * stride], in[2 * stride]);
109 54789800 : __m256i temp2 = _mm256_unpacklo_epi32(in[1 * stride], in[3 * stride]);
110 54789800 : __m256i temp3 = _mm256_unpackhi_epi32(in[1 * stride], in[3 * stride]);
111 54789800 : __m256i temp4 = _mm256_unpacklo_epi32(in[4 * stride], in[6 * stride]);
112 54789800 : __m256i temp5 = _mm256_unpackhi_epi32(in[4 * stride], in[6 * stride]);
113 54789800 : __m256i temp6 = _mm256_unpacklo_epi32(in[5 * stride], in[7 * stride]);
114 109580000 : __m256i temp7 = _mm256_unpackhi_epi32(in[5 * stride], in[7 * stride]);
115 :
116 54789800 : out1[0] = _mm256_unpacklo_epi32(temp0, temp2);
117 54789800 : out1[1] = _mm256_unpackhi_epi32(temp0, temp2);
118 54789800 : out1[4] = _mm256_unpacklo_epi32(temp1, temp3);
119 54789800 : out1[5] = _mm256_unpackhi_epi32(temp1, temp3);
120 54789800 : out1[2] = _mm256_unpacklo_epi32(temp4, temp6);
121 54789800 : out1[3] = _mm256_unpackhi_epi32(temp4, temp6);
122 54789800 : out1[6] = _mm256_unpacklo_epi32(temp5, temp7);
123 54789800 : out1[7] = _mm256_unpackhi_epi32(temp5, temp7);
124 :
125 54789800 : out[0 * stride] = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
126 54789800 : out[1 * stride] = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
127 54789800 : out[2 * stride] = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
128 54789800 : out[3 * stride] = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
129 54789800 : out[4 * stride] = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
130 54789800 : out[5 * stride] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
131 54789800 : out[6 * stride] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
132 54789800 : out[7 * stride] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
133 54789800 : }
134 :
135 3427220 : static INLINE void transpose_32_avx2(int32_t txfm_size, const __m256i *input,
136 : __m256i *output) {
137 3427220 : const int32_t num_per_256 = 8;
138 3427220 : const int32_t row_size = txfm_size;
139 3427220 : const int32_t col_size = txfm_size / num_per_256;
140 : int32_t r, c;
141 :
142 : // transpose each 8x8 block internally
143 17133000 : for (r = 0; r < row_size; r += 8) {
144 68491400 : for (c = 0; c < col_size; c++) {
145 54785700 : transpose_32_8x8_avx2(col_size, &input[r * col_size + c],
146 54785700 : &output[c * 8 * col_size + r / 8]);
147 : }
148 : }
149 3426740 : }
150 :
151 20384100 : static INLINE void transpose_8nx8n(const __m256i *input, __m256i *output,
152 : const int32_t width, const int32_t height) {
153 20384100 : const int32_t numcol = height >> 3;
154 20384100 : const int32_t numrow = width >> 3;
155 : __m256i out1[8];
156 92539200 : for (int32_t j = 0; j < numrow; j++) {
157 306538000 : for (int32_t i = 0; i < numcol; i++) {
158 1875060000 : TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 0)],
159 : input[i * width + j + (numrow * 1)],
160 : input[i * width + j + (numrow * 2)],
161 : input[i * width + j + (numrow * 3)],
162 : out1[0], out1[1], out1[4], out1[5]);
163 1875060000 : TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 4)],
164 : input[i * width + j + (numrow * 5)],
165 : input[i * width + j + (numrow * 6)],
166 : input[i * width + j + (numrow * 7)],
167 : out1[2], out1[3], out1[6], out1[7]);
168 234383000 : output[j * height + i + (numcol * 0)] = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
169 234383000 : output[j * height + i + (numcol * 1)] = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
170 234383000 : output[j * height + i + (numcol * 2)] = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
171 234383000 : output[j * height + i + (numcol * 3)] = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
172 234383000 : output[j * height + i + (numcol * 4)] = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
173 234383000 : output[j * height + i + (numcol * 5)] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
174 234383000 : output[j * height + i + (numcol * 6)] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
175 234383000 : output[j * height + i + (numcol * 7)] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
176 : }
177 : }
178 20384100 : }
179 :
180 15040100 : static INLINE void transpose_4x8_avx2(const __m256i *in, __m256i *out) {
181 15040100 : __m256i perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
182 :
183 120320000 : TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
184 15040100 : out[0] = _mm256_permutevar8x32_epi32(out[0], perm);
185 15040100 : out[1] = _mm256_permutevar8x32_epi32(out[1], perm);
186 15040100 : out[2] = _mm256_permutevar8x32_epi32(out[2], perm);
187 15040100 : out[3] = _mm256_permutevar8x32_epi32(out[3], perm);
188 15040100 : }
189 :
190 11341100 : static INLINE void transpose_4x16_avx2(const __m256i *in, __m256i *out) {
191 11341100 : __m256i perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
192 :
193 90728800 : TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out[0], out[2], out[4], out[6]);
194 90728800 : TRANSPOSE_4X4_AVX2(in[4], in[5], in[6], in[7], out[1], out[3], out[5], out[7]);
195 :
196 11341100 : out[0] = _mm256_permutevar8x32_epi32(out[0], perm);
197 11341100 : out[1] = _mm256_permutevar8x32_epi32(out[1], perm);
198 11341100 : out[2] = _mm256_permutevar8x32_epi32(out[2], perm);
199 11341100 : out[3] = _mm256_permutevar8x32_epi32(out[3], perm);
200 11341100 : out[4] = _mm256_permutevar8x32_epi32(out[4], perm);
201 11341100 : out[5] = _mm256_permutevar8x32_epi32(out[5], perm);
202 11341100 : out[6] = _mm256_permutevar8x32_epi32(out[6], perm);
203 11341100 : out[7] = _mm256_permutevar8x32_epi32(out[7], perm);
204 11341100 : }
205 :
206 : // Note:
207 : // rounding = 1 << (bit - 1)
208 249215000 : static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
209 : const __m256i *w1, const __m256i *n1,
210 : const __m256i *rounding, int32_t bit) {
211 : __m256i x, y;
212 :
213 249215000 : x = _mm256_mullo_epi32(*w0, *n0);
214 498430000 : y = _mm256_mullo_epi32(*w1, *n1);
215 249215000 : x = _mm256_add_epi32(x, y);
216 498430000 : x = _mm256_add_epi32(x, *rounding);
217 249215000 : x = _mm256_srai_epi32(x, bit);
218 249215000 : return x;
219 : }
220 :
221 86665700 : static INLINE __m128i half_btf_small(const __m128i *w0, const __m128i *n0,
222 : const __m128i *w1, const __m128i *n1,
223 : const __m128i *rounding, int32_t bit) {
224 : __m128i x, y;
225 :
226 86665700 : x = _mm_mullo_epi32(*w0, *n0);
227 173331000 : y = _mm_mullo_epi32(*w1, *n1);
228 86665700 : x = _mm_add_epi32(x, y);
229 173331000 : x = _mm_add_epi32(x, *rounding);
230 86665700 : x = _mm_srai_epi32(x, bit);
231 86665700 : return x;
232 : }
233 :
234 : // out0 = in0*w0 + in1*w1
235 : // out1 = -in1*w0 + in0*w1
236 : #define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \
237 : do { \
238 : const __m256i ww0 = _mm256_set1_epi32(w0); \
239 : const __m256i ww1 = _mm256_set1_epi32(w1); \
240 : const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
241 : const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
242 : out0 = _mm256_add_epi32(in0_w0, in1_w1); \
243 : out0 = av1_round_shift_32_avx2(out0, bit); \
244 : const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
245 : const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
246 : out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
247 : out1 = av1_round_shift_32_avx2(out1, bit); \
248 : } while (0)
249 :
250 : // out0 = in0*w0 + in1*w1
251 : // out1 = in1*w0 - in0*w1
252 : #define btf_32_avx2_type1(w0, w1, in0, in1, out0, out1, bit) \
253 : do { \
254 : btf_32_avx2_type0(w1, w0, in1, in0, out0, out1, bit); \
255 : } while (0)
256 :
257 : // out0 = in0*w0 + in1*w1
258 : // out1 = -in1*w0 + in0*w1
259 : #define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
260 : do { \
261 : const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
262 : const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
263 : out0 = _mm256_add_epi32(in0_w0, in1_w1); \
264 : out0 = _mm256_add_epi32(out0, r); \
265 : out0 = _mm256_srai_epi32(out0, bit); \
266 : const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
267 : const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
268 : out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
269 : out1 = _mm256_add_epi32(out1, r); \
270 : out1 = _mm256_srai_epi32(out1, bit); \
271 : } while (0)
272 :
273 : // out0 = in0*w0 + in1*w1
274 : // out1 = in1*w0 - in0*w1
275 : #define btf_32_type1_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
276 : do { \
277 : btf_32_type0_avx2_new(ww1, ww0, in1, in0, out0, out1, r, bit); \
278 : } while (0)
279 :
280 : static const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
281 : fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
282 : fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16,
283 : fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
284 : fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32,
285 : fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
286 : };
287 :
288 106415000 : static INLINE void load_buffer_8x8(const int16_t *input, __m256i *in,
289 : int32_t stride, int32_t flipud, int32_t fliplr,
290 : int32_t shift) {
291 : __m128i temp[8];
292 106415000 : if (!flipud) {
293 101463000 : temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
294 101463000 : temp[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
295 101463000 : temp[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
296 101463000 : temp[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
297 101463000 : temp[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
298 101463000 : temp[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
299 101463000 : temp[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
300 202926000 : temp[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
301 : }
302 : else {
303 4951770 : temp[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
304 4951770 : temp[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
305 4951770 : temp[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
306 4951770 : temp[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
307 4951770 : temp[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
308 4951770 : temp[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
309 9903540 : temp[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
310 4951770 : temp[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
311 : }
312 :
313 106415000 : if (fliplr) {
314 4044020 : temp[0] = mm_reverse_epi16(temp[0]);
315 4044000 : temp[1] = mm_reverse_epi16(temp[1]);
316 4043910 : temp[2] = mm_reverse_epi16(temp[2]);
317 4043800 : temp[3] = mm_reverse_epi16(temp[3]);
318 4043730 : temp[4] = mm_reverse_epi16(temp[4]);
319 4043700 : temp[5] = mm_reverse_epi16(temp[5]);
320 4043680 : temp[6] = mm_reverse_epi16(temp[6]);
321 4043670 : temp[7] = mm_reverse_epi16(temp[7]);
322 : }
323 :
324 106484000 : in[0] = _mm256_cvtepi16_epi32(temp[0]);
325 106484000 : in[1] = _mm256_cvtepi16_epi32(temp[1]);
326 106484000 : in[2] = _mm256_cvtepi16_epi32(temp[2]);
327 106484000 : in[3] = _mm256_cvtepi16_epi32(temp[3]);
328 106484000 : in[4] = _mm256_cvtepi16_epi32(temp[4]);
329 106484000 : in[5] = _mm256_cvtepi16_epi32(temp[5]);
330 106484000 : in[6] = _mm256_cvtepi16_epi32(temp[6]);
331 106484000 : in[7] = _mm256_cvtepi16_epi32(temp[7]);
332 :
333 106484000 : in[0] = _mm256_slli_epi32(in[0], shift);
334 106484000 : in[1] = _mm256_slli_epi32(in[1], shift);
335 106484000 : in[2] = _mm256_slli_epi32(in[2], shift);
336 106484000 : in[3] = _mm256_slli_epi32(in[3], shift);
337 106484000 : in[4] = _mm256_slli_epi32(in[4], shift);
338 106484000 : in[5] = _mm256_slli_epi32(in[5], shift);
339 106484000 : in[6] = _mm256_slli_epi32(in[6], shift);
340 106484000 : in[7] = _mm256_slli_epi32(in[7], shift);
341 106484000 : }
342 :
343 75347400 : static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m256i *in,
344 : int32_t stride, int32_t flipud, int32_t fliplr, int32_t shift) {
345 75347400 : if (!flipud) {
346 140235000 : in[0] = _mm256_setr_epi64x(*(uint64_t *)(input + 0 * stride),
347 70117700 : *(uint64_t *)(input + 1 * stride), 0, 0);
348 70117700 : in[1] = _mm256_setr_epi64x(*(uint64_t *)(input + 2 * stride),
349 70117700 : *(uint64_t *)(input + 3 * stride), 0, 0);
350 : }
351 : else {
352 10459300 : in[0] = _mm256_setr_epi64x(*(uint64_t *)(input + 3 * stride),
353 5229630 : *(uint64_t *)(input + 2 * stride), 0, 0);
354 5229630 : in[1] = _mm256_setr_epi64x(*(uint64_t *)(input + 1 * stride),
355 5229630 : *(uint64_t *)(input + 0 * stride), 0, 0);
356 : }
357 :
358 75347400 : if (fliplr) {
359 5261200 : in[0] = _mm256_shufflelo_epi16(in[0], 0x1b);
360 5261200 : in[0] = _mm256_shufflehi_epi16(in[0], 0x1b);
361 5261200 : in[1] = _mm256_shufflelo_epi16(in[1], 0x1b);
362 5261200 : in[1] = _mm256_shufflehi_epi16(in[1], 0x1b);
363 : }
364 :
365 150695000 : in[0] = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[0]));
366 150695000 : in[1] = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[1]));
367 :
368 75347400 : in[0] = _mm256_slli_epi32(in[0], shift);
369 75347400 : in[1] = _mm256_slli_epi32(in[1], shift);
370 75347400 : }
371 :
372 18805700 : static INLINE void load_buffer_4x8_avx2(const int16_t *input, __m256i *out,
373 : int32_t stride, int32_t flipud, int32_t fliplr,
374 : int32_t shift) {
375 18805700 : const int16_t *topL = input;
376 18805700 : const int16_t *botL = input + 4 * stride;
377 :
378 18805700 : if (flipud) {
379 1280060 : load_buffer_4x4_avx2(botL, out, stride, flipud, fliplr, shift);
380 1280070 : load_buffer_4x4_avx2(topL, out + 2, stride, flipud, fliplr, shift);
381 : }
382 : else {
383 17525700 : load_buffer_4x4_avx2(topL, out, stride, flipud, fliplr, shift);
384 17531200 : load_buffer_4x4_avx2(botL, out + 2, stride, flipud, fliplr, shift);
385 : }
386 18811600 : }
387 :
388 18905200 : static INLINE void load_buffer_8x4_avx2(const int16_t *input, __m256i *out,
389 : int32_t stride, int32_t flipud, int32_t fliplr, int32_t shift) {
390 18905200 : const int16_t *topL = input;
391 18905200 : const int16_t *topR = input + 4;
392 :
393 18905200 : if (fliplr) {
394 1342910 : load_buffer_4x4_avx2(topR, out, stride, flipud, fliplr, shift);
395 1342920 : load_buffer_4x4_avx2(topL, out + 2, stride, flipud, fliplr, shift);
396 : }
397 : else {
398 17562300 : load_buffer_4x4_avx2(topL, out, stride, flipud, fliplr, shift);
399 17567600 : load_buffer_4x4_avx2(topR, out + 2, stride, flipud, fliplr, shift);
400 : }
401 18911900 : }
402 :
403 5564200 : static INLINE void load_buffer_4x16_avx2(const int16_t *input, __m256i *out,
404 : const int32_t stride, const int32_t flipud,
405 : const int32_t fliplr, const int32_t shift) {
406 5564200 : const int16_t *topL = input;
407 5564200 : const int16_t *botL = input + 8 * stride;
408 :
409 5564200 : if (flipud) {
410 475174 : load_buffer_4x8_avx2(botL, out, stride, flipud, fliplr, shift);
411 475178 : load_buffer_4x8_avx2(topL, out + 4, stride, flipud, fliplr, shift);
412 : }
413 : else {
414 5089020 : load_buffer_4x8_avx2(topL, out, stride, flipud, fliplr, shift);
415 5089680 : load_buffer_4x8_avx2(botL, out + 4, stride, flipud, fliplr, shift);
416 : }
417 5564990 : }
418 :
419 5777000 : static INLINE void load_buffer_16x4_avx2(const int16_t *input, __m256i *out,
420 : int32_t stride, int32_t flipud, int32_t fliplr, int32_t shift) {
421 5777000 : const int16_t *topL = input;
422 5777000 : const int16_t *topR = input + 8;
423 :
424 5777000 : if (fliplr) {
425 502106 : load_buffer_8x4_avx2(topR, out, stride, flipud, fliplr, shift);
426 502111 : load_buffer_8x4_avx2(topL, out + 4, stride, flipud, fliplr, shift);
427 : }
428 : else {
429 5274890 : load_buffer_8x4_avx2(topL, out, stride, flipud, fliplr, shift);
430 5275790 : load_buffer_8x4_avx2(topR, out + 4, stride, flipud, fliplr, shift);
431 : }
432 5777850 : }
433 :
434 188826000 : static INLINE void col_txfm_8x8_rounding(__m256i *in, int32_t shift) {
435 188826000 : const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
436 :
437 188826000 : in[0] = _mm256_add_epi32(in[0], rounding);
438 188826000 : in[1] = _mm256_add_epi32(in[1], rounding);
439 188826000 : in[2] = _mm256_add_epi32(in[2], rounding);
440 188826000 : in[3] = _mm256_add_epi32(in[3], rounding);
441 188826000 : in[4] = _mm256_add_epi32(in[4], rounding);
442 188826000 : in[5] = _mm256_add_epi32(in[5], rounding);
443 188826000 : in[6] = _mm256_add_epi32(in[6], rounding);
444 188826000 : in[7] = _mm256_add_epi32(in[7], rounding);
445 :
446 188826000 : in[0] = _mm256_srai_epi32(in[0], shift);
447 188826000 : in[1] = _mm256_srai_epi32(in[1], shift);
448 188826000 : in[2] = _mm256_srai_epi32(in[2], shift);
449 188826000 : in[3] = _mm256_srai_epi32(in[3], shift);
450 188826000 : in[4] = _mm256_srai_epi32(in[4], shift);
451 188826000 : in[5] = _mm256_srai_epi32(in[5], shift);
452 188826000 : in[6] = _mm256_srai_epi32(in[6], shift);
453 188826000 : in[7] = _mm256_srai_epi32(in[7], shift);
454 188826000 : }
455 :
456 7557430 : static void fidtx8x8_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
457 : (void)bit;
458 7557430 : out[0] = _mm256_slli_epi32(in[0 * col_num], 1);
459 7557430 : out[1] = _mm256_slli_epi32(in[1 * col_num], 1);
460 7557430 : out[2] = _mm256_slli_epi32(in[2 * col_num], 1);
461 7557430 : out[3] = _mm256_slli_epi32(in[3 * col_num], 1);
462 7557430 : out[4] = _mm256_slli_epi32(in[4 * col_num], 1);
463 7557430 : out[5] = _mm256_slli_epi32(in[5 * col_num], 1);
464 7557430 : out[6] = _mm256_slli_epi32(in[6 * col_num], 1);
465 7557430 : out[7] = _mm256_slli_epi32(in[7 * col_num], 1);
466 7557430 : }
467 :
468 1286740 : static INLINE void fidtx16x8_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
469 : (void)bit;
470 1286740 : const int32_t bits = 12; // NewSqrt2Bits = 12
471 1286740 : const int32_t sqrt = 2 * 5793; // 2 * NewSqrt2
472 1286740 : const __m256i newsqrt = _mm256_set1_epi32(sqrt);
473 1286740 : const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
474 : __m256i temp;
475 1286740 : int32_t num_iters = 8 * col_num;
476 11580400 : for (int32_t i = 0; i < num_iters; i++) {
477 20587300 : temp = _mm256_mullo_epi32(in[i], newsqrt);
478 10293700 : temp = _mm256_add_epi32(temp, rounding);
479 20587300 : out[i] = _mm256_srai_epi32(temp, bits);
480 : }
481 1286740 : }
482 :
483 7685080 : static INLINE void write_buffer_4x8(const __m256i *res, int32_t *output) {
484 7685080 : _mm256_storeu_si256((__m256i *)(output + 0 * 8), res[0]);
485 7685080 : _mm256_storeu_si256((__m256i *)(output + 1 * 8), res[1]);
486 7685080 : _mm256_storeu_si256((__m256i *)(output + 2 * 8), res[2]);
487 7685080 : _mm256_storeu_si256((__m256i *)(output + 3 * 8), res[3]);
488 7685080 : }
489 :
490 42880000 : static INLINE void write_buffer_8x8(const __m256i *res, int32_t *output) {
491 42880000 : _mm256_storeu_si256((__m256i *)(output + 0 * 8), res[0]);
492 42880000 : _mm256_storeu_si256((__m256i *)(output + 1 * 8), res[1]);
493 42880000 : _mm256_storeu_si256((__m256i *)(output + 2 * 8), res[2]);
494 42880000 : _mm256_storeu_si256((__m256i *)(output + 3 * 8), res[3]);
495 :
496 42880000 : _mm256_storeu_si256((__m256i *)(output + 4 * 8), res[4]);
497 42880000 : _mm256_storeu_si256((__m256i *)(output + 5 * 8), res[5]);
498 42880000 : _mm256_storeu_si256((__m256i *)(output + 6 * 8), res[6]);
499 42880000 : _mm256_storeu_si256((__m256i *)(output + 7 * 8), res[7]);
500 42880000 : }
501 :
502 69633300 : static void fdct8x8_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
503 69633300 : const int32_t *cospi = cospi_arr(bit);
504 69617100 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
505 69617100 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
506 69617100 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
507 69617100 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
508 69617100 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
509 69617100 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
510 69617100 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
511 69617100 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
512 69617100 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
513 : __m256i u[8], v[8];
514 :
515 : // stage 0
516 : // stage 1
517 69617100 : u[0] = _mm256_add_epi32(in[0 * col_num], in[7 * col_num]);
518 69617100 : v[7] = _mm256_sub_epi32(in[0 * col_num], in[7 * col_num]);
519 69617100 : u[1] = _mm256_add_epi32(in[1 * col_num], in[6 * col_num]);
520 69617100 : u[6] = _mm256_sub_epi32(in[1 * col_num], in[6 * col_num]);
521 69617100 : u[2] = _mm256_add_epi32(in[2 * col_num], in[5 * col_num]);
522 69617100 : u[5] = _mm256_sub_epi32(in[2 * col_num], in[5 * col_num]);
523 69617100 : u[3] = _mm256_add_epi32(in[3 * col_num], in[4 * col_num]);
524 69617100 : v[4] = _mm256_sub_epi32(in[3 * col_num], in[4 * col_num]);
525 :
526 : // stage 2
527 69617100 : v[0] = _mm256_add_epi32(u[0], u[3]);
528 69617100 : v[3] = _mm256_sub_epi32(u[0], u[3]);
529 69617100 : v[1] = _mm256_add_epi32(u[1], u[2]);
530 69617100 : v[2] = _mm256_sub_epi32(u[1], u[2]);
531 :
532 69617100 : v[5] = _mm256_mullo_epi32(u[5], cospim32);
533 69617100 : v[6] = _mm256_mullo_epi32(u[6], cospi32);
534 69617100 : v[5] = _mm256_add_epi32(v[5], v[6]);
535 69617100 : v[5] = _mm256_add_epi32(v[5], rnding);
536 69617100 : v[5] = _mm256_srai_epi32(v[5], bit);
537 :
538 69617100 : u[0] = _mm256_mullo_epi32(u[5], cospi32);
539 69617100 : v[6] = _mm256_mullo_epi32(u[6], cospim32);
540 69617100 : v[6] = _mm256_sub_epi32(u[0], v[6]);
541 69617100 : v[6] = _mm256_add_epi32(v[6], rnding);
542 69617100 : v[6] = _mm256_srai_epi32(v[6], bit);
543 :
544 : // stage 3
545 : // type 0
546 69617100 : v[0] = _mm256_mullo_epi32(v[0], cospi32);
547 69617100 : v[1] = _mm256_mullo_epi32(v[1], cospi32);
548 69617100 : u[0] = _mm256_add_epi32(v[0], v[1]);
549 69617100 : u[0] = _mm256_add_epi32(u[0], rnding);
550 69617100 : u[0] = _mm256_srai_epi32(u[0], bit);
551 :
552 69617100 : u[1] = _mm256_sub_epi32(v[0], v[1]);
553 69617100 : u[1] = _mm256_add_epi32(u[1], rnding);
554 69617100 : u[1] = _mm256_srai_epi32(u[1], bit);
555 :
556 : // type 1
557 69617100 : v[0] = _mm256_mullo_epi32(v[2], cospi48);
558 69617100 : v[1] = _mm256_mullo_epi32(v[3], cospi16);
559 69617100 : u[2] = _mm256_add_epi32(v[0], v[1]);
560 69617100 : u[2] = _mm256_add_epi32(u[2], rnding);
561 69617100 : u[2] = _mm256_srai_epi32(u[2], bit);
562 :
563 69617100 : v[0] = _mm256_mullo_epi32(v[2], cospi16);
564 69617100 : v[1] = _mm256_mullo_epi32(v[3], cospi48);
565 69617100 : u[3] = _mm256_sub_epi32(v[1], v[0]);
566 69617100 : u[3] = _mm256_add_epi32(u[3], rnding);
567 69617100 : u[3] = _mm256_srai_epi32(u[3], bit);
568 :
569 69617100 : u[4] = _mm256_add_epi32(v[4], v[5]);
570 69617100 : u[5] = _mm256_sub_epi32(v[4], v[5]);
571 69617100 : u[6] = _mm256_sub_epi32(v[7], v[6]);
572 69617100 : u[7] = _mm256_add_epi32(v[7], v[6]);
573 :
574 : // stage 4
575 : // stage 5
576 69617100 : v[0] = _mm256_mullo_epi32(u[4], cospi56);
577 69617100 : v[1] = _mm256_mullo_epi32(u[7], cospi8);
578 69617100 : v[0] = _mm256_add_epi32(v[0], v[1]);
579 69617100 : v[0] = _mm256_add_epi32(v[0], rnding);
580 69617100 : out[1 * col_num] = _mm256_srai_epi32(v[0], bit);
581 :
582 69617100 : v[0] = _mm256_mullo_epi32(u[4], cospi8);
583 69617100 : v[1] = _mm256_mullo_epi32(u[7], cospi56);
584 69617100 : v[0] = _mm256_sub_epi32(v[1], v[0]);
585 69617100 : v[0] = _mm256_add_epi32(v[0], rnding);
586 69617100 : out[7 * col_num] = _mm256_srai_epi32(v[0], bit);
587 :
588 69617100 : v[0] = _mm256_mullo_epi32(u[5], cospi24);
589 69617100 : v[1] = _mm256_mullo_epi32(u[6], cospi40);
590 69617100 : v[0] = _mm256_add_epi32(v[0], v[1]);
591 69617100 : v[0] = _mm256_add_epi32(v[0], rnding);
592 69617100 : out[5 * col_num] = _mm256_srai_epi32(v[0], bit);
593 :
594 69617100 : v[0] = _mm256_mullo_epi32(u[5], cospi40);
595 69617100 : v[1] = _mm256_mullo_epi32(u[6], cospi24);
596 69617100 : v[0] = _mm256_sub_epi32(v[1], v[0]);
597 69617100 : v[0] = _mm256_add_epi32(v[0], rnding);
598 69617100 : out[3 * col_num] = _mm256_srai_epi32(v[0], bit);
599 :
600 69617100 : out[0 * col_num] = u[0];
601 69617100 : out[4 * col_num] = u[1];
602 69617100 : out[2 * col_num] = u[2];
603 69617100 : out[6 * col_num] = u[3];
604 69617100 : }
605 :
606 13888100 : static void fadst8x8_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
607 13888100 : const int32_t *cospi = cospi_arr(bit);
608 13887500 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
609 13887500 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
610 13887500 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
611 13887500 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
612 13887500 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
613 13887500 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
614 13887500 : const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
615 13887500 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
616 13887500 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
617 13887500 : const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
618 13887500 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
619 13887500 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
620 13887500 : const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
621 13887500 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
622 13887500 : const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
623 13887500 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
624 13887500 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
625 27774900 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
626 13887500 : const __m256i zero = _mm256_setzero_si256();
627 : __m256i u0, u1, u2, u3, u4, u5, u6, u7;
628 : __m256i v0, v1, v2, v3, v4, v5, v6, v7;
629 : __m256i x, y;
630 :
631 13887500 : u0 = in[0 * col_num];
632 13887500 : u1 = _mm256_sub_epi32(zero, in[7 * col_num]);
633 13887500 : u2 = _mm256_sub_epi32(zero, in[3 * col_num]);
634 13887500 : u3 = in[4 * col_num];
635 13887500 : u4 = _mm256_sub_epi32(zero, in[1 * col_num]);
636 13887500 : u5 = in[6 * col_num];
637 13887500 : u6 = in[2 * col_num];
638 13887500 : u7 = _mm256_sub_epi32(zero, in[5 * col_num]);
639 :
640 : // stage 2
641 13887500 : v0 = u0;
642 13887500 : v1 = u1;
643 :
644 13887500 : x = _mm256_mullo_epi32(u2, cospi32);
645 13887500 : y = _mm256_mullo_epi32(u3, cospi32);
646 13887500 : v2 = _mm256_add_epi32(x, y);
647 13887500 : v2 = _mm256_add_epi32(v2, rnding);
648 27774900 : v2 = _mm256_srai_epi32(v2, bit);
649 :
650 13887500 : v3 = _mm256_sub_epi32(x, y);
651 13887500 : v3 = _mm256_add_epi32(v3, rnding);
652 13887500 : v3 = _mm256_srai_epi32(v3, bit);
653 :
654 13887500 : v4 = u4;
655 13887500 : v5 = u5;
656 :
657 13887500 : x = _mm256_mullo_epi32(u6, cospi32);
658 13887500 : y = _mm256_mullo_epi32(u7, cospi32);
659 13887500 : v6 = _mm256_add_epi32(x, y);
660 13887500 : v6 = _mm256_add_epi32(v6, rnding);
661 27774900 : v6 = _mm256_srai_epi32(v6, bit);
662 :
663 13887500 : v7 = _mm256_sub_epi32(x, y);
664 13887500 : v7 = _mm256_add_epi32(v7, rnding);
665 27774900 : v7 = _mm256_srai_epi32(v7, bit);
666 :
667 : // stage 3
668 13887500 : u0 = _mm256_add_epi32(v0, v2);
669 13887500 : u1 = _mm256_add_epi32(v1, v3);
670 13887500 : u2 = _mm256_sub_epi32(v0, v2);
671 13887500 : u3 = _mm256_sub_epi32(v1, v3);
672 13887500 : u4 = _mm256_add_epi32(v4, v6);
673 13887500 : u5 = _mm256_add_epi32(v5, v7);
674 13887500 : u6 = _mm256_sub_epi32(v4, v6);
675 13887500 : u7 = _mm256_sub_epi32(v5, v7);
676 :
677 : // stage 4
678 13887500 : v0 = u0;
679 13887500 : v1 = u1;
680 13887500 : v2 = u2;
681 13887500 : v3 = u3;
682 :
683 13887500 : x = _mm256_mullo_epi32(u4, cospi16);
684 13887500 : y = _mm256_mullo_epi32(u5, cospi48);
685 13887500 : v4 = _mm256_add_epi32(x, y);
686 13887500 : v4 = _mm256_add_epi32(v4, rnding);
687 27774900 : v4 = _mm256_srai_epi32(v4, bit);
688 :
689 13887500 : x = _mm256_mullo_epi32(u4, cospi48);
690 13887500 : y = _mm256_mullo_epi32(u5, cospim16);
691 13887500 : v5 = _mm256_add_epi32(x, y);
692 13887500 : v5 = _mm256_add_epi32(v5, rnding);
693 27774900 : v5 = _mm256_srai_epi32(v5, bit);
694 :
695 13887500 : x = _mm256_mullo_epi32(u6, cospim48);
696 13887500 : y = _mm256_mullo_epi32(u7, cospi16);
697 13887500 : v6 = _mm256_add_epi32(x, y);
698 13887500 : v6 = _mm256_add_epi32(v6, rnding);
699 27774900 : v6 = _mm256_srai_epi32(v6, bit);
700 :
701 13887500 : x = _mm256_mullo_epi32(u6, cospi16);
702 13887500 : y = _mm256_mullo_epi32(u7, cospi48);
703 13887500 : v7 = _mm256_add_epi32(x, y);
704 13887500 : v7 = _mm256_add_epi32(v7, rnding);
705 27774900 : v7 = _mm256_srai_epi32(v7, bit);
706 :
707 : // stage 5
708 13887500 : u0 = _mm256_add_epi32(v0, v4);
709 13887500 : u1 = _mm256_add_epi32(v1, v5);
710 13887500 : u2 = _mm256_add_epi32(v2, v6);
711 13887500 : u3 = _mm256_add_epi32(v3, v7);
712 13887500 : u4 = _mm256_sub_epi32(v0, v4);
713 13887500 : u5 = _mm256_sub_epi32(v1, v5);
714 13887500 : u6 = _mm256_sub_epi32(v2, v6);
715 13887500 : u7 = _mm256_sub_epi32(v3, v7);
716 :
717 : // stage 6
718 13887500 : x = _mm256_mullo_epi32(u0, cospi4);
719 13887500 : y = _mm256_mullo_epi32(u1, cospi60);
720 13887500 : v0 = _mm256_add_epi32(x, y);
721 13887500 : v0 = _mm256_add_epi32(v0, rnding);
722 27774900 : v0 = _mm256_srai_epi32(v0, bit);
723 :
724 13887500 : x = _mm256_mullo_epi32(u0, cospi60);
725 13887500 : y = _mm256_mullo_epi32(u1, cospim4);
726 13887500 : v1 = _mm256_add_epi32(x, y);
727 13887500 : v1 = _mm256_add_epi32(v1, rnding);
728 27774900 : v1 = _mm256_srai_epi32(v1, bit);
729 :
730 13887500 : x = _mm256_mullo_epi32(u2, cospi20);
731 13887500 : y = _mm256_mullo_epi32(u3, cospi44);
732 13887500 : v2 = _mm256_add_epi32(x, y);
733 13887500 : v2 = _mm256_add_epi32(v2, rnding);
734 27774900 : v2 = _mm256_srai_epi32(v2, bit);
735 :
736 13887500 : x = _mm256_mullo_epi32(u2, cospi44);
737 13887500 : y = _mm256_mullo_epi32(u3, cospim20);
738 13887500 : v3 = _mm256_add_epi32(x, y);
739 13887500 : v3 = _mm256_add_epi32(v3, rnding);
740 27774900 : v3 = _mm256_srai_epi32(v3, bit);
741 :
742 13887500 : x = _mm256_mullo_epi32(u4, cospi36);
743 13887500 : y = _mm256_mullo_epi32(u5, cospi28);
744 13887500 : v4 = _mm256_add_epi32(x, y);
745 13887500 : v4 = _mm256_add_epi32(v4, rnding);
746 27774900 : v4 = _mm256_srai_epi32(v4, bit);
747 :
748 13887500 : x = _mm256_mullo_epi32(u4, cospi28);
749 13887500 : y = _mm256_mullo_epi32(u5, cospim36);
750 13887500 : v5 = _mm256_add_epi32(x, y);
751 13887500 : v5 = _mm256_add_epi32(v5, rnding);
752 27774900 : v5 = _mm256_srai_epi32(v5, bit);
753 :
754 13887500 : x = _mm256_mullo_epi32(u6, cospi52);
755 13887500 : y = _mm256_mullo_epi32(u7, cospi12);
756 13887500 : v6 = _mm256_add_epi32(x, y);
757 13887500 : v6 = _mm256_add_epi32(v6, rnding);
758 27774900 : v6 = _mm256_srai_epi32(v6, bit);
759 :
760 13887500 : x = _mm256_mullo_epi32(u6, cospi12);
761 13887500 : y = _mm256_mullo_epi32(u7, cospim52);
762 13887500 : v7 = _mm256_add_epi32(x, y);
763 13887500 : v7 = _mm256_add_epi32(v7, rnding);
764 13887500 : v7 = _mm256_srai_epi32(v7, bit);
765 :
766 : // stage 7
767 13887500 : out[0 * col_num] = v1;
768 13887500 : out[1 * col_num] = v6;
769 13887500 : out[2 * col_num] = v3;
770 13887500 : out[3 * col_num] = v4;
771 13887500 : out[4 * col_num] = v5;
772 13887500 : out[5 * col_num] = v2;
773 13887500 : out[6 * col_num] = v7;
774 13887500 : out[7 * col_num] = v0;
775 13887500 : }
776 :
777 19667600 : void eb_av1_fwd_txfm2d_8x8_avx2(int16_t *input, int32_t *coeff, uint32_t stride, TxType tx_type, uint8_t bd)
778 : {
779 : __m256i in[8], out[8];
780 19667600 : const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
781 19667600 : const int32_t txw_idx = get_txw_idx(TX_8X8);
782 19667000 : const int32_t txh_idx = get_txh_idx(TX_8X8);
783 :
784 19676900 : switch (tx_type) {
785 10384800 : case DCT_DCT:
786 10384800 : load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
787 10386600 : fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
788 10386300 : col_txfm_8x8_rounding(out, -shift[1]);
789 10386100 : transpose_8x8_avx2(out, in);
790 10386400 : fdct8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
791 10386600 : transpose_8x8_avx2(out, in);
792 10386600 : write_buffer_8x8(in, coeff);
793 10386200 : break;
794 1232280 : case ADST_DCT:
795 1232280 : load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
796 1232340 : fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
797 1232310 : col_txfm_8x8_rounding(out, -shift[1]);
798 1232310 : transpose_8x8_avx2(out, in);
799 1232310 : fdct8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
800 1232320 : transpose_8x8_avx2(out, in);
801 1232320 : write_buffer_8x8(in, coeff);
802 1232310 : break;
803 1241180 : case DCT_ADST:
804 1241180 : load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
805 1241250 : fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
806 1241250 : col_txfm_8x8_rounding(out, -shift[1]);
807 1241250 : transpose_8x8_avx2(out, in);
808 1241260 : fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
809 1241240 : transpose_8x8_avx2(out, in);
810 1241240 : write_buffer_8x8(in, coeff);
811 1241240 : break;
812 1108260 : case ADST_ADST:
813 1108260 : load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
814 1108310 : fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
815 1108310 : col_txfm_8x8_rounding(out, -shift[1]);
816 1108300 : transpose_8x8_avx2(out, in);
817 1108310 : fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
818 1108300 : transpose_8x8_avx2(out, in);
819 1108300 : write_buffer_8x8(in, coeff);
820 1108300 : break;
821 348711 : case FLIPADST_DCT:
822 348711 : load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
823 348713 : fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
824 348712 : col_txfm_8x8_rounding(out, -shift[1]);
825 348713 : transpose_8x8_avx2(out, in);
826 348712 : fdct8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
827 348712 : transpose_8x8_avx2(out, in);
828 348710 : write_buffer_8x8(in, coeff);
829 348710 : break;
830 348572 : case DCT_FLIPADST:
831 348572 : load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
832 348576 : fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
833 348577 : col_txfm_8x8_rounding(out, -shift[1]);
834 348580 : transpose_8x8_avx2(out, in);
835 348580 : fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
836 348580 : transpose_8x8_avx2(out, in);
837 348579 : write_buffer_8x8(in, coeff);
838 348578 : break;
839 347389 : case FLIPADST_FLIPADST:
840 347389 : load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
841 347387 : fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
842 347390 : col_txfm_8x8_rounding(out, -shift[1]);
843 347388 : transpose_8x8_avx2(out, in);
844 347389 : fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
845 347389 : transpose_8x8_avx2(out, in);
846 347389 : write_buffer_8x8(in, coeff);
847 347388 : break;
848 351460 : case ADST_FLIPADST:
849 351460 : load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
850 351460 : fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
851 351463 : col_txfm_8x8_rounding(out, -shift[1]);
852 351463 : transpose_8x8_avx2(out, in);
853 351463 : fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
854 351464 : transpose_8x8_avx2(out, in);
855 351465 : write_buffer_8x8(in, coeff);
856 351464 : break;
857 351058 : case FLIPADST_ADST:
858 351058 : load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
859 351062 : fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
860 351060 : col_txfm_8x8_rounding(out, -shift[1]);
861 351063 : transpose_8x8_avx2(out, in);
862 351063 : fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
863 351064 : transpose_8x8_avx2(out, in);
864 351063 : write_buffer_8x8(in, coeff);
865 351063 : break;
866 999908 : case IDTX:
867 999908 : load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
868 999936 : fidtx8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
869 999928 : col_txfm_8x8_rounding(out, -shift[1]);
870 999918 : fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
871 999914 : write_buffer_8x8(out, coeff);
872 999917 : break;
873 935885 : case V_DCT:
874 935885 : load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
875 935915 : fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
876 935910 : col_txfm_8x8_rounding(out, -shift[1]);
877 935909 : fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
878 935906 : write_buffer_8x8(out, coeff);
879 935902 : break;
880 984897 : case H_DCT:
881 984897 : load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
882 984942 : fidtx8x8_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
883 984931 : col_txfm_8x8_rounding(in, -shift[1]);
884 984926 : transpose_8x8_avx2(in, out);
885 984932 : fdct8x8_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
886 984933 : transpose_8x8_avx2(in, out);
887 984937 : write_buffer_8x8(out, coeff);
888 984931 : break;
889 256910 : case V_ADST:
890 256910 : load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
891 256912 : fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
892 256912 : col_txfm_8x8_rounding(out, -shift[1]);
893 256910 : fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
894 256910 : write_buffer_8x8(out, coeff);
895 256909 : break;
896 265622 : case H_ADST:
897 265622 : load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
898 265623 : fidtx8x8_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
899 265623 : col_txfm_8x8_rounding(in, -shift[1]);
900 265623 : transpose_8x8_avx2(in, out);
901 265623 : fadst8x8_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
902 265620 : transpose_8x8_avx2(in, out);
903 265621 : write_buffer_8x8(out, coeff);
904 265622 : break;
905 256028 : case V_FLIPADST:
906 256028 : load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
907 256029 : fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
908 256029 : col_txfm_8x8_rounding(out, -shift[1]);
909 256030 : fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
910 256029 : write_buffer_8x8(out, coeff);
911 256029 : break;
912 264031 : case H_FLIPADST:
913 264031 : load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
914 264030 : fidtx8x8_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
915 264030 : col_txfm_8x8_rounding(in, -shift[1]);
916 264030 : transpose_8x8_avx2(in, out);
917 264031 : fadst8x8_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
918 264031 : transpose_8x8_avx2(in, out);
919 264031 : write_buffer_8x8(out, coeff);
920 264031 : break;
921 0 : default: assert(0);
922 : }
923 : (void)bd;
924 19678600 : }
925 :
926 11158200 : static INLINE void convert_8x8_to_16x16(const __m256i *in, __m256i *out) {
927 11158200 : int32_t row_index = 0;
928 11158200 : int32_t dst_index = 0;
929 11158200 : int32_t src_index = 0;
930 :
931 : // row 0, 1, .., 7
932 : do {
933 89248100 : out[dst_index] = in[src_index];
934 89248100 : out[dst_index + 1] = in[src_index + 8];
935 89248100 : dst_index += 2;
936 89248100 : src_index += 1;
937 89248100 : row_index += 1;
938 89248100 : } while (row_index < 8);
939 :
940 : // row 8, 9, ..., 15
941 11158200 : src_index += 8;
942 : do {
943 89237900 : out[dst_index] = in[src_index];
944 89237900 : out[dst_index + 1] = in[src_index + 8];
945 89237900 : dst_index += 2;
946 89237900 : src_index += 1;
947 89237900 : row_index += 1;
948 89237900 : } while (row_index < 16);
949 11158200 : }
950 :
951 11155200 : static INLINE void load_buffer_16x16(const int16_t *input, __m256i *out,
952 : int32_t stride, int32_t flipud, int32_t fliplr, int32_t shift) {
953 : __m256i in[32];
954 : // Load 4 8x8 blocks
955 11155200 : const int16_t *topL = input;
956 11155200 : const int16_t *topR = input + 8;
957 11155200 : const int16_t *botL = input + 8 * stride;
958 11155200 : const int16_t *botR = input + 8 * stride + 8;
959 :
960 : const int16_t *tmp;
961 :
962 11155200 : if (flipud) {
963 : // Swap left columns
964 387231 : tmp = topL;
965 387231 : topL = botL;
966 387231 : botL = tmp;
967 : // Swap right columns
968 387231 : tmp = topR;
969 387231 : topR = botR;
970 387231 : botR = tmp;
971 : }
972 :
973 11155200 : if (fliplr) {
974 : // Swap top rows
975 387043 : tmp = topL;
976 387043 : topL = topR;
977 387043 : topR = tmp;
978 : // Swap bottom rows
979 387043 : tmp = botL;
980 387043 : botL = botR;
981 387043 : botR = tmp;
982 : }
983 :
984 : // load first 8 columns
985 11155200 : load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
986 11157400 : load_buffer_8x8(botL, &in[16], stride, flipud, fliplr, shift);
987 :
988 : // load second 8 columns
989 11158200 : load_buffer_8x8(topR, &in[8], stride, flipud, fliplr, shift);
990 11158000 : load_buffer_8x8(botR, &in[24], stride, flipud, fliplr, shift);
991 :
992 11158000 : convert_8x8_to_16x16(in, out);
993 11156600 : }
994 :
995 31536600 : static INLINE void col_txfm_16x16_rounding(__m256i *in, int32_t shift) {
996 31536600 : col_txfm_8x8_rounding(&in[0], shift);
997 31539100 : col_txfm_8x8_rounding(&in[8], shift);
998 31539700 : col_txfm_8x8_rounding(&in[16], shift);
999 31539900 : col_txfm_8x8_rounding(&in[24], shift);
1000 31542400 : }
1001 :
1002 2550280 : static void fidtx16x16_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
1003 : (void)bit;
1004 2550280 : const int32_t bits = 12; // NewSqrt2Bits = 12
1005 2550280 : const int32_t sqrt = 2 * 5793; // 2 * NewSqrt2
1006 2550280 : const __m256i newsqrt = _mm256_set1_epi32(sqrt);
1007 2550280 : const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
1008 : __m256i temp;
1009 2550280 : int32_t num_iters = 16 * col_num;
1010 68711300 : for (int32_t i = 0; i < num_iters; i++) {
1011 132322000 : temp = _mm256_mullo_epi32(in[i], newsqrt);
1012 66161100 : temp = _mm256_add_epi32(temp, rounding);
1013 132322000 : out[i] = _mm256_srai_epi32(temp, bits);
1014 : }
1015 2550280 : }
1016 :
1017 7144750 : static INLINE void write_buffer_16x16(const __m256i *res, int32_t *output) {
1018 7144750 : int32_t fact = -1, index = -1;
1019 64273700 : for (int32_t i = 0; i < 8; i++)
1020 : {
1021 57128900 : _mm256_store_si256((__m256i *)(output + (++fact) * 16), res[++index]);
1022 57128900 : _mm256_store_si256((__m256i *)(output + (fact) * 16 + 8), res[++index]);
1023 57128900 : _mm256_store_si256((__m256i *)(output + (++fact) * 16), res[++index]);
1024 57128900 : _mm256_store_si256((__m256i *)(output + (fact) * 16 + 8), res[++index]);
1025 : }
1026 7144750 : }
1027 :
1028 12675600 : static INLINE void fdct4x8_row_avx2(__m256i *input, __m256i *output, int32_t bit,
1029 : const int32_t num_col) {
1030 12675600 : const int32_t *cospi = cospi_arr(bit);
1031 12675300 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1032 12675300 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1033 12675300 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1034 12675300 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1035 : __m256i in[4];
1036 : __m256i out[4];
1037 : __m256i s0, s1, s2, s3;
1038 : __m256i u0, u1, u2, u3;
1039 : __m256i v0, v1, v2, v3;
1040 12675300 : int32_t endidx = 3 * num_col;
1041 :
1042 12675300 : in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
1043 12675300 : in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
1044 12675300 : in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
1045 12675300 : in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
1046 :
1047 12675300 : s0 = _mm256_add_epi32(in[0], in[endidx]);
1048 12675300 : s3 = _mm256_sub_epi32(in[0], in[endidx]);
1049 12675300 : endidx -= num_col;
1050 12675300 : s1 = _mm256_add_epi32(in[num_col], in[endidx]);
1051 25350700 : s2 = _mm256_sub_epi32(in[num_col], in[endidx]);
1052 :
1053 : // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
1054 12675300 : u0 = _mm256_mullo_epi32(s0, cospi32);
1055 12675300 : u1 = _mm256_mullo_epi32(s1, cospi32);
1056 12675300 : u2 = _mm256_add_epi32(u0, u1);
1057 12675300 : v0 = _mm256_sub_epi32(u0, u1);
1058 :
1059 12675300 : u3 = _mm256_add_epi32(u2, rnding);
1060 12675300 : v1 = _mm256_add_epi32(v0, rnding);
1061 :
1062 12675300 : u0 = _mm256_srai_epi32(u3, bit);
1063 12675300 : u2 = _mm256_srai_epi32(v1, bit);
1064 :
1065 : // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
1066 12675300 : v0 = _mm256_mullo_epi32(s2, cospi48);
1067 12675300 : v1 = _mm256_mullo_epi32(s3, cospi16);
1068 12675300 : v2 = _mm256_add_epi32(v0, v1);
1069 :
1070 12675300 : v3 = _mm256_add_epi32(v2, rnding);
1071 12675300 : u1 = _mm256_srai_epi32(v3, bit);
1072 :
1073 12675300 : v0 = _mm256_mullo_epi32(s2, cospi16);
1074 12675300 : v1 = _mm256_mullo_epi32(s3, cospi48);
1075 12675300 : v2 = _mm256_sub_epi32(v1, v0);
1076 :
1077 12675300 : v3 = _mm256_add_epi32(v2, rnding);
1078 12675300 : u3 = _mm256_srai_epi32(v3, bit);
1079 :
1080 : // Note: shift[1] and shift[2] are zeros
1081 :
1082 : // Transpose 4x4 32-bit
1083 12675300 : v0 = _mm256_unpacklo_epi32(u0, u1);
1084 12675300 : v1 = _mm256_unpackhi_epi32(u0, u1);
1085 12675300 : v2 = _mm256_unpacklo_epi32(u2, u3);
1086 12675300 : v3 = _mm256_unpackhi_epi32(u2, u3);
1087 :
1088 12675300 : out[0] = _mm256_unpacklo_epi64(v0, v2);
1089 12675300 : out[1] = _mm256_unpackhi_epi64(v0, v2);
1090 12675300 : out[2] = _mm256_unpacklo_epi64(v1, v3);
1091 12675300 : out[3] = _mm256_unpackhi_epi64(v1, v3);
1092 :
1093 12675300 : output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
1094 12675300 : output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
1095 12675300 : output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
1096 12675300 : output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
1097 12675300 : }
1098 :
1099 12716600 : static INLINE void fdct4x8_col_avx2(__m256i *in, __m256i *output, int32_t bit,
1100 : const int32_t num_col) {
1101 12716600 : const int32_t *cospi = cospi_arr(bit);
1102 12716100 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1103 12716100 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1104 12716100 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1105 12716100 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1106 : __m256i s0, s1, s2, s3;
1107 : __m256i u0, u1, u2, u3;
1108 : __m256i v0, v1, v2, v3;
1109 : __m256i out[4];
1110 :
1111 12716100 : int32_t endidx = 3 * num_col;
1112 12716100 : s0 = _mm256_add_epi32(in[0], in[endidx]);
1113 12716100 : s3 = _mm256_sub_epi32(in[0], in[endidx]);
1114 12716100 : endidx -= num_col;
1115 12716100 : s1 = _mm256_add_epi32(in[num_col], in[endidx]);
1116 25432200 : s2 = _mm256_sub_epi32(in[num_col], in[endidx]);
1117 :
1118 : // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
1119 12716100 : u0 = _mm256_mullo_epi32(s0, cospi32);
1120 12716100 : u1 = _mm256_mullo_epi32(s1, cospi32);
1121 12716100 : u2 = _mm256_add_epi32(u0, u1);
1122 12716100 : v0 = _mm256_sub_epi32(u0, u1);
1123 :
1124 12716100 : u3 = _mm256_add_epi32(u2, rnding);
1125 12716100 : v1 = _mm256_add_epi32(v0, rnding);
1126 :
1127 12716100 : u0 = _mm256_srai_epi32(u3, bit);
1128 12716100 : u2 = _mm256_srai_epi32(v1, bit);
1129 :
1130 : // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
1131 12716100 : v0 = _mm256_mullo_epi32(s2, cospi48);
1132 12716100 : v1 = _mm256_mullo_epi32(s3, cospi16);
1133 12716100 : v2 = _mm256_add_epi32(v0, v1);
1134 :
1135 12716100 : v3 = _mm256_add_epi32(v2, rnding);
1136 12716100 : u1 = _mm256_srai_epi32(v3, bit);
1137 :
1138 12716100 : v0 = _mm256_mullo_epi32(s2, cospi16);
1139 12716100 : v1 = _mm256_mullo_epi32(s3, cospi48);
1140 12716100 : v2 = _mm256_sub_epi32(v1, v0);
1141 :
1142 12716100 : v3 = _mm256_add_epi32(v2, rnding);
1143 12716100 : u3 = _mm256_srai_epi32(v3, bit);
1144 :
1145 : // Note: shift[1] and shift[2] are zeros
1146 :
1147 : // Transpose 4x4 32-bit
1148 12716100 : v0 = _mm256_unpacklo_epi32(u0, u1);
1149 12716100 : v1 = _mm256_unpackhi_epi32(u0, u1);
1150 12716100 : v2 = _mm256_unpacklo_epi32(u2, u3);
1151 12716100 : v3 = _mm256_unpackhi_epi32(u2, u3);
1152 :
1153 12716100 : out[0] = _mm256_unpacklo_epi64(v0, v2);
1154 12716100 : out[1] = _mm256_unpackhi_epi64(v0, v2);
1155 12716100 : out[2] = _mm256_unpacklo_epi64(v1, v3);
1156 12716100 : out[3] = _mm256_unpackhi_epi64(v1, v3);
1157 :
1158 12716100 : output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
1159 12716100 : output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
1160 12716100 : output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
1161 12716100 : output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
1162 12716100 : }
1163 :
1164 7345900 : static INLINE void fdct16x4_avx2(__m256i *input, __m256i *output, int32_t bit) {
1165 7345900 : __m128i *in = (__m128i *)input;
1166 7345900 : __m128i *out = (__m128i *)output;
1167 :
1168 7345900 : const int32_t *cospi = cospi_arr(bit);
1169 7345760 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1170 7345760 : const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
1171 7345760 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1172 7345760 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1173 7345760 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1174 7345760 : const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1175 7345760 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1176 7345760 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1177 7345760 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1178 7345760 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1179 7345760 : const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1180 7345760 : const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1181 7345760 : const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1182 7345760 : const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
1183 7345760 : const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1184 7345760 : const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1185 7345760 : const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1186 7345760 : const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
1187 7345760 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1188 : __m128i u[16], v[16], x;
1189 :
1190 : // stage 0
1191 : // stage 1
1192 7345760 : u[0] = _mm_add_epi32(in[0], in[15]);
1193 7345760 : v[15] = _mm_sub_epi32(in[0], in[15]);
1194 7345760 : u[1] = _mm_add_epi32(in[1], in[14]);
1195 7345760 : v[14] = _mm_sub_epi32(in[1], in[14]);
1196 7345760 : u[2] = _mm_add_epi32(in[2], in[13]);
1197 7345760 : u[13] = _mm_sub_epi32(in[2], in[13]);
1198 7345760 : u[3] = _mm_add_epi32(in[3], in[12]);
1199 7345760 : u[12] = _mm_sub_epi32(in[3], in[12]);
1200 7345760 : u[4] = _mm_add_epi32(in[4], in[11]);
1201 7345760 : u[11] = _mm_sub_epi32(in[4], in[11]);
1202 7345760 : u[5] = _mm_add_epi32(in[5], in[10]);
1203 7345760 : u[10] = _mm_sub_epi32(in[5], in[10]);
1204 7345760 : u[6] = _mm_add_epi32(in[6], in[9]);
1205 7345760 : v[9] = _mm_sub_epi32(in[6], in[9]);
1206 7345760 : u[7] = _mm_add_epi32(in[7], in[8]);
1207 7345760 : v[8] = _mm_sub_epi32(in[7], in[8]);
1208 :
1209 : // stage 2
1210 7345760 : v[0] = _mm_add_epi32(u[0], u[7]);
1211 7345760 : u[7] = _mm_sub_epi32(u[0], u[7]);
1212 7345760 : v[1] = _mm_add_epi32(u[1], u[6]);
1213 7345760 : v[6] = _mm_sub_epi32(u[1], u[6]);
1214 7345760 : v[2] = _mm_add_epi32(u[2], u[5]);
1215 7345760 : v[5] = _mm_sub_epi32(u[2], u[5]);
1216 7345760 : v[3] = _mm_add_epi32(u[3], u[4]);
1217 7345760 : u[4] = _mm_sub_epi32(u[3], u[4]);
1218 :
1219 7345760 : v[10] = _mm_mullo_epi32(u[10], cospim32);
1220 7345760 : x = _mm_mullo_epi32(u[13], cospi32);
1221 7345760 : v[10] = _mm_add_epi32(v[10], x);
1222 7345760 : v[10] = _mm_add_epi32(v[10], rnding);
1223 7345760 : v[10] = _mm_srai_epi32(v[10], bit);
1224 :
1225 7345760 : v[13] = _mm_mullo_epi32(u[10], cospi32);
1226 7345760 : x = _mm_mullo_epi32(u[13], cospim32);
1227 7345760 : v[13] = _mm_sub_epi32(v[13], x);
1228 7345760 : v[13] = _mm_add_epi32(v[13], rnding);
1229 7345760 : v[13] = _mm_srai_epi32(v[13], bit);
1230 :
1231 7345760 : v[11] = _mm_mullo_epi32(u[11], cospim32);
1232 7345760 : x = _mm_mullo_epi32(u[12], cospi32);
1233 7345760 : v[11] = _mm_add_epi32(v[11], x);
1234 7345760 : v[11] = _mm_add_epi32(v[11], rnding);
1235 7345760 : v[11] = _mm_srai_epi32(v[11], bit);
1236 :
1237 7345760 : v[12] = _mm_mullo_epi32(u[11], cospi32);
1238 7345760 : x = _mm_mullo_epi32(u[12], cospim32);
1239 7345760 : v[12] = _mm_sub_epi32(v[12], x);
1240 7345760 : v[12] = _mm_add_epi32(v[12], rnding);
1241 7345760 : v[12] = _mm_srai_epi32(v[12], bit);
1242 :
1243 : // stage 3
1244 7345760 : u[0] = _mm_add_epi32(v[0], v[3]);
1245 7345760 : u[3] = _mm_sub_epi32(v[0], v[3]);
1246 7345760 : u[1] = _mm_add_epi32(v[1], v[2]);
1247 7345760 : u[2] = _mm_sub_epi32(v[1], v[2]);
1248 :
1249 7345760 : u[5] = _mm_mullo_epi32(v[5], cospim32);
1250 7345760 : x = _mm_mullo_epi32(v[6], cospi32);
1251 7345760 : u[5] = _mm_add_epi32(u[5], x);
1252 7345760 : u[5] = _mm_add_epi32(u[5], rnding);
1253 7345760 : u[5] = _mm_srai_epi32(u[5], bit);
1254 :
1255 7345760 : u[6] = _mm_mullo_epi32(v[5], cospi32);
1256 7345760 : x = _mm_mullo_epi32(v[6], cospim32);
1257 7345760 : u[6] = _mm_sub_epi32(u[6], x);
1258 7345760 : u[6] = _mm_add_epi32(u[6], rnding);
1259 7345760 : u[6] = _mm_srai_epi32(u[6], bit);
1260 :
1261 7345760 : u[8] = _mm_add_epi32(v[8], v[11]);
1262 7345760 : v[11] = _mm_sub_epi32(v[8], v[11]);
1263 7345760 : u[9] = _mm_add_epi32(v[9], v[10]);
1264 7345760 : u[10] = _mm_sub_epi32(v[9], v[10]);
1265 7345760 : u[12] = _mm_sub_epi32(v[15], v[12]);
1266 7345760 : v[15] = _mm_add_epi32(v[15], v[12]);
1267 7345760 : u[13] = _mm_sub_epi32(v[14], v[13]);
1268 7345760 : u[14] = _mm_add_epi32(v[14], v[13]);
1269 :
1270 : // stage 4
1271 7345760 : u[0] = _mm_mullo_epi32(u[0], cospi32);
1272 7345760 : u[1] = _mm_mullo_epi32(u[1], cospi32);
1273 7345760 : v[0] = _mm_add_epi32(u[0], u[1]);
1274 7345760 : v[0] = _mm_add_epi32(v[0], rnding);
1275 7345760 : out[0] = _mm_srai_epi32(v[0], bit);
1276 :
1277 7345760 : v[1] = _mm_sub_epi32(u[0], u[1]);
1278 7345760 : v[1] = _mm_add_epi32(v[1], rnding);
1279 7345760 : out[8] = _mm_srai_epi32(v[1], bit);
1280 :
1281 7345760 : v[2] = _mm_mullo_epi32(u[2], cospi48);
1282 7345760 : x = _mm_mullo_epi32(u[3], cospi16);
1283 7345760 : v[2] = _mm_add_epi32(v[2], x);
1284 7345760 : v[2] = _mm_add_epi32(v[2], rnding);
1285 7345760 : out[4] = _mm_srai_epi32(v[2], bit);
1286 :
1287 7345760 : v[3] = _mm_mullo_epi32(u[2], cospi16);
1288 7345760 : x = _mm_mullo_epi32(u[3], cospi48);
1289 7345760 : v[3] = _mm_sub_epi32(x, v[3]);
1290 7345760 : v[3] = _mm_add_epi32(v[3], rnding);
1291 7345760 : out[12] = _mm_srai_epi32(v[3], bit);
1292 :
1293 7345760 : v[4] = _mm_add_epi32(u[4], u[5]);
1294 7345760 : v[5] = _mm_sub_epi32(u[4], u[5]);
1295 7345760 : v[6] = _mm_sub_epi32(u[7], u[6]);
1296 7345760 : v[7] = _mm_add_epi32(u[7], u[6]);
1297 7345760 : v[8] = u[8];
1298 :
1299 7345760 : v[9] = _mm_mullo_epi32(u[9], cospim16);
1300 7345760 : x = _mm_mullo_epi32(u[14], cospi48);
1301 7345760 : v[9] = _mm_add_epi32(v[9], x);
1302 7345760 : v[9] = _mm_add_epi32(v[9], rnding);
1303 7345760 : v[9] = _mm_srai_epi32(v[9], bit);
1304 :
1305 7345760 : v[14] = _mm_mullo_epi32(u[9], cospi48);
1306 7345760 : x = _mm_mullo_epi32(u[14], cospim16);
1307 7345760 : v[14] = _mm_sub_epi32(v[14], x);
1308 7345760 : v[14] = _mm_add_epi32(v[14], rnding);
1309 7345760 : v[14] = _mm_srai_epi32(v[14], bit);
1310 :
1311 7345760 : v[10] = _mm_mullo_epi32(u[10], cospim48);
1312 7345760 : x = _mm_mullo_epi32(u[13], cospim16);
1313 7345760 : v[10] = _mm_add_epi32(v[10], x);
1314 7345760 : v[10] = _mm_add_epi32(v[10], rnding);
1315 7345760 : v[10] = _mm_srai_epi32(v[10], bit);
1316 :
1317 7345760 : v[13] = _mm_mullo_epi32(u[10], cospim16);
1318 7345760 : x = _mm_mullo_epi32(u[13], cospim48);
1319 7345760 : v[13] = _mm_sub_epi32(v[13], x);
1320 7345760 : v[13] = _mm_add_epi32(v[13], rnding);
1321 7345760 : v[13] = _mm_srai_epi32(v[13], bit);
1322 :
1323 7345760 : v[12] = u[12];
1324 :
1325 : // stage 5
1326 7345760 : u[4] = _mm_mullo_epi32(v[4], cospi56);
1327 7345760 : x = _mm_mullo_epi32(v[7], cospi8);
1328 7345760 : u[4] = _mm_add_epi32(u[4], x);
1329 7345760 : u[4] = _mm_add_epi32(u[4], rnding);
1330 7345760 : out[2] = _mm_srai_epi32(u[4], bit);
1331 :
1332 7345760 : u[7] = _mm_mullo_epi32(v[4], cospi8);
1333 7345760 : x = _mm_mullo_epi32(v[7], cospi56);
1334 7345760 : u[7] = _mm_sub_epi32(x, u[7]);
1335 7345760 : u[7] = _mm_add_epi32(u[7], rnding);
1336 7345760 : out[14] = _mm_srai_epi32(u[7], bit);
1337 :
1338 7345760 : u[5] = _mm_mullo_epi32(v[5], cospi24);
1339 7345760 : x = _mm_mullo_epi32(v[6], cospi40);
1340 7345760 : u[5] = _mm_add_epi32(u[5], x);
1341 7345760 : u[5] = _mm_add_epi32(u[5], rnding);
1342 7345760 : out[10] = _mm_srai_epi32(u[5], bit);
1343 :
1344 7345760 : u[6] = _mm_mullo_epi32(v[5], cospi40);
1345 7345760 : x = _mm_mullo_epi32(v[6], cospi24);
1346 7345760 : u[6] = _mm_sub_epi32(x, u[6]);
1347 7345760 : u[6] = _mm_add_epi32(u[6], rnding);
1348 7345760 : out[6] = _mm_srai_epi32(u[6], bit);
1349 :
1350 7345760 : u[8] = _mm_add_epi32(v[8], v[9]);
1351 7345760 : u[9] = _mm_sub_epi32(v[8], v[9]);
1352 7345760 : u[10] = _mm_sub_epi32(v[11], v[10]);
1353 7345760 : u[11] = _mm_add_epi32(v[11], v[10]);
1354 7345760 : u[12] = _mm_add_epi32(v[12], v[13]);
1355 7345760 : u[13] = _mm_sub_epi32(v[12], v[13]);
1356 7345760 : u[14] = _mm_sub_epi32(v[15], v[14]);
1357 7345760 : u[15] = _mm_add_epi32(v[15], v[14]);
1358 :
1359 : // stage 6
1360 7345760 : v[8] = _mm_mullo_epi32(u[8], cospi60);
1361 7345760 : x = _mm_mullo_epi32(u[15], cospi4);
1362 7345760 : v[8] = _mm_add_epi32(v[8], x);
1363 7345760 : v[8] = _mm_add_epi32(v[8], rnding);
1364 7345760 : out[1] = _mm_srai_epi32(v[8], bit);
1365 :
1366 7345760 : v[15] = _mm_mullo_epi32(u[8], cospi4);
1367 7345760 : x = _mm_mullo_epi32(u[15], cospi60);
1368 7345760 : v[15] = _mm_sub_epi32(x, v[15]);
1369 7345760 : v[15] = _mm_add_epi32(v[15], rnding);
1370 7345760 : out[15] = _mm_srai_epi32(v[15], bit);
1371 :
1372 7345760 : v[9] = _mm_mullo_epi32(u[9], cospi28);
1373 7345760 : x = _mm_mullo_epi32(u[14], cospi36);
1374 7345760 : v[9] = _mm_add_epi32(v[9], x);
1375 7345760 : v[9] = _mm_add_epi32(v[9], rnding);
1376 7345760 : out[9] = _mm_srai_epi32(v[9], bit);
1377 :
1378 7345760 : v[14] = _mm_mullo_epi32(u[9], cospi36);
1379 7345760 : x = _mm_mullo_epi32(u[14], cospi28);
1380 7345760 : v[14] = _mm_sub_epi32(x, v[14]);
1381 7345760 : v[14] = _mm_add_epi32(v[14], rnding);
1382 7345760 : out[7] = _mm_srai_epi32(v[14], bit);
1383 :
1384 7345760 : v[10] = _mm_mullo_epi32(u[10], cospi44);
1385 7345760 : x = _mm_mullo_epi32(u[13], cospi20);
1386 7345760 : v[10] = _mm_add_epi32(v[10], x);
1387 7345760 : v[10] = _mm_add_epi32(v[10], rnding);
1388 7345760 : out[5] = _mm_srai_epi32(v[10], bit);
1389 :
1390 7345760 : v[13] = _mm_mullo_epi32(u[10], cospi20);
1391 7345760 : x = _mm_mullo_epi32(u[13], cospi44);
1392 7345760 : v[13] = _mm_sub_epi32(x, v[13]);
1393 7345760 : v[13] = _mm_add_epi32(v[13], rnding);
1394 7345760 : out[11] = _mm_srai_epi32(v[13], bit);
1395 :
1396 7345760 : v[11] = _mm_mullo_epi32(u[11], cospi12);
1397 7345760 : x = _mm_mullo_epi32(u[12], cospi52);
1398 7345760 : v[11] = _mm_add_epi32(v[11], x);
1399 7345760 : v[11] = _mm_add_epi32(v[11], rnding);
1400 7345760 : out[13] = _mm_srai_epi32(v[11], bit);
1401 :
1402 7345760 : v[12] = _mm_mullo_epi32(u[11], cospi52);
1403 7345760 : x = _mm_mullo_epi32(u[12], cospi12);
1404 7345760 : v[12] = _mm_sub_epi32(x, v[12]);
1405 7345760 : v[12] = _mm_add_epi32(v[12], rnding);
1406 7345760 : out[3] = _mm_srai_epi32(v[12], bit);
1407 7345760 : }
1408 :
1409 3275680 : static INLINE void fadst8x4_avx2(__m256i *input, __m256i *output, int32_t bit,
1410 : const int32_t col_num) {
1411 3275680 : __m128i *in = (__m128i *)input;
1412 3275680 : __m128i *out = (__m128i *)output;
1413 3275680 : const int32_t *cospi = cospi_arr(bit);
1414 3275650 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1415 3275650 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1416 3275650 : const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1417 3275650 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1418 3275650 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1419 3275650 : const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1420 3275650 : const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
1421 3275650 : const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1422 3275650 : const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1423 3275650 : const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
1424 3275650 : const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1425 3275650 : const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1426 3275650 : const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
1427 3275650 : const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
1428 3275650 : const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
1429 3275650 : const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
1430 3275650 : const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1431 6551310 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1432 3275650 : const __m128i zero = _mm_setzero_si128();
1433 : __m128i u0, u1, u2, u3, u4, u5, u6, u7;
1434 : __m128i v0, v1, v2, v3, v4, v5, v6, v7;
1435 : __m128i x, y;
1436 : int32_t col;
1437 :
1438 : // Note:
1439 : // Even column: 0, 2, ..., 14
1440 : // Odd column: 1, 3, ..., 15
1441 : // one even column plus one odd column constructs one row (8 coeffs)
1442 : // total we have 8 rows (8x8).
1443 6551390 : for (col = 0; col < col_num; ++col) {
1444 : // stage 0
1445 : // stage 1
1446 3275740 : u0 = in[col_num * 0 + col];
1447 3275740 : u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
1448 3275740 : u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
1449 3275740 : u3 = in[col_num * 4 + col];
1450 3275740 : u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
1451 3275740 : u5 = in[col_num * 6 + col];
1452 3275740 : u6 = in[col_num * 2 + col];
1453 3275740 : u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);
1454 :
1455 : // stage 2
1456 3275740 : v0 = u0;
1457 3275740 : v1 = u1;
1458 :
1459 3275740 : x = _mm_mullo_epi32(u2, cospi32);
1460 3275740 : y = _mm_mullo_epi32(u3, cospi32);
1461 3275740 : v2 = _mm_add_epi32(x, y);
1462 3275740 : v2 = _mm_add_epi32(v2, rnding);
1463 3275740 : v2 = _mm_srai_epi32(v2, bit);
1464 :
1465 3275740 : v3 = _mm_sub_epi32(x, y);
1466 3275740 : v3 = _mm_add_epi32(v3, rnding);
1467 3275740 : v3 = _mm_srai_epi32(v3, bit);
1468 :
1469 3275740 : v4 = u4;
1470 3275740 : v5 = u5;
1471 :
1472 3275740 : x = _mm_mullo_epi32(u6, cospi32);
1473 3275740 : y = _mm_mullo_epi32(u7, cospi32);
1474 3275740 : v6 = _mm_add_epi32(x, y);
1475 3275740 : v6 = _mm_add_epi32(v6, rnding);
1476 3275740 : v6 = _mm_srai_epi32(v6, bit);
1477 :
1478 3275740 : v7 = _mm_sub_epi32(x, y);
1479 3275740 : v7 = _mm_add_epi32(v7, rnding);
1480 3275740 : v7 = _mm_srai_epi32(v7, bit);
1481 :
1482 : // stage 3
1483 3275740 : u0 = _mm_add_epi32(v0, v2);
1484 3275740 : u1 = _mm_add_epi32(v1, v3);
1485 3275740 : u2 = _mm_sub_epi32(v0, v2);
1486 3275740 : u3 = _mm_sub_epi32(v1, v3);
1487 3275740 : u4 = _mm_add_epi32(v4, v6);
1488 3275740 : u5 = _mm_add_epi32(v5, v7);
1489 3275740 : u6 = _mm_sub_epi32(v4, v6);
1490 3275740 : u7 = _mm_sub_epi32(v5, v7);
1491 :
1492 : // stage 4
1493 3275740 : v0 = u0;
1494 3275740 : v1 = u1;
1495 3275740 : v2 = u2;
1496 3275740 : v3 = u3;
1497 :
1498 3275740 : x = _mm_mullo_epi32(u4, cospi16);
1499 3275740 : y = _mm_mullo_epi32(u5, cospi48);
1500 3275740 : v4 = _mm_add_epi32(x, y);
1501 3275740 : v4 = _mm_add_epi32(v4, rnding);
1502 3275740 : v4 = _mm_srai_epi32(v4, bit);
1503 :
1504 3275740 : x = _mm_mullo_epi32(u4, cospi48);
1505 3275740 : y = _mm_mullo_epi32(u5, cospim16);
1506 3275740 : v5 = _mm_add_epi32(x, y);
1507 3275740 : v5 = _mm_add_epi32(v5, rnding);
1508 3275740 : v5 = _mm_srai_epi32(v5, bit);
1509 :
1510 3275740 : x = _mm_mullo_epi32(u6, cospim48);
1511 3275740 : y = _mm_mullo_epi32(u7, cospi16);
1512 3275740 : v6 = _mm_add_epi32(x, y);
1513 3275740 : v6 = _mm_add_epi32(v6, rnding);
1514 3275740 : v6 = _mm_srai_epi32(v6, bit);
1515 :
1516 3275740 : x = _mm_mullo_epi32(u6, cospi16);
1517 3275740 : y = _mm_mullo_epi32(u7, cospi48);
1518 3275740 : v7 = _mm_add_epi32(x, y);
1519 3275740 : v7 = _mm_add_epi32(v7, rnding);
1520 3275740 : v7 = _mm_srai_epi32(v7, bit);
1521 :
1522 : // stage 5
1523 3275740 : u0 = _mm_add_epi32(v0, v4);
1524 3275740 : u1 = _mm_add_epi32(v1, v5);
1525 3275740 : u2 = _mm_add_epi32(v2, v6);
1526 3275740 : u3 = _mm_add_epi32(v3, v7);
1527 3275740 : u4 = _mm_sub_epi32(v0, v4);
1528 3275740 : u5 = _mm_sub_epi32(v1, v5);
1529 3275740 : u6 = _mm_sub_epi32(v2, v6);
1530 3275740 : u7 = _mm_sub_epi32(v3, v7);
1531 :
1532 : // stage 6
1533 3275740 : x = _mm_mullo_epi32(u0, cospi4);
1534 3275740 : y = _mm_mullo_epi32(u1, cospi60);
1535 3275740 : v0 = _mm_add_epi32(x, y);
1536 3275740 : v0 = _mm_add_epi32(v0, rnding);
1537 6551470 : out[col_num * 7 + col] = _mm_srai_epi32(v0, bit);
1538 :
1539 3275740 : x = _mm_mullo_epi32(u0, cospi60);
1540 3275740 : y = _mm_mullo_epi32(u1, cospim4);
1541 3275740 : v1 = _mm_add_epi32(x, y);
1542 3275740 : v1 = _mm_add_epi32(v1, rnding);
1543 6551470 : out[col_num * 0 + col] = _mm_srai_epi32(v1, bit);
1544 :
1545 3275740 : x = _mm_mullo_epi32(u2, cospi20);
1546 3275740 : y = _mm_mullo_epi32(u3, cospi44);
1547 3275740 : v2 = _mm_add_epi32(x, y);
1548 3275740 : v2 = _mm_add_epi32(v2, rnding);
1549 6551470 : out[col_num * 5 + col] = _mm_srai_epi32(v2, bit);
1550 :
1551 3275740 : x = _mm_mullo_epi32(u2, cospi44);
1552 3275740 : y = _mm_mullo_epi32(u3, cospim20);
1553 3275740 : v3 = _mm_add_epi32(x, y);
1554 3275740 : v3 = _mm_add_epi32(v3, rnding);
1555 6551470 : out[col_num * 2 + col] = _mm_srai_epi32(v3, bit);
1556 :
1557 3275740 : x = _mm_mullo_epi32(u4, cospi36);
1558 3275740 : y = _mm_mullo_epi32(u5, cospi28);
1559 3275740 : v4 = _mm_add_epi32(x, y);
1560 3275740 : v4 = _mm_add_epi32(v4, rnding);
1561 6551470 : out[col_num * 3 + col] = _mm_srai_epi32(v4, bit);
1562 :
1563 3275740 : x = _mm_mullo_epi32(u4, cospi28);
1564 3275740 : y = _mm_mullo_epi32(u5, cospim36);
1565 3275740 : v5 = _mm_add_epi32(x, y);
1566 3275740 : v5 = _mm_add_epi32(v5, rnding);
1567 6551470 : out[col_num * 4 + col] = _mm_srai_epi32(v5, bit);
1568 :
1569 3275740 : x = _mm_mullo_epi32(u6, cospi52);
1570 3275740 : y = _mm_mullo_epi32(u7, cospi12);
1571 3275740 : v6 = _mm_add_epi32(x, y);
1572 3275740 : v6 = _mm_add_epi32(v6, rnding);
1573 6551470 : out[col_num * 1 + col] = _mm_srai_epi32(v6, bit);
1574 :
1575 3275740 : x = _mm_mullo_epi32(u6, cospi12);
1576 3275740 : y = _mm_mullo_epi32(u7, cospim52);
1577 3275740 : v7 = _mm_add_epi32(x, y);
1578 3275740 : v7 = _mm_add_epi32(v7, rnding);
1579 6551470 : out[col_num * 6 + col] = _mm_srai_epi32(v7, bit);
1580 : }
1581 3275650 : }
1582 :
1583 2709950 : static INLINE void fadst16x4_avx2(__m256i *input, __m256i *output, int32_t bit) {
1584 2709950 : __m128i *in = (__m128i *)input;
1585 2709950 : __m128i *out = (__m128i *)output;
1586 :
1587 2709950 : const int32_t *cospi = cospi_arr(bit);
1588 2709930 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1589 2709930 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1590 2709930 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1591 2709930 : const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1592 2709930 : const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1593 2709930 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1594 2709930 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1595 2709930 : const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
1596 2709930 : const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
1597 2709930 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1598 2709930 : const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
1599 2709930 : const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1600 2709930 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1601 2709930 : const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
1602 2709930 : const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
1603 2709930 : const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
1604 2709930 : const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
1605 2709930 : const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
1606 2709930 : const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
1607 2709930 : const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
1608 2709930 : const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
1609 2709930 : const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
1610 2709930 : const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
1611 2709930 : const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
1612 2709930 : const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
1613 2709930 : const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
1614 2709930 : const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
1615 2709930 : const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
1616 2709930 : const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
1617 2709930 : const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
1618 2709930 : const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
1619 2709930 : const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
1620 2709930 : const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
1621 2709930 : const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
1622 2709930 : const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
1623 2709930 : const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
1624 2709930 : const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
1625 5419860 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1626 2709930 : const __m128i zero = _mm_setzero_si128();
1627 :
1628 : __m128i u[16], v[16], x, y;
1629 : __m128i tmp[13];
1630 :
1631 2709930 : tmp[0] = _mm_sub_epi32(zero, in[15]);
1632 2709930 : u[2] = _mm_sub_epi32(zero, in[7]);
1633 2709930 : tmp[1] = _mm_sub_epi32(zero, in[3]);
1634 2709930 : u[7] = _mm_sub_epi32(zero, in[11]);
1635 2709930 : tmp[2] = _mm_sub_epi32(zero, in[1]);
1636 2709930 : u[11] = _mm_sub_epi32(zero, in[9]);
1637 2709930 : tmp[3] = _mm_sub_epi32(zero, in[13]);
1638 2709930 : u[14] = _mm_sub_epi32(zero, in[5]);
1639 :
1640 : // stage 2
1641 :
1642 2709930 : x = _mm_mullo_epi32(u[2], cospi32);
1643 5419860 : y = _mm_mullo_epi32(in[8], cospi32);
1644 2709930 : v[2] = _mm_add_epi32(x, y);
1645 2709930 : v[2] = _mm_add_epi32(v[2], rnding);
1646 5419860 : v[2] = _mm_srai_epi32(v[2], bit);
1647 :
1648 2709930 : v[3] = _mm_sub_epi32(x, y);
1649 2709930 : v[3] = _mm_add_epi32(v[3], rnding);
1650 2709930 : v[3] = _mm_srai_epi32(v[3], bit);
1651 :
1652 2709930 : x = _mm_mullo_epi32(in[4], cospi32);
1653 5419860 : y = _mm_mullo_epi32(u[7], cospi32);
1654 2709930 : v[6] = _mm_add_epi32(x, y);
1655 2709930 : v[6] = _mm_add_epi32(v[6], rnding);
1656 5419860 : v[6] = _mm_srai_epi32(v[6], bit);
1657 :
1658 2709930 : v[7] = _mm_sub_epi32(x, y);
1659 2709930 : v[7] = _mm_add_epi32(v[7], rnding);
1660 2709930 : v[7] = _mm_srai_epi32(v[7], bit);
1661 :
1662 2709930 : x = _mm_mullo_epi32(in[6], cospi32);
1663 5419860 : y = _mm_mullo_epi32(u[11], cospi32);
1664 2709930 : v[10] = _mm_add_epi32(x, y);
1665 2709930 : v[10] = _mm_add_epi32(v[10], rnding);
1666 5419860 : v[10] = _mm_srai_epi32(v[10], bit);
1667 :
1668 2709930 : v[11] = _mm_sub_epi32(x, y);
1669 2709930 : v[11] = _mm_add_epi32(v[11], rnding);
1670 2709930 : v[11] = _mm_srai_epi32(v[11], bit);
1671 :
1672 2709930 : x = _mm_mullo_epi32(u[14], cospi32);
1673 5419860 : y = _mm_mullo_epi32(in[10], cospi32);
1674 2709930 : v[14] = _mm_add_epi32(x, y);
1675 2709930 : v[14] = _mm_add_epi32(v[14], rnding);
1676 5419860 : v[14] = _mm_srai_epi32(v[14], bit);
1677 :
1678 2709930 : v[15] = _mm_sub_epi32(x, y);
1679 2709930 : v[15] = _mm_add_epi32(v[15], rnding);
1680 2709930 : v[15] = _mm_srai_epi32(v[15], bit);
1681 :
1682 : // stage 3
1683 2709930 : tmp[4] = _mm_add_epi32(in[0], v[2]);
1684 2709930 : tmp[5] = _mm_add_epi32(tmp[0], v[3]);
1685 2709930 : tmp[6] = _mm_sub_epi32(in[0], v[2]);
1686 2709930 : tmp[0] = _mm_sub_epi32(tmp[0], v[3]);
1687 2709930 : u[4] = _mm_add_epi32(tmp[1], v[6]);
1688 2709930 : u[5] = _mm_add_epi32(in[12], v[7]);
1689 2709930 : u[6] = _mm_sub_epi32(tmp[1], v[6]);
1690 2709930 : u[7] = _mm_sub_epi32(in[12], v[7]);
1691 2709930 : tmp[1] = _mm_add_epi32(tmp[2], v[10]);
1692 2709930 : tmp[7] = _mm_add_epi32(in[14], v[11]);
1693 2709930 : tmp[2] = _mm_sub_epi32(tmp[2], v[10]);
1694 2709930 : tmp[8] = _mm_sub_epi32(in[14], v[11]);
1695 2709930 : u[12] = _mm_add_epi32(in[2], v[14]);
1696 2709930 : u[13] = _mm_add_epi32(tmp[3], v[15]);
1697 2709930 : u[14] = _mm_sub_epi32(in[2], v[14]);
1698 2709930 : u[15] = _mm_sub_epi32(tmp[3], v[15]);
1699 :
1700 : // stage 4
1701 2709930 : v[4] = half_btf_small(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
1702 2709900 : v[5] = half_btf_small(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
1703 2709880 : v[6] = half_btf_small(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
1704 2709860 : v[7] = half_btf_small(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
1705 2709860 : v[12] = half_btf_small(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
1706 2709860 : v[13] = half_btf_small(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
1707 2709850 : v[14] = half_btf_small(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
1708 2709850 : v[15] = half_btf_small(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
1709 :
1710 : // stage 5
1711 2709860 : tmp[9] = _mm_add_epi32(tmp[4], v[4]);
1712 2709860 : tmp[10] = _mm_add_epi32(tmp[5], v[5]);
1713 2709860 : tmp[11] = _mm_add_epi32(tmp[6], v[6]);
1714 2709860 : tmp[12] = _mm_add_epi32(tmp[0], v[7]);
1715 2709860 : tmp[4] = _mm_sub_epi32(tmp[4], v[4]);
1716 2709860 : tmp[5] = _mm_sub_epi32(tmp[5], v[5]);
1717 2709860 : tmp[6] = _mm_sub_epi32(tmp[6], v[6]);
1718 2709860 : tmp[0] = _mm_sub_epi32(tmp[0], v[7]);
1719 2709860 : u[8] = _mm_add_epi32(tmp[1], v[12]);
1720 2709860 : u[9] = _mm_add_epi32(tmp[7], v[13]);
1721 2709860 : u[10] = _mm_add_epi32(tmp[2], v[14]);
1722 2709860 : u[11] = _mm_add_epi32(tmp[8], v[15]);
1723 2709860 : u[12] = _mm_sub_epi32(tmp[1], v[12]);
1724 2709860 : u[13] = _mm_sub_epi32(tmp[7], v[13]);
1725 2709860 : u[14] = _mm_sub_epi32(tmp[2], v[14]);
1726 2709860 : u[15] = _mm_sub_epi32(tmp[8], v[15]);
1727 :
1728 : // stage 6
1729 2709860 : v[8] = half_btf_small(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
1730 2709900 : v[9] = half_btf_small(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
1731 2709870 : v[10] = half_btf_small(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
1732 2709860 : v[11] = half_btf_small(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
1733 2709870 : v[12] = half_btf_small(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
1734 2709870 : v[13] = half_btf_small(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
1735 2709870 : v[14] = half_btf_small(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
1736 2709860 : v[15] = half_btf_small(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
1737 :
1738 : // stage 7
1739 2709880 : u[0] = _mm_add_epi32(tmp[9], v[8]);
1740 2709880 : u[1] = _mm_add_epi32(tmp[10], v[9]);
1741 2709880 : u[2] = _mm_add_epi32(tmp[11], v[10]);
1742 2709880 : u[3] = _mm_add_epi32(tmp[12], v[11]);
1743 2709880 : u[4] = _mm_add_epi32(tmp[4], v[12]);
1744 2709880 : u[5] = _mm_add_epi32(tmp[5], v[13]);
1745 2709880 : u[6] = _mm_add_epi32(tmp[6], v[14]);
1746 2709880 : u[7] = _mm_add_epi32(tmp[0], v[15]);
1747 2709880 : u[8] = _mm_sub_epi32(tmp[9], v[8]);
1748 2709880 : u[9] = _mm_sub_epi32(tmp[10], v[9]);
1749 2709880 : u[10] = _mm_sub_epi32(tmp[11], v[10]);
1750 2709880 : u[11] = _mm_sub_epi32(tmp[12], v[11]);
1751 2709880 : u[12] = _mm_sub_epi32(tmp[4], v[12]);
1752 2709880 : u[13] = _mm_sub_epi32(tmp[5], v[13]);
1753 2709880 : u[14] = _mm_sub_epi32(tmp[6], v[14]);
1754 2709880 : u[15] = _mm_sub_epi32(tmp[0], v[15]);
1755 :
1756 : // stage 8
1757 2709880 : out[15] = half_btf_small(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
1758 2709910 : out[0] = half_btf_small(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
1759 2709900 : out[13] = half_btf_small(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
1760 2709890 : out[2] = half_btf_small(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
1761 2709890 : out[11] = half_btf_small(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
1762 2709890 : out[4] = half_btf_small(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
1763 2709890 : out[9] = half_btf_small(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
1764 2709890 : out[6] = half_btf_small(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
1765 2709890 : out[7] = half_btf_small(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
1766 2709890 : out[8] = half_btf_small(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
1767 2709890 : out[5] = half_btf_small(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
1768 2709890 : out[10] = half_btf_small(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
1769 2709900 : out[3] = half_btf_small(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
1770 2709890 : out[12] = half_btf_small(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
1771 2709890 : out[1] = half_btf_small(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
1772 2709880 : out[14] = half_btf_small(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
1773 2709880 : }
1774 :
1775 26337600 : static void fdct16x16_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
1776 26337600 : const int32_t *cospi = cospi_arr(bit);
1777 26334400 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1778 26334400 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
1779 26334400 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1780 26334400 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1781 26334400 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1782 26334400 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1783 26334400 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1784 26334400 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1785 26334400 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1786 26334400 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1787 26334400 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1788 26334400 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1789 26334400 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1790 26334400 : const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
1791 26334400 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1792 26334400 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1793 26334400 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1794 26334400 : const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
1795 26334400 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1796 : __m256i u[16], v[16], x;
1797 : int32_t col;
1798 :
1799 80563600 : for (col = 0; col < col_num; ++col) {
1800 : // stage 0
1801 : // stage 1
1802 54229100 : u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
1803 54229100 : u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
1804 54229100 : u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
1805 54229100 : u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
1806 54229100 : u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
1807 54229100 : u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
1808 54229100 : u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
1809 54229100 : u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
1810 54229100 : u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
1811 54229100 : u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
1812 54229100 : u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
1813 54229100 : u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
1814 54229100 : u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
1815 54229100 : u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
1816 54229100 : u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
1817 54229100 : u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
1818 :
1819 : // stage 2
1820 54229100 : v[0] = _mm256_add_epi32(u[0], u[7]);
1821 54229100 : v[7] = _mm256_sub_epi32(u[0], u[7]);
1822 54229100 : v[1] = _mm256_add_epi32(u[1], u[6]);
1823 54229100 : v[6] = _mm256_sub_epi32(u[1], u[6]);
1824 54229100 : v[2] = _mm256_add_epi32(u[2], u[5]);
1825 54229100 : v[5] = _mm256_sub_epi32(u[2], u[5]);
1826 54229100 : v[3] = _mm256_add_epi32(u[3], u[4]);
1827 54229100 : v[4] = _mm256_sub_epi32(u[3], u[4]);
1828 54229100 : v[8] = u[8];
1829 54229100 : v[9] = u[9];
1830 :
1831 54229100 : v[10] = _mm256_mullo_epi32(u[10], cospim32);
1832 54229100 : x = _mm256_mullo_epi32(u[13], cospi32);
1833 54229100 : v[10] = _mm256_add_epi32(v[10], x);
1834 54229100 : v[10] = _mm256_add_epi32(v[10], rnding);
1835 54229100 : v[10] = _mm256_srai_epi32(v[10], bit);
1836 :
1837 54229100 : v[13] = _mm256_mullo_epi32(u[10], cospi32);
1838 54229100 : x = _mm256_mullo_epi32(u[13], cospim32);
1839 54229100 : v[13] = _mm256_sub_epi32(v[13], x);
1840 54229100 : v[13] = _mm256_add_epi32(v[13], rnding);
1841 54229100 : v[13] = _mm256_srai_epi32(v[13], bit);
1842 :
1843 54229100 : v[11] = _mm256_mullo_epi32(u[11], cospim32);
1844 54229100 : x = _mm256_mullo_epi32(u[12], cospi32);
1845 54229100 : v[11] = _mm256_add_epi32(v[11], x);
1846 54229100 : v[11] = _mm256_add_epi32(v[11], rnding);
1847 54229100 : v[11] = _mm256_srai_epi32(v[11], bit);
1848 :
1849 54229100 : v[12] = _mm256_mullo_epi32(u[11], cospi32);
1850 54229100 : x = _mm256_mullo_epi32(u[12], cospim32);
1851 54229100 : v[12] = _mm256_sub_epi32(v[12], x);
1852 54229100 : v[12] = _mm256_add_epi32(v[12], rnding);
1853 54229100 : v[12] = _mm256_srai_epi32(v[12], bit);
1854 54229100 : v[14] = u[14];
1855 54229100 : v[15] = u[15];
1856 :
1857 : // stage 3
1858 54229100 : u[0] = _mm256_add_epi32(v[0], v[3]);
1859 54229100 : u[3] = _mm256_sub_epi32(v[0], v[3]);
1860 54229100 : u[1] = _mm256_add_epi32(v[1], v[2]);
1861 54229100 : u[2] = _mm256_sub_epi32(v[1], v[2]);
1862 54229100 : u[4] = v[4];
1863 :
1864 54229100 : u[5] = _mm256_mullo_epi32(v[5], cospim32);
1865 54229100 : x = _mm256_mullo_epi32(v[6], cospi32);
1866 54229100 : u[5] = _mm256_add_epi32(u[5], x);
1867 54229100 : u[5] = _mm256_add_epi32(u[5], rnding);
1868 54229100 : u[5] = _mm256_srai_epi32(u[5], bit);
1869 :
1870 54229100 : u[6] = _mm256_mullo_epi32(v[5], cospi32);
1871 54229100 : x = _mm256_mullo_epi32(v[6], cospim32);
1872 54229100 : u[6] = _mm256_sub_epi32(u[6], x);
1873 54229100 : u[6] = _mm256_add_epi32(u[6], rnding);
1874 54229100 : u[6] = _mm256_srai_epi32(u[6], bit);
1875 :
1876 54229100 : u[7] = v[7];
1877 54229100 : u[8] = _mm256_add_epi32(v[8], v[11]);
1878 54229100 : u[11] = _mm256_sub_epi32(v[8], v[11]);
1879 54229100 : u[9] = _mm256_add_epi32(v[9], v[10]);
1880 54229100 : u[10] = _mm256_sub_epi32(v[9], v[10]);
1881 54229100 : u[12] = _mm256_sub_epi32(v[15], v[12]);
1882 54229100 : u[15] = _mm256_add_epi32(v[15], v[12]);
1883 54229100 : u[13] = _mm256_sub_epi32(v[14], v[13]);
1884 54229100 : u[14] = _mm256_add_epi32(v[14], v[13]);
1885 :
1886 : // stage 4
1887 54229100 : u[0] = _mm256_mullo_epi32(u[0], cospi32);
1888 54229100 : u[1] = _mm256_mullo_epi32(u[1], cospi32);
1889 54229100 : v[0] = _mm256_add_epi32(u[0], u[1]);
1890 54229100 : v[0] = _mm256_add_epi32(v[0], rnding);
1891 54229100 : v[0] = _mm256_srai_epi32(v[0], bit);
1892 :
1893 54229100 : v[1] = _mm256_sub_epi32(u[0], u[1]);
1894 54229100 : v[1] = _mm256_add_epi32(v[1], rnding);
1895 54229100 : v[1] = _mm256_srai_epi32(v[1], bit);
1896 :
1897 54229100 : v[2] = _mm256_mullo_epi32(u[2], cospi48);
1898 54229100 : x = _mm256_mullo_epi32(u[3], cospi16);
1899 54229100 : v[2] = _mm256_add_epi32(v[2], x);
1900 54229100 : v[2] = _mm256_add_epi32(v[2], rnding);
1901 54229100 : v[2] = _mm256_srai_epi32(v[2], bit);
1902 :
1903 54229100 : v[3] = _mm256_mullo_epi32(u[2], cospi16);
1904 54229100 : x = _mm256_mullo_epi32(u[3], cospi48);
1905 54229100 : v[3] = _mm256_sub_epi32(x, v[3]);
1906 54229100 : v[3] = _mm256_add_epi32(v[3], rnding);
1907 54229100 : v[3] = _mm256_srai_epi32(v[3], bit);
1908 :
1909 54229100 : v[4] = _mm256_add_epi32(u[4], u[5]);
1910 54229100 : v[5] = _mm256_sub_epi32(u[4], u[5]);
1911 54229100 : v[6] = _mm256_sub_epi32(u[7], u[6]);
1912 54229100 : v[7] = _mm256_add_epi32(u[7], u[6]);
1913 54229100 : v[8] = u[8];
1914 :
1915 54229100 : v[9] = _mm256_mullo_epi32(u[9], cospim16);
1916 54229100 : x = _mm256_mullo_epi32(u[14], cospi48);
1917 54229100 : v[9] = _mm256_add_epi32(v[9], x);
1918 54229100 : v[9] = _mm256_add_epi32(v[9], rnding);
1919 54229100 : v[9] = _mm256_srai_epi32(v[9], bit);
1920 :
1921 54229100 : v[14] = _mm256_mullo_epi32(u[9], cospi48);
1922 54229100 : x = _mm256_mullo_epi32(u[14], cospim16);
1923 54229100 : v[14] = _mm256_sub_epi32(v[14], x);
1924 54229100 : v[14] = _mm256_add_epi32(v[14], rnding);
1925 54229100 : v[14] = _mm256_srai_epi32(v[14], bit);
1926 :
1927 54229100 : v[10] = _mm256_mullo_epi32(u[10], cospim48);
1928 54229100 : x = _mm256_mullo_epi32(u[13], cospim16);
1929 54229100 : v[10] = _mm256_add_epi32(v[10], x);
1930 54229100 : v[10] = _mm256_add_epi32(v[10], rnding);
1931 54229100 : v[10] = _mm256_srai_epi32(v[10], bit);
1932 :
1933 54229100 : v[13] = _mm256_mullo_epi32(u[10], cospim16);
1934 54229100 : x = _mm256_mullo_epi32(u[13], cospim48);
1935 54229100 : v[13] = _mm256_sub_epi32(v[13], x);
1936 54229100 : v[13] = _mm256_add_epi32(v[13], rnding);
1937 54229100 : v[13] = _mm256_srai_epi32(v[13], bit);
1938 :
1939 54229100 : v[11] = u[11];
1940 54229100 : v[12] = u[12];
1941 54229100 : v[15] = u[15];
1942 :
1943 : // stage 5
1944 54229100 : u[0] = v[0];
1945 54229100 : u[1] = v[1];
1946 54229100 : u[2] = v[2];
1947 54229100 : u[3] = v[3];
1948 :
1949 54229100 : u[4] = _mm256_mullo_epi32(v[4], cospi56);
1950 54229100 : x = _mm256_mullo_epi32(v[7], cospi8);
1951 54229100 : u[4] = _mm256_add_epi32(u[4], x);
1952 54229100 : u[4] = _mm256_add_epi32(u[4], rnding);
1953 54229100 : u[4] = _mm256_srai_epi32(u[4], bit);
1954 :
1955 54229100 : u[7] = _mm256_mullo_epi32(v[4], cospi8);
1956 54229100 : x = _mm256_mullo_epi32(v[7], cospi56);
1957 54229100 : u[7] = _mm256_sub_epi32(x, u[7]);
1958 54229100 : u[7] = _mm256_add_epi32(u[7], rnding);
1959 54229100 : u[7] = _mm256_srai_epi32(u[7], bit);
1960 :
1961 54229100 : u[5] = _mm256_mullo_epi32(v[5], cospi24);
1962 54229100 : x = _mm256_mullo_epi32(v[6], cospi40);
1963 54229100 : u[5] = _mm256_add_epi32(u[5], x);
1964 54229100 : u[5] = _mm256_add_epi32(u[5], rnding);
1965 54229100 : u[5] = _mm256_srai_epi32(u[5], bit);
1966 :
1967 54229100 : u[6] = _mm256_mullo_epi32(v[5], cospi40);
1968 54229100 : x = _mm256_mullo_epi32(v[6], cospi24);
1969 54229100 : u[6] = _mm256_sub_epi32(x, u[6]);
1970 54229100 : u[6] = _mm256_add_epi32(u[6], rnding);
1971 54229100 : u[6] = _mm256_srai_epi32(u[6], bit);
1972 :
1973 54229100 : u[8] = _mm256_add_epi32(v[8], v[9]);
1974 54229100 : u[9] = _mm256_sub_epi32(v[8], v[9]);
1975 54229100 : u[10] = _mm256_sub_epi32(v[11], v[10]);
1976 54229100 : u[11] = _mm256_add_epi32(v[11], v[10]);
1977 54229100 : u[12] = _mm256_add_epi32(v[12], v[13]);
1978 54229100 : u[13] = _mm256_sub_epi32(v[12], v[13]);
1979 54229100 : u[14] = _mm256_sub_epi32(v[15], v[14]);
1980 54229100 : u[15] = _mm256_add_epi32(v[15], v[14]);
1981 :
1982 : // stage 6
1983 54229100 : v[0] = u[0];
1984 54229100 : v[1] = u[1];
1985 54229100 : v[2] = u[2];
1986 54229100 : v[3] = u[3];
1987 54229100 : v[4] = u[4];
1988 54229100 : v[5] = u[5];
1989 54229100 : v[6] = u[6];
1990 54229100 : v[7] = u[7];
1991 :
1992 54229100 : v[8] = _mm256_mullo_epi32(u[8], cospi60);
1993 54229100 : x = _mm256_mullo_epi32(u[15], cospi4);
1994 54229100 : v[8] = _mm256_add_epi32(v[8], x);
1995 54229100 : v[8] = _mm256_add_epi32(v[8], rnding);
1996 54229100 : v[8] = _mm256_srai_epi32(v[8], bit);
1997 :
1998 54229100 : v[15] = _mm256_mullo_epi32(u[8], cospi4);
1999 54229100 : x = _mm256_mullo_epi32(u[15], cospi60);
2000 54229100 : v[15] = _mm256_sub_epi32(x, v[15]);
2001 54229100 : v[15] = _mm256_add_epi32(v[15], rnding);
2002 54229100 : v[15] = _mm256_srai_epi32(v[15], bit);
2003 :
2004 54229100 : v[9] = _mm256_mullo_epi32(u[9], cospi28);
2005 54229100 : x = _mm256_mullo_epi32(u[14], cospi36);
2006 54229100 : v[9] = _mm256_add_epi32(v[9], x);
2007 54229100 : v[9] = _mm256_add_epi32(v[9], rnding);
2008 54229100 : v[9] = _mm256_srai_epi32(v[9], bit);
2009 :
2010 54229100 : v[14] = _mm256_mullo_epi32(u[9], cospi36);
2011 54229100 : x = _mm256_mullo_epi32(u[14], cospi28);
2012 54229100 : v[14] = _mm256_sub_epi32(x, v[14]);
2013 54229100 : v[14] = _mm256_add_epi32(v[14], rnding);
2014 54229100 : v[14] = _mm256_srai_epi32(v[14], bit);
2015 :
2016 54229100 : v[10] = _mm256_mullo_epi32(u[10], cospi44);
2017 54229100 : x = _mm256_mullo_epi32(u[13], cospi20);
2018 54229100 : v[10] = _mm256_add_epi32(v[10], x);
2019 54229100 : v[10] = _mm256_add_epi32(v[10], rnding);
2020 54229100 : v[10] = _mm256_srai_epi32(v[10], bit);
2021 :
2022 54229100 : v[13] = _mm256_mullo_epi32(u[10], cospi20);
2023 54229100 : x = _mm256_mullo_epi32(u[13], cospi44);
2024 54229100 : v[13] = _mm256_sub_epi32(x, v[13]);
2025 54229100 : v[13] = _mm256_add_epi32(v[13], rnding);
2026 54229100 : v[13] = _mm256_srai_epi32(v[13], bit);
2027 :
2028 54229100 : v[11] = _mm256_mullo_epi32(u[11], cospi12);
2029 54229100 : x = _mm256_mullo_epi32(u[12], cospi52);
2030 54229100 : v[11] = _mm256_add_epi32(v[11], x);
2031 54229100 : v[11] = _mm256_add_epi32(v[11], rnding);
2032 54229100 : v[11] = _mm256_srai_epi32(v[11], bit);
2033 :
2034 54229100 : v[12] = _mm256_mullo_epi32(u[11], cospi52);
2035 54229100 : x = _mm256_mullo_epi32(u[12], cospi12);
2036 54229100 : v[12] = _mm256_sub_epi32(x, v[12]);
2037 54229100 : v[12] = _mm256_add_epi32(v[12], rnding);
2038 54229100 : v[12] = _mm256_srai_epi32(v[12], bit);
2039 :
2040 54229100 : out[0 * col_num + col] = v[0];
2041 54229100 : out[1 * col_num + col] = v[8];
2042 54229100 : out[2 * col_num + col] = v[4];
2043 54229100 : out[3 * col_num + col] = v[12];
2044 54229100 : out[4 * col_num + col] = v[2];
2045 54229100 : out[5 * col_num + col] = v[10];
2046 54229100 : out[6 * col_num + col] = v[6];
2047 54229100 : out[7 * col_num + col] = v[14];
2048 54229100 : out[8 * col_num + col] = v[1];
2049 54229100 : out[9 * col_num + col] = v[9];
2050 54229100 : out[10 * col_num + col] = v[5];
2051 54229100 : out[11 * col_num + col] = v[13];
2052 54229100 : out[12 * col_num + col] = v[3];
2053 54229100 : out[13 * col_num + col] = v[11];
2054 54229100 : out[14 * col_num + col] = v[7];
2055 54229100 : out[15 * col_num + col] = v[15];
2056 : }
2057 26334400 : }
2058 :
2059 4409730 : static INLINE void fadst4x8_row_avx2(__m256i *input, __m256i *output, int32_t bit,
2060 : const int32_t num_col) {
2061 4409730 : const int32_t *sinpi = sinpi_arr(bit);
2062 4409690 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2063 4409690 : const __m256i sinpi1 = _mm256_set1_epi32((int32_t)sinpi[1]);
2064 4409690 : const __m256i sinpi2 = _mm256_set1_epi32((int32_t)sinpi[2]);
2065 4409690 : const __m256i sinpi3 = _mm256_set1_epi32((int32_t)sinpi[3]);
2066 4409690 : const __m256i sinpi4 = _mm256_set1_epi32((int32_t)sinpi[4]);
2067 : __m256i t;
2068 : __m256i s0, s1, s2, s3, s4, s5, s6, s7;
2069 : __m256i x0, x1, x2, x3;
2070 : __m256i u0, u1, u2, u3;
2071 : __m256i v0, v1, v2, v3;
2072 : __m256i in[4];
2073 : __m256i out[4];
2074 :
2075 4409690 : in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
2076 4409690 : in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
2077 4409690 : in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
2078 4409690 : in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
2079 :
2080 4409690 : int32_t idx = 0 * num_col;
2081 4409690 : s0 = _mm256_mullo_epi32(in[idx], sinpi1);
2082 4409690 : s1 = _mm256_mullo_epi32(in[idx], sinpi4);
2083 4409690 : t = _mm256_add_epi32(in[idx], in[idx + num_col]);
2084 4409690 : idx += num_col;
2085 4409690 : s2 = _mm256_mullo_epi32(in[idx], sinpi2);
2086 4409690 : s3 = _mm256_mullo_epi32(in[idx], sinpi1);
2087 4409690 : idx += num_col;
2088 4409690 : s4 = _mm256_mullo_epi32(in[idx], sinpi3);
2089 4409690 : idx += num_col;
2090 4409690 : s5 = _mm256_mullo_epi32(in[idx], sinpi4);
2091 4409690 : s6 = _mm256_mullo_epi32(in[idx], sinpi2);
2092 8819390 : s7 = _mm256_sub_epi32(t, in[idx]);
2093 :
2094 4409690 : t = _mm256_add_epi32(s0, s2);
2095 4409690 : x0 = _mm256_add_epi32(t, s5);
2096 4409690 : x1 = _mm256_mullo_epi32(s7, sinpi3);
2097 4409690 : t = _mm256_sub_epi32(s1, s3);
2098 4409690 : x2 = _mm256_add_epi32(t, s6);
2099 4409690 : x3 = s4;
2100 :
2101 4409690 : s0 = _mm256_add_epi32(x0, x3);
2102 4409690 : s1 = x1;
2103 4409690 : s2 = _mm256_sub_epi32(x2, x3);
2104 4409690 : t = _mm256_sub_epi32(x2, x0);
2105 4409690 : s3 = _mm256_add_epi32(t, x3);
2106 :
2107 4409690 : u0 = _mm256_add_epi32(s0, rnding);
2108 4409690 : u0 = _mm256_srai_epi32(u0, bit);
2109 :
2110 4409690 : u1 = _mm256_add_epi32(s1, rnding);
2111 4409690 : u1 = _mm256_srai_epi32(u1, bit);
2112 :
2113 4409690 : u2 = _mm256_add_epi32(s2, rnding);
2114 4409690 : u2 = _mm256_srai_epi32(u2, bit);
2115 :
2116 4409690 : u3 = _mm256_add_epi32(s3, rnding);
2117 4409690 : u3 = _mm256_srai_epi32(u3, bit);
2118 :
2119 4409690 : v0 = _mm256_unpacklo_epi32(u0, u1);
2120 4409690 : v1 = _mm256_unpackhi_epi32(u0, u1);
2121 4409690 : v2 = _mm256_unpacklo_epi32(u2, u3);
2122 4409690 : v3 = _mm256_unpackhi_epi32(u2, u3);
2123 :
2124 4409690 : out[0] = _mm256_unpacklo_epi64(v0, v2);
2125 4409690 : out[1] = _mm256_unpackhi_epi64(v0, v2);
2126 4409690 : out[2] = _mm256_unpacklo_epi64(v1, v3);
2127 4409690 : out[3] = _mm256_unpackhi_epi64(v1, v3);
2128 :
2129 4409690 : output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
2130 4409690 : output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
2131 4409690 : output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
2132 4409690 : output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
2133 4409690 : }
2134 :
2135 4361490 : static INLINE void fadst4x8_col_avx2(__m256i *in, __m256i *output, int32_t bit,
2136 : const int32_t num_col) {
2137 4361490 : const int32_t *sinpi = sinpi_arr(bit);
2138 4361460 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2139 4361460 : const __m256i sinpi1 = _mm256_set1_epi32((int32_t)sinpi[1]);
2140 4361460 : const __m256i sinpi2 = _mm256_set1_epi32((int32_t)sinpi[2]);
2141 4361460 : const __m256i sinpi3 = _mm256_set1_epi32((int32_t)sinpi[3]);
2142 4361460 : const __m256i sinpi4 = _mm256_set1_epi32((int32_t)sinpi[4]);
2143 : __m256i t;
2144 : __m256i s0, s1, s2, s3, s4, s5, s6, s7;
2145 : __m256i x0, x1, x2, x3;
2146 : __m256i u0, u1, u2, u3;
2147 : __m256i v0, v1, v2, v3;
2148 : __m256i out[4];
2149 :
2150 4361460 : int32_t idx = 0 * num_col;
2151 4361460 : s0 = _mm256_mullo_epi32(in[idx], sinpi1);
2152 4361460 : s1 = _mm256_mullo_epi32(in[idx], sinpi4);
2153 4361460 : t = _mm256_add_epi32(in[idx], in[idx + num_col]);
2154 4361460 : idx += num_col;
2155 4361460 : s2 = _mm256_mullo_epi32(in[idx], sinpi2);
2156 4361460 : s3 = _mm256_mullo_epi32(in[idx], sinpi1);
2157 4361460 : idx += num_col;
2158 4361460 : s4 = _mm256_mullo_epi32(in[idx], sinpi3);
2159 4361460 : idx += num_col;
2160 4361460 : s5 = _mm256_mullo_epi32(in[idx], sinpi4);
2161 4361460 : s6 = _mm256_mullo_epi32(in[idx], sinpi2);
2162 8722910 : s7 = _mm256_sub_epi32(t, in[idx]);
2163 :
2164 4361460 : t = _mm256_add_epi32(s0, s2);
2165 4361460 : x0 = _mm256_add_epi32(t, s5);
2166 4361460 : x1 = _mm256_mullo_epi32(s7, sinpi3);
2167 4361460 : t = _mm256_sub_epi32(s1, s3);
2168 4361460 : x2 = _mm256_add_epi32(t, s6);
2169 4361460 : x3 = s4;
2170 :
2171 4361460 : s0 = _mm256_add_epi32(x0, x3);
2172 4361460 : s1 = x1;
2173 4361460 : s2 = _mm256_sub_epi32(x2, x3);
2174 4361460 : t = _mm256_sub_epi32(x2, x0);
2175 4361460 : s3 = _mm256_add_epi32(t, x3);
2176 :
2177 4361460 : u0 = _mm256_add_epi32(s0, rnding);
2178 4361460 : u0 = _mm256_srai_epi32(u0, bit);
2179 :
2180 4361460 : u1 = _mm256_add_epi32(s1, rnding);
2181 4361460 : u1 = _mm256_srai_epi32(u1, bit);
2182 :
2183 4361460 : u2 = _mm256_add_epi32(s2, rnding);
2184 4361460 : u2 = _mm256_srai_epi32(u2, bit);
2185 :
2186 4361460 : u3 = _mm256_add_epi32(s3, rnding);
2187 4361460 : u3 = _mm256_srai_epi32(u3, bit);
2188 :
2189 4361460 : v0 = _mm256_unpacklo_epi32(u0, u1);
2190 4361460 : v1 = _mm256_unpackhi_epi32(u0, u1);
2191 4361460 : v2 = _mm256_unpacklo_epi32(u2, u3);
2192 4361460 : v3 = _mm256_unpackhi_epi32(u2, u3);
2193 :
2194 4361460 : out[0] = _mm256_unpacklo_epi64(v0, v2);
2195 4361460 : out[1] = _mm256_unpackhi_epi64(v0, v2);
2196 4361460 : out[2] = _mm256_unpacklo_epi64(v1, v3);
2197 4361460 : out[3] = _mm256_unpackhi_epi64(v1, v3);
2198 :
2199 4361460 : output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
2200 4361460 : output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
2201 4361460 : output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
2202 4361460 : output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
2203 4361460 : }
2204 :
2205 10729000 : static INLINE void fdct4x8_avx2(__m256i *input, __m256i *output,
2206 : int32_t bit) {
2207 10729000 : __m128i *in = (__m128i *)input;
2208 10729000 : __m128i *out = (__m128i *)output;
2209 10729000 : const int32_t *cospi = cospi_arr(bit);
2210 10728500 : const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2211 10728500 : const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
2212 10728500 : const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2213 10728500 : const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2214 10728500 : const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2215 10728500 : const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2216 10728500 : const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2217 10728500 : const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2218 10728500 : const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2219 : __m128i u[8], v[8];
2220 :
2221 : // Even 8 points 0, 2, ..., 14
2222 : // stage 0
2223 : // stage 1
2224 10728500 : u[0] = _mm_add_epi32(in[0], in[7]);
2225 10728500 : v[7] = _mm_sub_epi32(in[0], in[7]); // v[7]
2226 10728500 : u[1] = _mm_add_epi32(in[1], in[6]);
2227 10728500 : u[6] = _mm_sub_epi32(in[1], in[6]);
2228 10728500 : u[2] = _mm_add_epi32(in[2], in[5]);
2229 10728500 : u[5] = _mm_sub_epi32(in[2], in[5]);
2230 10728500 : u[3] = _mm_add_epi32(in[3], in[4]);
2231 10728500 : v[4] = _mm_sub_epi32(in[3], in[4]); // v[4]
2232 :
2233 : // stage 2
2234 10728500 : v[0] = _mm_add_epi32(u[0], u[3]);
2235 10728500 : v[3] = _mm_sub_epi32(u[0], u[3]);
2236 10728500 : v[1] = _mm_add_epi32(u[1], u[2]);
2237 10728500 : v[2] = _mm_sub_epi32(u[1], u[2]);
2238 :
2239 10728500 : v[5] = _mm_mullo_epi32(u[5], cospim32);
2240 10728500 : v[6] = _mm_mullo_epi32(u[6], cospi32);
2241 10728500 : v[5] = _mm_add_epi32(v[5], v[6]);
2242 10728500 : v[5] = _mm_add_epi32(v[5], rnding);
2243 10728500 : v[5] = _mm_srai_epi32(v[5], bit);
2244 :
2245 10728500 : u[0] = _mm_mullo_epi32(u[5], cospi32);
2246 10728500 : v[6] = _mm_mullo_epi32(u[6], cospim32);
2247 10728500 : v[6] = _mm_sub_epi32(u[0], v[6]);
2248 10728500 : v[6] = _mm_add_epi32(v[6], rnding);
2249 10728500 : v[6] = _mm_srai_epi32(v[6], bit);
2250 :
2251 : // stage 3
2252 : // type 0
2253 10728500 : v[0] = _mm_mullo_epi32(v[0], cospi32);
2254 10728500 : v[1] = _mm_mullo_epi32(v[1], cospi32);
2255 10728500 : u[0] = _mm_add_epi32(v[0], v[1]);
2256 10728500 : u[0] = _mm_add_epi32(u[0], rnding);
2257 10728500 : out[0] = _mm_srai_epi32(u[0], bit);
2258 :
2259 10728500 : u[1] = _mm_sub_epi32(v[0], v[1]);
2260 10728500 : u[1] = _mm_add_epi32(u[1], rnding);
2261 10728500 : out[4] = _mm_srai_epi32(u[1], bit);
2262 :
2263 : // type 1
2264 10728500 : v[0] = _mm_mullo_epi32(v[2], cospi48);
2265 10728500 : v[1] = _mm_mullo_epi32(v[3], cospi16);
2266 10728500 : u[2] = _mm_add_epi32(v[0], v[1]);
2267 10728500 : u[2] = _mm_add_epi32(u[2], rnding);
2268 10728500 : out[2] = _mm_srai_epi32(u[2], bit);
2269 :
2270 10728500 : v[0] = _mm_mullo_epi32(v[2], cospi16);
2271 10728500 : v[1] = _mm_mullo_epi32(v[3], cospi48);
2272 10728500 : u[3] = _mm_sub_epi32(v[1], v[0]);
2273 10728500 : u[3] = _mm_add_epi32(u[3], rnding);
2274 10728500 : out[6] = _mm_srai_epi32(u[3], bit);
2275 :
2276 10728500 : u[4] = _mm_add_epi32(v[4], v[5]);
2277 10728500 : u[5] = _mm_sub_epi32(v[4], v[5]);
2278 10728500 : u[6] = _mm_sub_epi32(v[7], v[6]);
2279 10728500 : u[7] = _mm_add_epi32(v[7], v[6]);
2280 :
2281 : // stage 4
2282 : // stage 5
2283 10728500 : v[0] = _mm_mullo_epi32(u[4], cospi56);
2284 10728500 : v[1] = _mm_mullo_epi32(u[7], cospi8);
2285 10728500 : v[0] = _mm_add_epi32(v[0], v[1]);
2286 10728500 : v[0] = _mm_add_epi32(v[0], rnding);
2287 10728500 : out[1] = _mm_srai_epi32(v[0], bit); // buf0[4]
2288 :
2289 10728500 : v[0] = _mm_mullo_epi32(u[4], cospi8);
2290 10728500 : v[1] = _mm_mullo_epi32(u[7], cospi56);
2291 10728500 : v[0] = _mm_sub_epi32(v[1], v[0]);
2292 10728500 : v[0] = _mm_add_epi32(v[0], rnding);
2293 10728500 : out[7] = _mm_srai_epi32(v[0], bit); // buf0[7]
2294 :
2295 10728500 : v[0] = _mm_mullo_epi32(u[5], cospi24);
2296 10728500 : v[1] = _mm_mullo_epi32(u[6], cospi40);
2297 10728500 : v[0] = _mm_add_epi32(v[0], v[1]);
2298 10728500 : v[0] = _mm_add_epi32(v[0], rnding);
2299 10728500 : out[5] = _mm_srai_epi32(v[0], bit); // buf0[5]
2300 :
2301 10728500 : v[0] = _mm_mullo_epi32(u[5], cospi40);
2302 10728500 : v[1] = _mm_mullo_epi32(u[6], cospi24);
2303 10728500 : v[0] = _mm_sub_epi32(v[1], v[0]);
2304 10728500 : v[0] = _mm_add_epi32(v[0], rnding);
2305 10728500 : out[3] = _mm_srai_epi32(v[0], bit); // buf0[6]
2306 10728500 : }
2307 :
2308 5242180 : static void fadst16x16_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
2309 5242180 : const int32_t *cospi = cospi_arr(bit);
2310 5242090 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2311 5242090 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2312 5242090 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2313 5242090 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
2314 5242090 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2315 5242090 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2316 5242090 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2317 5242090 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
2318 5242090 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
2319 5242090 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2320 5242090 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
2321 5242090 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
2322 5242090 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2323 5242090 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
2324 5242090 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
2325 5242090 : const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
2326 5242090 : const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
2327 5242090 : const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
2328 5242090 : const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
2329 5242090 : const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
2330 5242090 : const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
2331 5242090 : const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
2332 5242090 : const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
2333 5242090 : const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
2334 5242090 : const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
2335 5242090 : const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
2336 5242090 : const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
2337 5242090 : const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
2338 5242090 : const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
2339 5242090 : const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
2340 5242090 : const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
2341 5242090 : const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
2342 5242090 : const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
2343 5242090 : const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
2344 5242090 : const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
2345 5242090 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
2346 5242090 : const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
2347 10484200 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2348 5242090 : const __m256i zero = _mm256_setzero_si256();
2349 :
2350 : __m256i u[16], v[16], x, y;
2351 : int32_t col;
2352 :
2353 13052000 : for (col = 0; col < col_num; ++col) {
2354 : // stage 0
2355 : // stage 1
2356 7809750 : u[0] = in[0 * col_num + col];
2357 7809750 : u[1] = _mm256_sub_epi32(zero, in[15 * col_num + col]);
2358 7809750 : u[2] = _mm256_sub_epi32(zero, in[7 * col_num + col]);
2359 7809750 : u[3] = in[8 * col_num + col];
2360 7809750 : u[4] = _mm256_sub_epi32(zero, in[3 * col_num + col]);
2361 7809750 : u[5] = in[12 * col_num + col];
2362 7809750 : u[6] = in[4 * col_num + col];
2363 7809750 : u[7] = _mm256_sub_epi32(zero, in[11 * col_num + col]);
2364 7809750 : u[8] = _mm256_sub_epi32(zero, in[1 * col_num + col]);
2365 7809750 : u[9] = in[14 * col_num + col];
2366 7809750 : u[10] = in[6 * col_num + col];
2367 7809750 : u[11] = _mm256_sub_epi32(zero, in[9 * col_num + col]);
2368 7809750 : u[12] = in[2 * col_num + col];
2369 7809750 : u[13] = _mm256_sub_epi32(zero, in[13 * col_num + col]);
2370 7809750 : u[14] = _mm256_sub_epi32(zero, in[5 * col_num + col]);
2371 7809750 : u[15] = in[10 * col_num + col];
2372 :
2373 : // stage 2
2374 7809750 : v[0] = u[0];
2375 7809750 : v[1] = u[1];
2376 :
2377 7809750 : x = _mm256_mullo_epi32(u[2], cospi32);
2378 15619500 : y = _mm256_mullo_epi32(u[3], cospi32);
2379 7809750 : v[2] = _mm256_add_epi32(x, y);
2380 7809750 : v[2] = _mm256_add_epi32(v[2], rnding);
2381 15619500 : v[2] = _mm256_srai_epi32(v[2], bit);
2382 :
2383 7809750 : v[3] = _mm256_sub_epi32(x, y);
2384 7809750 : v[3] = _mm256_add_epi32(v[3], rnding);
2385 7809750 : v[3] = _mm256_srai_epi32(v[3], bit);
2386 :
2387 7809750 : v[4] = u[4];
2388 7809750 : v[5] = u[5];
2389 :
2390 7809750 : x = _mm256_mullo_epi32(u[6], cospi32);
2391 15619500 : y = _mm256_mullo_epi32(u[7], cospi32);
2392 7809750 : v[6] = _mm256_add_epi32(x, y);
2393 7809750 : v[6] = _mm256_add_epi32(v[6], rnding);
2394 15619500 : v[6] = _mm256_srai_epi32(v[6], bit);
2395 :
2396 7809750 : v[7] = _mm256_sub_epi32(x, y);
2397 7809750 : v[7] = _mm256_add_epi32(v[7], rnding);
2398 7809750 : v[7] = _mm256_srai_epi32(v[7], bit);
2399 :
2400 7809750 : v[8] = u[8];
2401 7809750 : v[9] = u[9];
2402 :
2403 7809750 : x = _mm256_mullo_epi32(u[10], cospi32);
2404 15619500 : y = _mm256_mullo_epi32(u[11], cospi32);
2405 7809750 : v[10] = _mm256_add_epi32(x, y);
2406 7809750 : v[10] = _mm256_add_epi32(v[10], rnding);
2407 15619500 : v[10] = _mm256_srai_epi32(v[10], bit);
2408 :
2409 7809750 : v[11] = _mm256_sub_epi32(x, y);
2410 7809750 : v[11] = _mm256_add_epi32(v[11], rnding);
2411 7809750 : v[11] = _mm256_srai_epi32(v[11], bit);
2412 :
2413 7809750 : v[12] = u[12];
2414 7809750 : v[13] = u[13];
2415 :
2416 7809750 : x = _mm256_mullo_epi32(u[14], cospi32);
2417 15619500 : y = _mm256_mullo_epi32(u[15], cospi32);
2418 7809750 : v[14] = _mm256_add_epi32(x, y);
2419 7809750 : v[14] = _mm256_add_epi32(v[14], rnding);
2420 15619500 : v[14] = _mm256_srai_epi32(v[14], bit);
2421 :
2422 7809750 : v[15] = _mm256_sub_epi32(x, y);
2423 7809750 : v[15] = _mm256_add_epi32(v[15], rnding);
2424 7809750 : v[15] = _mm256_srai_epi32(v[15], bit);
2425 :
2426 : // stage 3
2427 7809750 : u[0] = _mm256_add_epi32(v[0], v[2]);
2428 7809750 : u[1] = _mm256_add_epi32(v[1], v[3]);
2429 7809750 : u[2] = _mm256_sub_epi32(v[0], v[2]);
2430 7809750 : u[3] = _mm256_sub_epi32(v[1], v[3]);
2431 7809750 : u[4] = _mm256_add_epi32(v[4], v[6]);
2432 7809750 : u[5] = _mm256_add_epi32(v[5], v[7]);
2433 7809750 : u[6] = _mm256_sub_epi32(v[4], v[6]);
2434 7809750 : u[7] = _mm256_sub_epi32(v[5], v[7]);
2435 7809750 : u[8] = _mm256_add_epi32(v[8], v[10]);
2436 7809750 : u[9] = _mm256_add_epi32(v[9], v[11]);
2437 7809750 : u[10] = _mm256_sub_epi32(v[8], v[10]);
2438 7809750 : u[11] = _mm256_sub_epi32(v[9], v[11]);
2439 7809750 : u[12] = _mm256_add_epi32(v[12], v[14]);
2440 7809750 : u[13] = _mm256_add_epi32(v[13], v[15]);
2441 7809750 : u[14] = _mm256_sub_epi32(v[12], v[14]);
2442 7809750 : u[15] = _mm256_sub_epi32(v[13], v[15]);
2443 :
2444 : // stage 4
2445 7809750 : v[0] = u[0];
2446 7809750 : v[1] = u[1];
2447 7809750 : v[2] = u[2];
2448 7809750 : v[3] = u[3];
2449 7809750 : v[4] = half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
2450 7809660 : v[5] = half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
2451 7809480 : v[6] = half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
2452 7809290 : v[7] = half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
2453 7809200 : v[8] = u[8];
2454 7809200 : v[9] = u[9];
2455 7809200 : v[10] = u[10];
2456 7809200 : v[11] = u[11];
2457 7809200 : v[12] = half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
2458 7809340 : v[13] = half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
2459 7809210 : v[14] = half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
2460 7809160 : v[15] = half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
2461 :
2462 : // stage 5
2463 7809150 : u[0] = _mm256_add_epi32(v[0], v[4]);
2464 7809150 : u[1] = _mm256_add_epi32(v[1], v[5]);
2465 7809150 : u[2] = _mm256_add_epi32(v[2], v[6]);
2466 7809150 : u[3] = _mm256_add_epi32(v[3], v[7]);
2467 7809150 : u[4] = _mm256_sub_epi32(v[0], v[4]);
2468 7809150 : u[5] = _mm256_sub_epi32(v[1], v[5]);
2469 7809150 : u[6] = _mm256_sub_epi32(v[2], v[6]);
2470 7809150 : u[7] = _mm256_sub_epi32(v[3], v[7]);
2471 7809150 : u[8] = _mm256_add_epi32(v[8], v[12]);
2472 7809150 : u[9] = _mm256_add_epi32(v[9], v[13]);
2473 7809150 : u[10] = _mm256_add_epi32(v[10], v[14]);
2474 7809150 : u[11] = _mm256_add_epi32(v[11], v[15]);
2475 7809150 : u[12] = _mm256_sub_epi32(v[8], v[12]);
2476 7809150 : u[13] = _mm256_sub_epi32(v[9], v[13]);
2477 7809150 : u[14] = _mm256_sub_epi32(v[10], v[14]);
2478 7809150 : u[15] = _mm256_sub_epi32(v[11], v[15]);
2479 :
2480 : // stage 6
2481 7809150 : v[0] = u[0];
2482 7809150 : v[1] = u[1];
2483 7809150 : v[2] = u[2];
2484 7809150 : v[3] = u[3];
2485 7809150 : v[4] = u[4];
2486 7809150 : v[5] = u[5];
2487 7809150 : v[6] = u[6];
2488 7809150 : v[7] = u[7];
2489 7809150 : v[8] = half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
2490 7809440 : v[9] = half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
2491 7809290 : v[10] = half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
2492 7809200 : v[11] = half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
2493 7809080 : v[12] = half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
2494 7808950 : v[13] = half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
2495 7808880 : v[14] = half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
2496 7808840 : v[15] = half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
2497 :
2498 : // stage 7
2499 7808990 : u[0] = _mm256_add_epi32(v[0], v[8]);
2500 7808990 : u[1] = _mm256_add_epi32(v[1], v[9]);
2501 7808990 : u[2] = _mm256_add_epi32(v[2], v[10]);
2502 7808990 : u[3] = _mm256_add_epi32(v[3], v[11]);
2503 7808990 : u[4] = _mm256_add_epi32(v[4], v[12]);
2504 7808990 : u[5] = _mm256_add_epi32(v[5], v[13]);
2505 7808990 : u[6] = _mm256_add_epi32(v[6], v[14]);
2506 7808990 : u[7] = _mm256_add_epi32(v[7], v[15]);
2507 7808990 : u[8] = _mm256_sub_epi32(v[0], v[8]);
2508 7808990 : u[9] = _mm256_sub_epi32(v[1], v[9]);
2509 7808990 : u[10] = _mm256_sub_epi32(v[2], v[10]);
2510 7808990 : u[11] = _mm256_sub_epi32(v[3], v[11]);
2511 7808990 : u[12] = _mm256_sub_epi32(v[4], v[12]);
2512 7808990 : u[13] = _mm256_sub_epi32(v[5], v[13]);
2513 7808990 : u[14] = _mm256_sub_epi32(v[6], v[14]);
2514 7808990 : u[15] = _mm256_sub_epi32(v[7], v[15]);
2515 :
2516 : // stage 8
2517 7808990 : v[0] = half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
2518 7809460 : v[1] = half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
2519 7809210 : v[2] = half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
2520 7809140 : v[3] = half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
2521 7809090 : v[4] = half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
2522 7809070 : v[5] = half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
2523 7809040 : v[6] = half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
2524 7809040 : v[7] = half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
2525 7809020 : v[8] = half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
2526 7809000 : v[9] = half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
2527 7809030 : v[10] = half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
2528 7809030 : v[11] = half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
2529 7808870 : v[12] = half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
2530 7808800 : v[13] = half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
2531 7808780 : v[14] = half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
2532 7808730 : v[15] = half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
2533 :
2534 : // stage 9
2535 7809880 : out[0 * col_num + col] = v[1];
2536 7809880 : out[1 * col_num + col] = v[14];
2537 7809880 : out[2 * col_num + col] = v[3];
2538 7809880 : out[3 * col_num + col] = v[12];
2539 7809880 : out[4 * col_num + col] = v[5];
2540 7809880 : out[5 * col_num + col] = v[10];
2541 7809880 : out[6 * col_num + col] = v[7];
2542 7809880 : out[7 * col_num + col] = v[8];
2543 7809880 : out[8 * col_num + col] = v[9];
2544 7809880 : out[9 * col_num + col] = v[6];
2545 7809880 : out[10 * col_num + col] = v[11];
2546 7809880 : out[11 * col_num + col] = v[4];
2547 7809880 : out[12 * col_num + col] = v[13];
2548 7809880 : out[13 * col_num + col] = v[2];
2549 7809880 : out[14 * col_num + col] = v[15];
2550 7809880 : out[15 * col_num + col] = v[0];
2551 : }
2552 5242220 : }
2553 :
2554 7143920 : void eb_av1_fwd_txfm2d_16x16_avx2(int16_t *input, int32_t *coeff, uint32_t stride, TxType tx_type, uint8_t bd)
2555 : {
2556 : __m256i in[32], out[32];
2557 7143920 : const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
2558 7143920 : const int32_t txw_idx = get_txw_idx(TX_16X16);
2559 7143740 : const int32_t txh_idx = get_txh_idx(TX_16X16);
2560 7144920 : const int32_t col_num = 2;
2561 7144920 : switch (tx_type) {
2562 399596 : case IDTX:
2563 399596 : load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2564 399603 : fidtx16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2565 399604 : col_txfm_16x16_rounding(out, -shift[1]);
2566 399606 : fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2567 399602 : write_buffer_16x16(out, coeff);
2568 399605 : break;
2569 4652780 : case DCT_DCT:
2570 4652780 : load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2571 4652930 : fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2572 4653140 : col_txfm_16x16_rounding(out, -shift[1]);
2573 4653140 : transpose_16x16_avx2(out, in);
2574 4653080 : fdct16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2575 4653090 : transpose_16x16_avx2(out, in);
2576 4653100 : write_buffer_16x16(in, coeff);
2577 4652960 : break;
2578 410312 : case ADST_DCT:
2579 410312 : load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2580 410321 : fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2581 410321 : col_txfm_16x16_rounding(out, -shift[1]);
2582 410323 : transpose_16x16_avx2(out, in);
2583 410324 : fdct16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2584 410322 : transpose_16x16_avx2(out, in);
2585 410323 : write_buffer_16x16(in, coeff);
2586 410321 : break;
2587 411547 : case DCT_ADST:
2588 411547 : load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2589 411556 : fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2590 411563 : col_txfm_16x16_rounding(out, -shift[1]);
2591 411563 : transpose_16x16_avx2(out, in);
2592 411567 : fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2593 411561 : transpose_16x16_avx2(out, in);
2594 411564 : write_buffer_16x16(in, coeff);
2595 411560 : break;
2596 356861 : case ADST_ADST:
2597 356861 : load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2598 356866 : fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2599 356874 : col_txfm_16x16_rounding(out, -shift[1]);
2600 356874 : transpose_16x16_avx2(out, in);
2601 356873 : fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2602 356871 : transpose_16x16_avx2(out, in);
2603 356872 : write_buffer_16x16(in, coeff);
2604 356869 : break;
2605 129117 : case DCT_FLIPADST:
2606 129117 : load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
2607 129118 : fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2608 129118 : col_txfm_16x16_rounding(out, -shift[1]);
2609 129117 : transpose_16x16_avx2(out, in);
2610 129117 : fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2611 129117 : transpose_16x16_avx2(out, in);
2612 129118 : write_buffer_16x16(in, coeff);
2613 129116 : break;
2614 128915 : case FLIPADST_DCT:
2615 128915 : load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
2616 128916 : fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2617 128916 : col_txfm_16x16_rounding(out, -shift[1]);
2618 128916 : transpose_16x16_avx2(out, in);
2619 128916 : fdct16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2620 128916 : transpose_16x16_avx2(out, in);
2621 128916 : write_buffer_16x16(in, coeff);
2622 128916 : break;
2623 128990 : case FLIPADST_FLIPADST:
2624 128990 : load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
2625 128992 : fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2626 128993 : col_txfm_16x16_rounding(out, -shift[1]);
2627 128993 : transpose_16x16_avx2(out, in);
2628 128993 : fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2629 128991 : transpose_16x16_avx2(out, in);
2630 128993 : write_buffer_16x16(in, coeff);
2631 128992 : break;
2632 128942 : case ADST_FLIPADST:
2633 128942 : load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
2634 128942 : fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2635 128942 : col_txfm_16x16_rounding(out, -shift[1]);
2636 128942 : transpose_16x16_avx2(out, in);
2637 128942 : fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2638 128942 : transpose_16x16_avx2(out, in);
2639 128941 : write_buffer_16x16(in, coeff);
2640 128941 : break;
2641 129330 : case FLIPADST_ADST:
2642 129330 : load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
2643 129332 : fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2644 129332 : col_txfm_16x16_rounding(out, -shift[1]);
2645 129331 : transpose_16x16_avx2(out, in);
2646 129332 : fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2647 129331 : transpose_16x16_avx2(out, in);
2648 129332 : write_buffer_16x16(in, coeff);
2649 129332 : break;
2650 135833 : case V_DCT:
2651 135833 : load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2652 135833 : fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2653 135833 : col_txfm_16x16_rounding(out, -shift[1]);
2654 135832 : fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2655 135833 : write_buffer_16x16(out, coeff);
2656 135833 : break;
2657 132700 : case H_DCT:
2658 132700 : load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2659 132700 : fidtx16x16_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2660 132699 : col_txfm_16x16_rounding(in, -shift[1]);
2661 132699 : transpose_16x16_avx2(in, out);
2662 132700 : fdct16x16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2663 132700 : transpose_16x16_avx2(in, out);
2664 132700 : write_buffer_16x16(out, coeff);
2665 132700 : break;
2666 0 : case V_ADST:
2667 0 : load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2668 0 : fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2669 0 : col_txfm_16x16_rounding(out, -shift[1]);
2670 0 : fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2671 0 : write_buffer_16x16(out, coeff);
2672 0 : break;
2673 0 : case H_ADST:
2674 0 : load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2675 0 : fidtx16x16_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2676 0 : col_txfm_16x16_rounding(in, -shift[1]);
2677 0 : transpose_16x16_avx2(in, out);
2678 0 : fadst16x16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2679 0 : transpose_16x16_avx2(in, out);
2680 0 : write_buffer_16x16(out, coeff);
2681 0 : break;
2682 0 : case V_FLIPADST:
2683 0 : load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
2684 0 : fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2685 0 : col_txfm_16x16_rounding(out, -shift[1]);
2686 0 : fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2687 0 : write_buffer_16x16(out, coeff);
2688 0 : break;
2689 0 : case H_FLIPADST:
2690 0 : load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
2691 0 : fidtx16x16_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2692 0 : col_txfm_16x16_rounding(in, -shift[1]);
2693 0 : transpose_16x16_avx2(in, out);
2694 0 : fadst16x16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2695 0 : transpose_16x16_avx2(in, out);
2696 0 : write_buffer_16x16(out, coeff);
2697 0 : break;
2698 0 : default: assert(0);
2699 : }
2700 : (void)bd;
2701 7145150 : }
2702 :
2703 13048600 : static void av1_fdct32_new_avx2(const __m256i *input, __m256i *output,
2704 : int8_t cos_bit, const int32_t col_num, const int32_t stride) {
2705 13048600 : const int32_t *cospi = cospi_arr(cos_bit);
2706 13048500 : const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
2707 13048500 : const int32_t columns = col_num >> 3;
2708 :
2709 13048500 : __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
2710 13048500 : __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
2711 13048500 : __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
2712 13048500 : __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
2713 13048500 : __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
2714 13048500 : __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
2715 13048500 : __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
2716 13048500 : __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
2717 13048500 : __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
2718 13048500 : __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
2719 13048500 : __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
2720 13048500 : __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
2721 13048500 : __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
2722 13048500 : __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
2723 13048500 : __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
2724 13048500 : __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
2725 13048500 : __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
2726 13048500 : __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
2727 13048500 : __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
2728 13048500 : __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
2729 13048500 : __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
2730 13048500 : __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
2731 13048500 : __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
2732 13048500 : __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
2733 13048500 : __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
2734 13048500 : __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
2735 13048500 : __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
2736 13048500 : __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
2737 13048500 : __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
2738 13048500 : __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
2739 13048500 : __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
2740 13048500 : __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
2741 13048500 : __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
2742 13048500 : __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
2743 13048500 : __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
2744 13048500 : __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
2745 13048500 : __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
2746 13048500 : __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
2747 :
2748 : __m256i buf0[32];
2749 : __m256i buf1[32];
2750 :
2751 43311900 : for (int32_t col = 0; col < columns; col++) {
2752 30263400 : const __m256i *in = &input[col];
2753 30263400 : __m256i *out = &output[col];
2754 :
2755 : // stage 0
2756 : // stage 1
2757 30263400 : buf1[0] = _mm256_add_epi32(in[0 * stride], in[31 * stride]);
2758 30263400 : buf1[31] = _mm256_sub_epi32(in[0 * stride], in[31 * stride]);
2759 30263400 : buf1[1] = _mm256_add_epi32(in[1 * stride], in[30 * stride]);
2760 30263400 : buf1[30] = _mm256_sub_epi32(in[1 * stride], in[30 * stride]);
2761 30263400 : buf1[2] = _mm256_add_epi32(in[2 * stride], in[29 * stride]);
2762 30263400 : buf1[29] = _mm256_sub_epi32(in[2 * stride], in[29 * stride]);
2763 30263400 : buf1[3] = _mm256_add_epi32(in[3 * stride], in[28 * stride]);
2764 30263400 : buf1[28] = _mm256_sub_epi32(in[3 * stride], in[28 * stride]);
2765 30263400 : buf1[4] = _mm256_add_epi32(in[4 * stride], in[27 * stride]);
2766 30263400 : buf1[27] = _mm256_sub_epi32(in[4 * stride], in[27 * stride]);
2767 30263400 : buf1[5] = _mm256_add_epi32(in[5 * stride], in[26 * stride]);
2768 30263400 : buf1[26] = _mm256_sub_epi32(in[5 * stride], in[26 * stride]);
2769 30263400 : buf1[6] = _mm256_add_epi32(in[6 * stride], in[25 * stride]);
2770 30263400 : buf1[25] = _mm256_sub_epi32(in[6 * stride], in[25 * stride]);
2771 30263400 : buf1[7] = _mm256_add_epi32(in[7 * stride], in[24 * stride]);
2772 30263400 : buf1[24] = _mm256_sub_epi32(in[7 * stride], in[24 * stride]);
2773 30263400 : buf1[8] = _mm256_add_epi32(in[8 * stride], in[23 * stride]);
2774 30263400 : buf1[23] = _mm256_sub_epi32(in[8 * stride], in[23 * stride]);
2775 30263400 : buf1[9] = _mm256_add_epi32(in[9 * stride], in[22 * stride]);
2776 30263400 : buf1[22] = _mm256_sub_epi32(in[9 * stride], in[22 * stride]);
2777 30263400 : buf1[10] = _mm256_add_epi32(in[10 * stride], in[21 * stride]);
2778 30263400 : buf1[21] = _mm256_sub_epi32(in[10 * stride], in[21 * stride]);
2779 30263400 : buf1[11] = _mm256_add_epi32(in[11 * stride], in[20 * stride]);
2780 30263400 : buf1[20] = _mm256_sub_epi32(in[11 * stride], in[20 * stride]);
2781 30263400 : buf1[12] = _mm256_add_epi32(in[12 * stride], in[19 * stride]);
2782 30263400 : buf1[19] = _mm256_sub_epi32(in[12 * stride], in[19 * stride]);
2783 30263400 : buf1[13] = _mm256_add_epi32(in[13 * stride], in[18 * stride]);
2784 30263400 : buf1[18] = _mm256_sub_epi32(in[13 * stride], in[18 * stride]);
2785 30263400 : buf1[14] = _mm256_add_epi32(in[14 * stride], in[17 * stride]);
2786 30263400 : buf1[17] = _mm256_sub_epi32(in[14 * stride], in[17 * stride]);
2787 30263400 : buf1[15] = _mm256_add_epi32(in[15 * stride], in[16 * stride]);
2788 30263400 : buf1[16] = _mm256_sub_epi32(in[15 * stride], in[16 * stride]);
2789 :
2790 : // stage 2
2791 30263400 : buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]);
2792 30263400 : buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
2793 30263400 : buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]);
2794 30263400 : buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
2795 30263400 : buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]);
2796 30263400 : buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
2797 30263400 : buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]);
2798 30263400 : buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
2799 30263400 : buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]);
2800 30263400 : buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
2801 30263400 : buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]);
2802 30263400 : buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
2803 30263400 : buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]);
2804 30263400 : buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]);
2805 30263400 : buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]);
2806 30263400 : buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]);
2807 30263400 : buf0[16] = buf1[16];
2808 30263400 : buf0[17] = buf1[17];
2809 30263400 : buf0[18] = buf1[18];
2810 30263400 : buf0[19] = buf1[19];
2811 302634000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf1[20], buf1[27],
2812 : buf0[20], buf0[27], __rounding, cos_bit);
2813 302634000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf1[21], buf1[26],
2814 : buf0[21], buf0[26], __rounding, cos_bit);
2815 302634000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf1[22], buf1[25],
2816 : buf0[22], buf0[25], __rounding, cos_bit);
2817 302634000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf1[23], buf1[24],
2818 : buf0[23], buf0[24], __rounding, cos_bit);
2819 30263400 : buf0[28] = buf1[28];
2820 30263400 : buf0[29] = buf1[29];
2821 30263400 : buf0[30] = buf1[30];
2822 30263400 : buf0[31] = buf1[31];
2823 :
2824 : // stage 3
2825 30263400 : buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
2826 30263400 : buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
2827 30263400 : buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
2828 30263400 : buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
2829 30263400 : buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
2830 30263400 : buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
2831 30263400 : buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
2832 30263400 : buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
2833 30263400 : buf1[8] = buf0[8];
2834 30263400 : buf1[9] = buf0[9];
2835 302634000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf0[10], buf0[13],
2836 : buf1[10], buf1[13], __rounding, cos_bit);
2837 302634000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf0[11], buf0[12],
2838 : buf1[11], buf1[12], __rounding, cos_bit);
2839 30263400 : buf1[14] = buf0[14];
2840 30263400 : buf1[15] = buf0[15];
2841 30263400 : buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
2842 30263400 : buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
2843 30263400 : buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
2844 30263400 : buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
2845 30263400 : buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
2846 30263400 : buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
2847 30263400 : buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
2848 30263400 : buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
2849 30263400 : buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
2850 30263400 : buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
2851 30263400 : buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
2852 30263400 : buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
2853 30263400 : buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
2854 30263400 : buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
2855 30263400 : buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
2856 30263400 : buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);
2857 :
2858 : // stage 4
2859 30263400 : buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
2860 30263400 : buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
2861 30263400 : buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
2862 30263400 : buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
2863 30263400 : buf0[4] = buf1[4];
2864 302634000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf1[5], buf1[6],
2865 : buf0[5], buf0[6], __rounding, cos_bit);
2866 30263400 : buf0[7] = buf1[7];
2867 30263400 : buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]);
2868 30263400 : buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
2869 30263400 : buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]);
2870 30263400 : buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
2871 30263400 : buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
2872 30263400 : buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
2873 30263400 : buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
2874 30263400 : buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
2875 30263400 : buf0[16] = buf1[16];
2876 30263400 : buf0[17] = buf1[17];
2877 302634000 : btf_32_type0_avx2_new(cospi_m16, cospi_p48, buf1[18], buf1[29],
2878 : buf0[18], buf0[29], __rounding, cos_bit);
2879 302634000 : btf_32_type0_avx2_new(cospi_m16, cospi_p48, buf1[19], buf1[28],
2880 : buf0[19], buf0[28], __rounding, cos_bit);
2881 302634000 : btf_32_type0_avx2_new(cospi_m48, cospi_m16, buf1[20], buf1[27],
2882 : buf0[20], buf0[27], __rounding, cos_bit);
2883 302634000 : btf_32_type0_avx2_new(cospi_m48, cospi_m16, buf1[21], buf1[26],
2884 : buf0[21], buf0[26], __rounding, cos_bit);
2885 30263400 : buf0[22] = buf1[22];
2886 30263400 : buf0[23] = buf1[23];
2887 30263400 : buf0[24] = buf1[24];
2888 30263400 : buf0[25] = buf1[25];
2889 30263400 : buf0[30] = buf1[30];
2890 30263400 : buf0[31] = buf1[31];
2891 :
2892 : // stage 5
2893 302634000 : btf_32_type0_avx2_new(cospi_p32, cospi_p32, buf0[0], buf0[1],
2894 : buf1[0], buf1[1], __rounding, cos_bit);
2895 302634000 : btf_32_type1_avx2_new(cospi_p48, cospi_p16, buf0[2], buf0[3],
2896 : buf1[2], buf1[3], __rounding, cos_bit);
2897 30263400 : buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
2898 30263400 : buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
2899 30263400 : buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
2900 30263400 : buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
2901 30263400 : buf1[8] = buf0[8];
2902 302634000 : btf_32_type0_avx2_new(cospi_m16, cospi_p48, buf0[9], buf0[14],
2903 : buf1[9], buf1[14], __rounding, cos_bit);
2904 302634000 : btf_32_type0_avx2_new(cospi_m48, cospi_m16, buf0[10], buf0[13],
2905 : buf1[10], buf1[13], __rounding, cos_bit);
2906 30263400 : buf1[11] = buf0[11];
2907 30263400 : buf1[12] = buf0[12];
2908 30263400 : buf1[15] = buf0[15];
2909 30263400 : buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
2910 30263400 : buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
2911 30263400 : buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
2912 30263400 : buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
2913 30263400 : buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
2914 30263400 : buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
2915 30263400 : buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
2916 30263400 : buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
2917 30263400 : buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
2918 30263400 : buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
2919 30263400 : buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
2920 30263400 : buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
2921 30263400 : buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
2922 30263400 : buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
2923 30263400 : buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
2924 30263400 : buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);
2925 :
2926 : // stage 6
2927 30263400 : buf0[0] = buf1[0];
2928 30263400 : buf0[1] = buf1[1];
2929 30263400 : buf0[2] = buf1[2];
2930 30263400 : buf0[3] = buf1[3];
2931 302634000 : btf_32_type1_avx2_new(cospi_p56, cospi_p08, buf1[4], buf1[7],
2932 : buf0[4], buf0[7], __rounding, cos_bit);
2933 302634000 : btf_32_type1_avx2_new(cospi_p24, cospi_p40, buf1[5], buf1[6],
2934 : buf0[5], buf0[6], __rounding, cos_bit);
2935 30263400 : buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]);
2936 30263400 : buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]);
2937 30263400 : buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
2938 30263400 : buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
2939 30263400 : buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
2940 30263400 : buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
2941 30263400 : buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
2942 30263400 : buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
2943 30263400 : buf0[16] = buf1[16];
2944 302634000 : btf_32_type0_avx2_new(cospi_m08, cospi_p56, buf1[17], buf1[30],
2945 : buf0[17], buf0[30], __rounding, cos_bit);
2946 302634000 : btf_32_type0_avx2_new(cospi_m56, cospi_m08, buf1[18], buf1[29],
2947 : buf0[18],
2948 : buf0[29], __rounding, cos_bit);
2949 30263400 : buf0[19] = buf1[19];
2950 30263400 : buf0[20] = buf1[20];
2951 302634000 : btf_32_type0_avx2_new(cospi_m40, cospi_p24, buf1[21], buf1[26],
2952 : buf0[21], buf0[26], __rounding, cos_bit);
2953 302634000 : btf_32_type0_avx2_new(cospi_m24, cospi_m40, buf1[22], buf1[25],
2954 : buf0[22], buf0[25], __rounding, cos_bit);
2955 30263400 : buf0[23] = buf1[23];
2956 30263400 : buf0[24] = buf1[24];
2957 30263400 : buf0[27] = buf1[27];
2958 30263400 : buf0[28] = buf1[28];
2959 30263400 : buf0[31] = buf1[31];
2960 :
2961 : // stage 7
2962 30263400 : buf1[0] = buf0[0];
2963 30263400 : buf1[1] = buf0[1];
2964 30263400 : buf1[2] = buf0[2];
2965 30263400 : buf1[3] = buf0[3];
2966 30263400 : buf1[4] = buf0[4];
2967 30263400 : buf1[5] = buf0[5];
2968 30263400 : buf1[6] = buf0[6];
2969 30263400 : buf1[7] = buf0[7];
2970 302634000 : btf_32_type1_avx2_new(cospi_p60, cospi_p04, buf0[8], buf0[15],
2971 : buf1[8], buf1[15], __rounding, cos_bit);
2972 302634000 : btf_32_type1_avx2_new(cospi_p28, cospi_p36, buf0[9], buf0[14],
2973 : buf1[9], buf1[14], __rounding, cos_bit);
2974 302634000 : btf_32_type1_avx2_new(cospi_p44, cospi_p20, buf0[10], buf0[13],
2975 : buf1[10], buf1[13], __rounding, cos_bit);
2976 302634000 : btf_32_type1_avx2_new(cospi_p12, cospi_p52, buf0[11], buf0[12],
2977 : buf1[11], buf1[12], __rounding, cos_bit);
2978 30263400 : buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
2979 30263400 : buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
2980 30263400 : buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
2981 30263400 : buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
2982 30263400 : buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
2983 30263400 : buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
2984 30263400 : buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
2985 30263400 : buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
2986 30263400 : buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
2987 30263400 : buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
2988 30263400 : buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
2989 30263400 : buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
2990 30263400 : buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
2991 30263400 : buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
2992 30263400 : buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
2993 30263400 : buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);
2994 :
2995 : // stage 8
2996 30263400 : buf0[0] = buf1[0];
2997 30263400 : buf0[1] = buf1[1];
2998 30263400 : buf0[2] = buf1[2];
2999 30263400 : buf0[3] = buf1[3];
3000 30263400 : buf0[4] = buf1[4];
3001 30263400 : buf0[5] = buf1[5];
3002 30263400 : buf0[6] = buf1[6];
3003 30263400 : buf0[7] = buf1[7];
3004 30263400 : buf0[8] = buf1[8];
3005 30263400 : buf0[9] = buf1[9];
3006 30263400 : buf0[10] = buf1[10];
3007 30263400 : buf0[11] = buf1[11];
3008 30263400 : buf0[12] = buf1[12];
3009 30263400 : buf0[13] = buf1[13];
3010 30263400 : buf0[14] = buf1[14];
3011 30263400 : buf0[15] = buf1[15];
3012 302634000 : btf_32_type1_avx2_new(cospi_p62, cospi_p02, buf1[16], buf1[31],
3013 : buf0[16], buf0[31], __rounding, cos_bit);
3014 302634000 : btf_32_type1_avx2_new(cospi_p30, cospi_p34, buf1[17], buf1[30],
3015 : buf0[17], buf0[30], __rounding, cos_bit);
3016 302634000 : btf_32_type1_avx2_new(cospi_p46, cospi_p18, buf1[18], buf1[29],
3017 : buf0[18], buf0[29], __rounding, cos_bit);
3018 302634000 : btf_32_type1_avx2_new(cospi_p14, cospi_p50, buf1[19], buf1[28],
3019 : buf0[19], buf0[28], __rounding, cos_bit);
3020 302634000 : btf_32_type1_avx2_new(cospi_p54, cospi_p10, buf1[20], buf1[27],
3021 : buf0[20], buf0[27], __rounding, cos_bit);
3022 302634000 : btf_32_type1_avx2_new(cospi_p22, cospi_p42, buf1[21], buf1[26],
3023 : buf0[21], buf0[26], __rounding, cos_bit);
3024 302634000 : btf_32_type1_avx2_new(cospi_p38, cospi_p26, buf1[22], buf1[25],
3025 : buf0[22], buf0[25], __rounding, cos_bit);
3026 302634000 : btf_32_type1_avx2_new(cospi_p06, cospi_p58, buf1[23], buf1[24],
3027 : buf0[23], buf0[24], __rounding, cos_bit);
3028 :
3029 : // stage 9
3030 30263400 : out[0 * stride] = buf0[0];
3031 30263400 : out[1 * stride] = buf0[16];
3032 30263400 : out[2 * stride] = buf0[8];
3033 30263400 : out[3 * stride] = buf0[24];
3034 30263400 : out[4 * stride] = buf0[4];
3035 30263400 : out[5 * stride] = buf0[20];
3036 30263400 : out[6 * stride] = buf0[12];
3037 30263400 : out[7 * stride] = buf0[28];
3038 30263400 : out[8 * stride] = buf0[2];
3039 30263400 : out[9 * stride] = buf0[18];
3040 30263400 : out[10 * stride] = buf0[10];
3041 30263400 : out[11 * stride] = buf0[26];
3042 30263400 : out[12 * stride] = buf0[6];
3043 30263400 : out[13 * stride] = buf0[22];
3044 30263400 : out[14 * stride] = buf0[14];
3045 30263400 : out[15 * stride] = buf0[30];
3046 30263400 : out[16 * stride] = buf0[1];
3047 30263400 : out[17 * stride] = buf0[17];
3048 30263400 : out[18 * stride] = buf0[9];
3049 30263400 : out[19 * stride] = buf0[25];
3050 30263400 : out[20 * stride] = buf0[5];
3051 30263400 : out[21 * stride] = buf0[21];
3052 30263400 : out[22 * stride] = buf0[13];
3053 30263400 : out[23 * stride] = buf0[29];
3054 30263400 : out[24 * stride] = buf0[3];
3055 30263400 : out[25 * stride] = buf0[19];
3056 30263400 : out[26 * stride] = buf0[11];
3057 30263400 : out[27 * stride] = buf0[27];
3058 30263400 : out[28 * stride] = buf0[7];
3059 30263400 : out[29 * stride] = buf0[23];
3060 30263400 : out[30 * stride] = buf0[15];
3061 30263400 : out[31 * stride] = buf0[31];
3062 : }
3063 13048500 : }
3064 :
3065 8737700 : static void av1_fdct32_new_line_wraper_avx2(const __m256i *input,
3066 : __m256i *output, int8_t cos_bit, const int32_t stride) {
3067 8737700 : av1_fdct32_new_avx2(input, output, cos_bit, 8, stride);
3068 8737820 : }
3069 :
3070 3344550 : static void av1_fdct64_new_avx2(const __m256i *input, __m256i *output,
3071 : int8_t cos_bit, const int32_t col_num, const int32_t stride) {
3072 3344550 : const int32_t *cospi = cospi_arr(cos_bit);
3073 3344570 : const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
3074 3344570 : const int32_t columns = col_num >> 3;
3075 :
3076 3344570 : __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
3077 3344570 : __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
3078 3344570 : __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
3079 3344570 : __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
3080 3344570 : __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
3081 3344570 : __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
3082 3344570 : __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
3083 3344570 : __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
3084 3344570 : __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
3085 3344570 : __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
3086 3344570 : __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
3087 3344570 : __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
3088 3344570 : __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
3089 3344570 : __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
3090 3344570 : __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
3091 3344570 : __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
3092 3344570 : __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
3093 3344570 : __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
3094 3344570 : __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
3095 3344570 : __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
3096 3344570 : __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
3097 3344570 : __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
3098 3344570 : __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
3099 3344570 : __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
3100 3344570 : __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
3101 3344570 : __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
3102 3344570 : __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
3103 3344570 : __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
3104 3344570 : __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
3105 3344570 : __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
3106 3344570 : __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
3107 3344570 : __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
3108 3344570 : __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
3109 3344570 : __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
3110 3344570 : __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
3111 3344570 : __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
3112 3344570 : __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
3113 3344570 : __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
3114 3344570 : __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
3115 3344570 : __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
3116 3344570 : __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
3117 3344570 : __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
3118 3344570 : __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
3119 3344570 : __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
3120 3344570 : __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
3121 3344570 : __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
3122 3344570 : __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
3123 3344570 : __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
3124 3344570 : __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
3125 3344570 : __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
3126 3344570 : __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
3127 3344570 : __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
3128 3344570 : __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
3129 3344570 : __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
3130 3344570 : __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
3131 3344570 : __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
3132 3344570 : __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
3133 3344570 : __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
3134 3344570 : __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
3135 3344570 : __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
3136 3344570 : __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
3137 3344570 : __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
3138 3344570 : __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
3139 3344570 : __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
3140 3344570 : __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
3141 3344570 : __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
3142 3344570 : __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
3143 3344570 : __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
3144 3344570 : __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
3145 3344570 : __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
3146 3344570 : __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
3147 3344570 : __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
3148 3344570 : __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
3149 3344570 : __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
3150 3344570 : __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
3151 3344570 : __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
3152 3344570 : __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
3153 3344570 : __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
3154 :
3155 15943600 : for (int32_t col = 0; col < columns; col++) {
3156 12599000 : const __m256i *in = &input[col];
3157 12599000 : __m256i *out = &output[col];
3158 :
3159 : // stage 1
3160 : __m256i x1[64];
3161 12599000 : x1[0] = _mm256_add_epi32(in[0 * stride], in[63 * stride]);
3162 12599000 : x1[63] = _mm256_sub_epi32(in[0 * stride], in[63 * stride]);
3163 12599000 : x1[1] = _mm256_add_epi32(in[1 * stride], in[62 * stride]);
3164 12599000 : x1[62] = _mm256_sub_epi32(in[1 * stride], in[62 * stride]);
3165 12599000 : x1[2] = _mm256_add_epi32(in[2 * stride], in[61 * stride]);
3166 12599000 : x1[61] = _mm256_sub_epi32(in[2 * stride], in[61 * stride]);
3167 12599000 : x1[3] = _mm256_add_epi32(in[3 * stride], in[60 * stride]);
3168 12599000 : x1[60] = _mm256_sub_epi32(in[3 * stride], in[60 * stride]);
3169 12599000 : x1[4] = _mm256_add_epi32(in[4 * stride], in[59 * stride]);
3170 12599000 : x1[59] = _mm256_sub_epi32(in[4 * stride], in[59 * stride]);
3171 12599000 : x1[5] = _mm256_add_epi32(in[5 * stride], in[58 * stride]);
3172 12599000 : x1[58] = _mm256_sub_epi32(in[5 * stride], in[58 * stride]);
3173 12599000 : x1[6] = _mm256_add_epi32(in[6 * stride], in[57 * stride]);
3174 12599000 : x1[57] = _mm256_sub_epi32(in[6 * stride], in[57 * stride]);
3175 12599000 : x1[7] = _mm256_add_epi32(in[7 * stride], in[56 * stride]);
3176 12599000 : x1[56] = _mm256_sub_epi32(in[7 * stride], in[56 * stride]);
3177 12599000 : x1[8] = _mm256_add_epi32(in[8 * stride], in[55 * stride]);
3178 12599000 : x1[55] = _mm256_sub_epi32(in[8 * stride], in[55 * stride]);
3179 12599000 : x1[9] = _mm256_add_epi32(in[9 * stride], in[54 * stride]);
3180 12599000 : x1[54] = _mm256_sub_epi32(in[9 * stride], in[54 * stride]);
3181 12599000 : x1[10] = _mm256_add_epi32(in[10 * stride], in[53 * stride]);
3182 12599000 : x1[53] = _mm256_sub_epi32(in[10 * stride], in[53 * stride]);
3183 12599000 : x1[11] = _mm256_add_epi32(in[11 * stride], in[52 * stride]);
3184 12599000 : x1[52] = _mm256_sub_epi32(in[11 * stride], in[52 * stride]);
3185 12599000 : x1[12] = _mm256_add_epi32(in[12 * stride], in[51 * stride]);
3186 12599000 : x1[51] = _mm256_sub_epi32(in[12 * stride], in[51 * stride]);
3187 12599000 : x1[13] = _mm256_add_epi32(in[13 * stride], in[50 * stride]);
3188 12599000 : x1[50] = _mm256_sub_epi32(in[13 * stride], in[50 * stride]);
3189 12599000 : x1[14] = _mm256_add_epi32(in[14 * stride], in[49 * stride]);
3190 12599000 : x1[49] = _mm256_sub_epi32(in[14 * stride], in[49 * stride]);
3191 12599000 : x1[15] = _mm256_add_epi32(in[15 * stride], in[48 * stride]);
3192 12599000 : x1[48] = _mm256_sub_epi32(in[15 * stride], in[48 * stride]);
3193 12599000 : x1[16] = _mm256_add_epi32(in[16 * stride], in[47 * stride]);
3194 12599000 : x1[47] = _mm256_sub_epi32(in[16 * stride], in[47 * stride]);
3195 12599000 : x1[17] = _mm256_add_epi32(in[17 * stride], in[46 * stride]);
3196 12599000 : x1[46] = _mm256_sub_epi32(in[17 * stride], in[46 * stride]);
3197 12599000 : x1[18] = _mm256_add_epi32(in[18 * stride], in[45 * stride]);
3198 12599000 : x1[45] = _mm256_sub_epi32(in[18 * stride], in[45 * stride]);
3199 12599000 : x1[19] = _mm256_add_epi32(in[19 * stride], in[44 * stride]);
3200 12599000 : x1[44] = _mm256_sub_epi32(in[19 * stride], in[44 * stride]);
3201 12599000 : x1[20] = _mm256_add_epi32(in[20 * stride], in[43 * stride]);
3202 12599000 : x1[43] = _mm256_sub_epi32(in[20 * stride], in[43 * stride]);
3203 12599000 : x1[21] = _mm256_add_epi32(in[21 * stride], in[42 * stride]);
3204 12599000 : x1[42] = _mm256_sub_epi32(in[21 * stride], in[42 * stride]);
3205 12599000 : x1[22] = _mm256_add_epi32(in[22 * stride], in[41 * stride]);
3206 12599000 : x1[41] = _mm256_sub_epi32(in[22 * stride], in[41 * stride]);
3207 12599000 : x1[23] = _mm256_add_epi32(in[23 * stride], in[40 * stride]);
3208 12599000 : x1[40] = _mm256_sub_epi32(in[23 * stride], in[40 * stride]);
3209 12599000 : x1[24] = _mm256_add_epi32(in[24 * stride], in[39 * stride]);
3210 12599000 : x1[39] = _mm256_sub_epi32(in[24 * stride], in[39 * stride]);
3211 12599000 : x1[25] = _mm256_add_epi32(in[25 * stride], in[38 * stride]);
3212 12599000 : x1[38] = _mm256_sub_epi32(in[25 * stride], in[38 * stride]);
3213 12599000 : x1[26] = _mm256_add_epi32(in[26 * stride], in[37 * stride]);
3214 12599000 : x1[37] = _mm256_sub_epi32(in[26 * stride], in[37 * stride]);
3215 12599000 : x1[27] = _mm256_add_epi32(in[27 * stride], in[36 * stride]);
3216 12599000 : x1[36] = _mm256_sub_epi32(in[27 * stride], in[36 * stride]);
3217 12599000 : x1[28] = _mm256_add_epi32(in[28 * stride], in[35 * stride]);
3218 12599000 : x1[35] = _mm256_sub_epi32(in[28 * stride], in[35 * stride]);
3219 12599000 : x1[29] = _mm256_add_epi32(in[29 * stride], in[34 * stride]);
3220 12599000 : x1[34] = _mm256_sub_epi32(in[29 * stride], in[34 * stride]);
3221 12599000 : x1[30] = _mm256_add_epi32(in[30 * stride], in[33 * stride]);
3222 12599000 : x1[33] = _mm256_sub_epi32(in[30 * stride], in[33 * stride]);
3223 12599000 : x1[31] = _mm256_add_epi32(in[31 * stride], in[32 * stride]);
3224 12599000 : x1[32] = _mm256_sub_epi32(in[31 * stride], in[32 * stride]);
3225 :
3226 : // stage 2
3227 : __m256i x2[64];
3228 12599000 : x2[0] = _mm256_add_epi32(x1[0], x1[31]);
3229 12599000 : x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
3230 12599000 : x2[1] = _mm256_add_epi32(x1[1], x1[30]);
3231 12599000 : x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
3232 12599000 : x2[2] = _mm256_add_epi32(x1[2], x1[29]);
3233 12599000 : x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
3234 12599000 : x2[3] = _mm256_add_epi32(x1[3], x1[28]);
3235 12599000 : x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
3236 12599000 : x2[4] = _mm256_add_epi32(x1[4], x1[27]);
3237 12599000 : x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
3238 12599000 : x2[5] = _mm256_add_epi32(x1[5], x1[26]);
3239 12599000 : x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
3240 12599000 : x2[6] = _mm256_add_epi32(x1[6], x1[25]);
3241 12599000 : x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
3242 12599000 : x2[7] = _mm256_add_epi32(x1[7], x1[24]);
3243 12599000 : x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
3244 12599000 : x2[8] = _mm256_add_epi32(x1[8], x1[23]);
3245 12599000 : x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
3246 12599000 : x2[9] = _mm256_add_epi32(x1[9], x1[22]);
3247 12599000 : x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
3248 12599000 : x2[10] = _mm256_add_epi32(x1[10], x1[21]);
3249 12599000 : x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
3250 12599000 : x2[11] = _mm256_add_epi32(x1[11], x1[20]);
3251 12599000 : x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
3252 12599000 : x2[12] = _mm256_add_epi32(x1[12], x1[19]);
3253 12599000 : x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
3254 12599000 : x2[13] = _mm256_add_epi32(x1[13], x1[18]);
3255 12599000 : x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
3256 12599000 : x2[14] = _mm256_add_epi32(x1[14], x1[17]);
3257 12599000 : x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
3258 12599000 : x2[15] = _mm256_add_epi32(x1[15], x1[16]);
3259 12599000 : x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
3260 12599000 : x2[32] = x1[32];
3261 12599000 : x2[33] = x1[33];
3262 12599000 : x2[34] = x1[34];
3263 12599000 : x2[35] = x1[35];
3264 12599000 : x2[36] = x1[36];
3265 12599000 : x2[37] = x1[37];
3266 12599000 : x2[38] = x1[38];
3267 12599000 : x2[39] = x1[39];
3268 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[40], x1[55],
3269 : x2[40], x2[55], __rounding, cos_bit);
3270 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[41], x1[54],
3271 : x2[41], x2[54], __rounding, cos_bit);
3272 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[42], x1[53],
3273 : x2[42], x2[53], __rounding, cos_bit);
3274 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[43], x1[52],
3275 : x2[43], x2[52], __rounding, cos_bit);
3276 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[44], x1[51],
3277 : x2[44], x2[51], __rounding, cos_bit);
3278 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[45], x1[50],
3279 : x2[45], x2[50], __rounding, cos_bit);
3280 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[46], x1[49],
3281 : x2[46], x2[49], __rounding, cos_bit);
3282 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[47], x1[48],
3283 : x2[47], x2[48], __rounding, cos_bit);
3284 12599000 : x2[56] = x1[56];
3285 12599000 : x2[57] = x1[57];
3286 12599000 : x2[58] = x1[58];
3287 12599000 : x2[59] = x1[59];
3288 12599000 : x2[60] = x1[60];
3289 12599000 : x2[61] = x1[61];
3290 12599000 : x2[62] = x1[62];
3291 12599000 : x2[63] = x1[63];
3292 :
3293 : // stage 3
3294 : __m256i x3[64];
3295 12599000 : x3[0] = _mm256_add_epi32(x2[0], x2[15]);
3296 12599000 : x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
3297 12599000 : x3[1] = _mm256_add_epi32(x2[1], x2[14]);
3298 12599000 : x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
3299 12599000 : x3[2] = _mm256_add_epi32(x2[2], x2[13]);
3300 12599000 : x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
3301 12599000 : x3[3] = _mm256_add_epi32(x2[3], x2[12]);
3302 12599000 : x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
3303 12599000 : x3[4] = _mm256_add_epi32(x2[4], x2[11]);
3304 12599000 : x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
3305 12599000 : x3[5] = _mm256_add_epi32(x2[5], x2[10]);
3306 12599000 : x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
3307 12599000 : x3[6] = _mm256_add_epi32(x2[6], x2[9]);
3308 12599000 : x3[9] = _mm256_sub_epi32(x2[6], x2[9]);
3309 12599000 : x3[7] = _mm256_add_epi32(x2[7], x2[8]);
3310 12599000 : x3[8] = _mm256_sub_epi32(x2[7], x2[8]);
3311 12599000 : x3[16] = x2[16];
3312 12599000 : x3[17] = x2[17];
3313 12599000 : x3[18] = x2[18];
3314 12599000 : x3[19] = x2[19];
3315 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x2[20], x2[27],
3316 : x3[20], x3[27], __rounding, cos_bit);
3317 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x2[21], x2[26],
3318 : x3[21], x3[26], __rounding, cos_bit);
3319 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x2[22], x2[25],
3320 : x3[22], x3[25], __rounding, cos_bit);
3321 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x2[23], x2[24],
3322 : x3[23], x3[24], __rounding, cos_bit);
3323 12599000 : x3[28] = x2[28];
3324 12599000 : x3[29] = x2[29];
3325 12599000 : x3[30] = x2[30];
3326 12599000 : x3[31] = x2[31];
3327 12599000 : x3[32] = _mm256_add_epi32(x2[32], x2[47]);
3328 12599000 : x3[47] = _mm256_sub_epi32(x2[32], x2[47]);
3329 12599000 : x3[33] = _mm256_add_epi32(x2[33], x2[46]);
3330 12599000 : x3[46] = _mm256_sub_epi32(x2[33], x2[46]);
3331 12599000 : x3[34] = _mm256_add_epi32(x2[34], x2[45]);
3332 12599000 : x3[45] = _mm256_sub_epi32(x2[34], x2[45]);
3333 12599000 : x3[35] = _mm256_add_epi32(x2[35], x2[44]);
3334 12599000 : x3[44] = _mm256_sub_epi32(x2[35], x2[44]);
3335 12599000 : x3[36] = _mm256_add_epi32(x2[36], x2[43]);
3336 12599000 : x3[43] = _mm256_sub_epi32(x2[36], x2[43]);
3337 12599000 : x3[37] = _mm256_add_epi32(x2[37], x2[42]);
3338 12599000 : x3[42] = _mm256_sub_epi32(x2[37], x2[42]);
3339 12599000 : x3[38] = _mm256_add_epi32(x2[38], x2[41]);
3340 12599000 : x3[41] = _mm256_sub_epi32(x2[38], x2[41]);
3341 12599000 : x3[39] = _mm256_add_epi32(x2[39], x2[40]);
3342 12599000 : x3[40] = _mm256_sub_epi32(x2[39], x2[40]);
3343 12599000 : x3[48] = _mm256_sub_epi32(x2[63], x2[48]);
3344 12599000 : x3[63] = _mm256_add_epi32(x2[63], x2[48]);
3345 12599000 : x3[49] = _mm256_sub_epi32(x2[62], x2[49]);
3346 12599000 : x3[62] = _mm256_add_epi32(x2[62], x2[49]);
3347 12599000 : x3[50] = _mm256_sub_epi32(x2[61], x2[50]);
3348 12599000 : x3[61] = _mm256_add_epi32(x2[61], x2[50]);
3349 12599000 : x3[51] = _mm256_sub_epi32(x2[60], x2[51]);
3350 12599000 : x3[60] = _mm256_add_epi32(x2[60], x2[51]);
3351 12599000 : x3[52] = _mm256_sub_epi32(x2[59], x2[52]);
3352 12599000 : x3[59] = _mm256_add_epi32(x2[59], x2[52]);
3353 12599000 : x3[53] = _mm256_sub_epi32(x2[58], x2[53]);
3354 12599000 : x3[58] = _mm256_add_epi32(x2[58], x2[53]);
3355 12599000 : x3[54] = _mm256_sub_epi32(x2[57], x2[54]);
3356 12599000 : x3[57] = _mm256_add_epi32(x2[57], x2[54]);
3357 12599000 : x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
3358 12599000 : x3[56] = _mm256_add_epi32(x2[56], x2[55]);
3359 :
3360 : // stage 4
3361 : __m256i x4[64];
3362 12599000 : x4[0] = _mm256_add_epi32(x3[0], x3[7]);
3363 12599000 : x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
3364 12599000 : x4[1] = _mm256_add_epi32(x3[1], x3[6]);
3365 12599000 : x4[6] = _mm256_sub_epi32(x3[1], x3[6]);
3366 12599000 : x4[2] = _mm256_add_epi32(x3[2], x3[5]);
3367 12599000 : x4[5] = _mm256_sub_epi32(x3[2], x3[5]);
3368 12599000 : x4[3] = _mm256_add_epi32(x3[3], x3[4]);
3369 12599000 : x4[4] = _mm256_sub_epi32(x3[3], x3[4]);
3370 12599000 : x4[8] = x3[8];
3371 12599000 : x4[9] = x3[9];
3372 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x3[10], x3[13],
3373 : x4[10], x4[13], __rounding, cos_bit);
3374 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x3[11], x3[12],
3375 : x4[11], x4[12], __rounding, cos_bit);
3376 12599000 : x4[14] = x3[14];
3377 12599000 : x4[15] = x3[15];
3378 12599000 : x4[16] = _mm256_add_epi32(x3[16], x3[23]);
3379 12599000 : x4[23] = _mm256_sub_epi32(x3[16], x3[23]);
3380 12599000 : x4[17] = _mm256_add_epi32(x3[17], x3[22]);
3381 12599000 : x4[22] = _mm256_sub_epi32(x3[17], x3[22]);
3382 12599000 : x4[18] = _mm256_add_epi32(x3[18], x3[21]);
3383 12599000 : x4[21] = _mm256_sub_epi32(x3[18], x3[21]);
3384 12599000 : x4[19] = _mm256_add_epi32(x3[19], x3[20]);
3385 12599000 : x4[20] = _mm256_sub_epi32(x3[19], x3[20]);
3386 12599000 : x4[24] = _mm256_sub_epi32(x3[31], x3[24]);
3387 12599000 : x4[31] = _mm256_add_epi32(x3[31], x3[24]);
3388 12599000 : x4[25] = _mm256_sub_epi32(x3[30], x3[25]);
3389 12599000 : x4[30] = _mm256_add_epi32(x3[30], x3[25]);
3390 12599000 : x4[26] = _mm256_sub_epi32(x3[29], x3[26]);
3391 12599000 : x4[29] = _mm256_add_epi32(x3[29], x3[26]);
3392 12599000 : x4[27] = _mm256_sub_epi32(x3[28], x3[27]);
3393 12599000 : x4[28] = _mm256_add_epi32(x3[28], x3[27]);
3394 12599000 : x4[32] = x3[32];
3395 12599000 : x4[33] = x3[33];
3396 12599000 : x4[34] = x3[34];
3397 12599000 : x4[35] = x3[35];
3398 125990000 : btf_32_type0_avx2_new(cospi_m16, cospi_p48, x3[36], x3[59],
3399 : x4[36], x4[59], __rounding, cos_bit);
3400 125990000 : btf_32_type0_avx2_new(cospi_m16, cospi_p48, x3[37], x3[58],
3401 : x4[37], x4[58], __rounding, cos_bit);
3402 125990000 : btf_32_type0_avx2_new(cospi_m16, cospi_p48, x3[38], x3[57],
3403 : x4[38], x4[57], __rounding, cos_bit);
3404 125990000 : btf_32_type0_avx2_new(cospi_m16, cospi_p48, x3[39], x3[56],
3405 : x4[39], x4[56], __rounding, cos_bit);
3406 125990000 : btf_32_type0_avx2_new(cospi_m48, cospi_m16, x3[40], x3[55],
3407 : x4[40], x4[55], __rounding, cos_bit);
3408 125990000 : btf_32_type0_avx2_new(cospi_m48, cospi_m16, x3[41], x3[54],
3409 : x4[41], x4[54], __rounding, cos_bit);
3410 125990000 : btf_32_type0_avx2_new(cospi_m48, cospi_m16, x3[42], x3[53],
3411 : x4[42], x4[53], __rounding, cos_bit);
3412 125990000 : btf_32_type0_avx2_new(cospi_m48, cospi_m16, x3[43], x3[52],
3413 : x4[43], x4[52], __rounding, cos_bit);
3414 12599000 : x4[44] = x3[44];
3415 12599000 : x4[45] = x3[45];
3416 12599000 : x4[46] = x3[46];
3417 12599000 : x4[47] = x3[47];
3418 12599000 : x4[48] = x3[48];
3419 12599000 : x4[49] = x3[49];
3420 12599000 : x4[50] = x3[50];
3421 12599000 : x4[51] = x3[51];
3422 12599000 : x4[60] = x3[60];
3423 12599000 : x4[61] = x3[61];
3424 12599000 : x4[62] = x3[62];
3425 12599000 : x4[63] = x3[63];
3426 :
3427 : // stage 5
3428 : __m256i x5[64];
3429 12599000 : x5[0] = _mm256_add_epi32(x4[0], x4[3]);
3430 12599000 : x5[3] = _mm256_sub_epi32(x4[0], x4[3]);
3431 12599000 : x5[1] = _mm256_add_epi32(x4[1], x4[2]);
3432 12599000 : x5[2] = _mm256_sub_epi32(x4[1], x4[2]);
3433 12599000 : x5[4] = x4[4];
3434 125990000 : btf_32_type0_avx2_new(cospi_m32, cospi_p32, x4[5], x4[6],
3435 : x5[5], x5[6], __rounding, cos_bit);
3436 12599000 : x5[7] = x4[7];
3437 12599000 : x5[8] = _mm256_add_epi32(x4[8], x4[11]);
3438 12599000 : x5[11] = _mm256_sub_epi32(x4[8], x4[11]);
3439 12599000 : x5[9] = _mm256_add_epi32(x4[9], x4[10]);
3440 12599000 : x5[10] = _mm256_sub_epi32(x4[9], x4[10]);
3441 12599000 : x5[12] = _mm256_sub_epi32(x4[15], x4[12]);
3442 12599000 : x5[15] = _mm256_add_epi32(x4[15], x4[12]);
3443 12599000 : x5[13] = _mm256_sub_epi32(x4[14], x4[13]);
3444 12599000 : x5[14] = _mm256_add_epi32(x4[14], x4[13]);
3445 12599000 : x5[16] = x4[16];
3446 12599000 : x5[17] = x4[17];
3447 125990000 : btf_32_type0_avx2_new(cospi_m16, cospi_p48, x4[18], x4[29],
3448 : x5[18], x5[29], __rounding, cos_bit);
3449 125990000 : btf_32_type0_avx2_new(cospi_m16, cospi_p48, x4[19], x4[28],
3450 : x5[19], x5[28], __rounding, cos_bit);
3451 125990000 : btf_32_type0_avx2_new(cospi_m48, cospi_m16, x4[20], x4[27],
3452 : x5[20], x5[27], __rounding, cos_bit);
3453 125990000 : btf_32_type0_avx2_new(cospi_m48, cospi_m16, x4[21], x4[26],
3454 : x5[21], x5[26], __rounding, cos_bit);
3455 12599000 : x5[22] = x4[22];
3456 12599000 : x5[23] = x4[23];
3457 12599000 : x5[24] = x4[24];
3458 12599000 : x5[25] = x4[25];
3459 12599000 : x5[30] = x4[30];
3460 12599000 : x5[31] = x4[31];
3461 12599000 : x5[32] = _mm256_add_epi32(x4[32], x4[39]);
3462 12599000 : x5[39] = _mm256_sub_epi32(x4[32], x4[39]);
3463 12599000 : x5[33] = _mm256_add_epi32(x4[33], x4[38]);
3464 12599000 : x5[38] = _mm256_sub_epi32(x4[33], x4[38]);
3465 12599000 : x5[34] = _mm256_add_epi32(x4[34], x4[37]);
3466 12599000 : x5[37] = _mm256_sub_epi32(x4[34], x4[37]);
3467 12599000 : x5[35] = _mm256_add_epi32(x4[35], x4[36]);
3468 12599000 : x5[36] = _mm256_sub_epi32(x4[35], x4[36]);
3469 12599000 : x5[40] = _mm256_sub_epi32(x4[47], x4[40]);
3470 12599000 : x5[47] = _mm256_add_epi32(x4[47], x4[40]);
3471 12599000 : x5[41] = _mm256_sub_epi32(x4[46], x4[41]);
3472 12599000 : x5[46] = _mm256_add_epi32(x4[46], x4[41]);
3473 12599000 : x5[42] = _mm256_sub_epi32(x4[45], x4[42]);
3474 12599000 : x5[45] = _mm256_add_epi32(x4[45], x4[42]);
3475 12599000 : x5[43] = _mm256_sub_epi32(x4[44], x4[43]);
3476 12599000 : x5[44] = _mm256_add_epi32(x4[44], x4[43]);
3477 12599000 : x5[48] = _mm256_add_epi32(x4[48], x4[55]);
3478 12599000 : x5[55] = _mm256_sub_epi32(x4[48], x4[55]);
3479 12599000 : x5[49] = _mm256_add_epi32(x4[49], x4[54]);
3480 12599000 : x5[54] = _mm256_sub_epi32(x4[49], x4[54]);
3481 12599000 : x5[50] = _mm256_add_epi32(x4[50], x4[53]);
3482 12599000 : x5[53] = _mm256_sub_epi32(x4[50], x4[53]);
3483 12599000 : x5[51] = _mm256_add_epi32(x4[51], x4[52]);
3484 12599000 : x5[52] = _mm256_sub_epi32(x4[51], x4[52]);
3485 12599000 : x5[56] = _mm256_sub_epi32(x4[63], x4[56]);
3486 12599000 : x5[63] = _mm256_add_epi32(x4[63], x4[56]);
3487 12599000 : x5[57] = _mm256_sub_epi32(x4[62], x4[57]);
3488 12599000 : x5[62] = _mm256_add_epi32(x4[62], x4[57]);
3489 12599000 : x5[58] = _mm256_sub_epi32(x4[61], x4[58]);
3490 12599000 : x5[61] = _mm256_add_epi32(x4[61], x4[58]);
3491 12599000 : x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
3492 12599000 : x5[60] = _mm256_add_epi32(x4[60], x4[59]);
3493 :
3494 : // stage 6
3495 : __m256i x6[64];
3496 125990000 : btf_32_type0_avx2_new(cospi_p32, cospi_p32, x5[0], x5[1],
3497 : x6[0], x6[1], __rounding, cos_bit);
3498 125990000 : btf_32_type1_avx2_new(cospi_p48, cospi_p16, x5[2], x5[3],
3499 : x6[2], x6[3], __rounding, cos_bit);
3500 12599000 : x6[4] = _mm256_add_epi32(x5[4], x5[5]);
3501 12599000 : x6[5] = _mm256_sub_epi32(x5[4], x5[5]);
3502 12599000 : x6[6] = _mm256_sub_epi32(x5[7], x5[6]);
3503 12599000 : x6[7] = _mm256_add_epi32(x5[7], x5[6]);
3504 12599000 : x6[8] = x5[8];
3505 125990000 : btf_32_type0_avx2_new(cospi_m16, cospi_p48, x5[9], x5[14],
3506 : x6[9], x6[14], __rounding, cos_bit);
3507 125990000 : btf_32_type0_avx2_new(cospi_m48, cospi_m16, x5[10], x5[13],
3508 : x6[10], x6[13], __rounding, cos_bit);
3509 12599000 : x6[11] = x5[11];
3510 12599000 : x6[12] = x5[12];
3511 12599000 : x6[15] = x5[15];
3512 12599000 : x6[16] = _mm256_add_epi32(x5[16], x5[19]);
3513 12599000 : x6[19] = _mm256_sub_epi32(x5[16], x5[19]);
3514 12599000 : x6[17] = _mm256_add_epi32(x5[17], x5[18]);
3515 12599000 : x6[18] = _mm256_sub_epi32(x5[17], x5[18]);
3516 12599000 : x6[20] = _mm256_sub_epi32(x5[23], x5[20]);
3517 12599000 : x6[23] = _mm256_add_epi32(x5[23], x5[20]);
3518 12599000 : x6[21] = _mm256_sub_epi32(x5[22], x5[21]);
3519 12599000 : x6[22] = _mm256_add_epi32(x5[22], x5[21]);
3520 12599000 : x6[24] = _mm256_add_epi32(x5[24], x5[27]);
3521 12599000 : x6[27] = _mm256_sub_epi32(x5[24], x5[27]);
3522 12599000 : x6[25] = _mm256_add_epi32(x5[25], x5[26]);
3523 12599000 : x6[26] = _mm256_sub_epi32(x5[25], x5[26]);
3524 12599000 : x6[28] = _mm256_sub_epi32(x5[31], x5[28]);
3525 12599000 : x6[31] = _mm256_add_epi32(x5[31], x5[28]);
3526 12599000 : x6[29] = _mm256_sub_epi32(x5[30], x5[29]);
3527 12599000 : x6[30] = _mm256_add_epi32(x5[30], x5[29]);
3528 12599000 : x6[32] = x5[32];
3529 12599000 : x6[33] = x5[33];
3530 125990000 : btf_32_type0_avx2_new(cospi_m08, cospi_p56, x5[34], x5[61],
3531 : x6[34], x6[61], __rounding, cos_bit);
3532 125990000 : btf_32_type0_avx2_new(cospi_m08, cospi_p56, x5[35], x5[60],
3533 : x6[35], x6[60], __rounding, cos_bit);
3534 125990000 : btf_32_type0_avx2_new(cospi_m56, cospi_m08, x5[36], x5[59],
3535 : x6[36], x6[59], __rounding, cos_bit);
3536 125990000 : btf_32_type0_avx2_new(cospi_m56, cospi_m08, x5[37], x5[58],
3537 : x6[37], x6[58], __rounding, cos_bit);
3538 12599000 : x6[38] = x5[38];
3539 12599000 : x6[39] = x5[39];
3540 12599000 : x6[40] = x5[40];
3541 12599000 : x6[41] = x5[41];
3542 125990000 : btf_32_type0_avx2_new(cospi_m40, cospi_p24, x5[42], x5[53],
3543 : x6[42], x6[53], __rounding, cos_bit);
3544 125990000 : btf_32_type0_avx2_new(cospi_m40, cospi_p24, x5[43], x5[52],
3545 : x6[43], x6[52], __rounding, cos_bit);
3546 125990000 : btf_32_type0_avx2_new(cospi_m24, cospi_m40, x5[44], x5[51],
3547 : x6[44], x6[51], __rounding, cos_bit);
3548 125990000 : btf_32_type0_avx2_new(cospi_m24, cospi_m40, x5[45], x5[50],
3549 : x6[45], x6[50], __rounding, cos_bit);
3550 12599000 : x6[46] = x5[46];
3551 12599000 : x6[47] = x5[47];
3552 12599000 : x6[48] = x5[48];
3553 12599000 : x6[49] = x5[49];
3554 12599000 : x6[54] = x5[54];
3555 12599000 : x6[55] = x5[55];
3556 12599000 : x6[56] = x5[56];
3557 12599000 : x6[57] = x5[57];
3558 12599000 : x6[62] = x5[62];
3559 12599000 : x6[63] = x5[63];
3560 :
3561 : // stage 7
3562 : __m256i x7[64];
3563 12599000 : x7[0] = x6[0];
3564 12599000 : x7[1] = x6[1];
3565 12599000 : x7[2] = x6[2];
3566 12599000 : x7[3] = x6[3];
3567 125990000 : btf_32_type1_avx2_new(cospi_p56, cospi_p08, x6[4], x6[7],
3568 : x7[4], x7[7], __rounding, cos_bit);
3569 125990000 : btf_32_type1_avx2_new(cospi_p24, cospi_p40, x6[5], x6[6],
3570 : x7[5], x7[6], __rounding, cos_bit);
3571 12599000 : x7[8] = _mm256_add_epi32(x6[8], x6[9]);
3572 12599000 : x7[9] = _mm256_sub_epi32(x6[8], x6[9]);
3573 12599000 : x7[10] = _mm256_sub_epi32(x6[11], x6[10]);
3574 12599000 : x7[11] = _mm256_add_epi32(x6[11], x6[10]);
3575 12599000 : x7[12] = _mm256_add_epi32(x6[12], x6[13]);
3576 12599000 : x7[13] = _mm256_sub_epi32(x6[12], x6[13]);
3577 12599000 : x7[14] = _mm256_sub_epi32(x6[15], x6[14]);
3578 12599000 : x7[15] = _mm256_add_epi32(x6[15], x6[14]);
3579 12599000 : x7[16] = x6[16];
3580 125990000 : btf_32_type0_avx2_new(cospi_m08, cospi_p56, x6[17], x6[30],
3581 : x7[17], x7[30], __rounding, cos_bit);
3582 125990000 : btf_32_type0_avx2_new(cospi_m56, cospi_m08, x6[18], x6[29],
3583 : x7[18], x7[29], __rounding, cos_bit);
3584 12599000 : x7[19] = x6[19];
3585 12599000 : x7[20] = x6[20];
3586 125990000 : btf_32_type0_avx2_new(cospi_m40, cospi_p24, x6[21], x6[26],
3587 : x7[21], x7[26], __rounding, cos_bit);
3588 125990000 : btf_32_type0_avx2_new(cospi_m24, cospi_m40, x6[22], x6[25],
3589 : x7[22], x7[25], __rounding, cos_bit);
3590 12599000 : x7[23] = x6[23];
3591 12599000 : x7[24] = x6[24];
3592 12599000 : x7[27] = x6[27];
3593 12599000 : x7[28] = x6[28];
3594 12599000 : x7[31] = x6[31];
3595 12599000 : x7[32] = _mm256_add_epi32(x6[32], x6[35]);
3596 12599000 : x7[35] = _mm256_sub_epi32(x6[32], x6[35]);
3597 12599000 : x7[33] = _mm256_add_epi32(x6[33], x6[34]);
3598 12599000 : x7[34] = _mm256_sub_epi32(x6[33], x6[34]);
3599 12599000 : x7[36] = _mm256_sub_epi32(x6[39], x6[36]);
3600 12599000 : x7[39] = _mm256_add_epi32(x6[39], x6[36]);
3601 12599000 : x7[37] = _mm256_sub_epi32(x6[38], x6[37]);
3602 12599000 : x7[38] = _mm256_add_epi32(x6[38], x6[37]);
3603 12599000 : x7[40] = _mm256_add_epi32(x6[40], x6[43]);
3604 12599000 : x7[43] = _mm256_sub_epi32(x6[40], x6[43]);
3605 12599000 : x7[41] = _mm256_add_epi32(x6[41], x6[42]);
3606 12599000 : x7[42] = _mm256_sub_epi32(x6[41], x6[42]);
3607 12599000 : x7[44] = _mm256_sub_epi32(x6[47], x6[44]);
3608 12599000 : x7[47] = _mm256_add_epi32(x6[47], x6[44]);
3609 12599000 : x7[45] = _mm256_sub_epi32(x6[46], x6[45]);
3610 12599000 : x7[46] = _mm256_add_epi32(x6[46], x6[45]);
3611 12599000 : x7[48] = _mm256_add_epi32(x6[48], x6[51]);
3612 12599000 : x7[51] = _mm256_sub_epi32(x6[48], x6[51]);
3613 12599000 : x7[49] = _mm256_add_epi32(x6[49], x6[50]);
3614 12599000 : x7[50] = _mm256_sub_epi32(x6[49], x6[50]);
3615 12599000 : x7[52] = _mm256_sub_epi32(x6[55], x6[52]);
3616 12599000 : x7[55] = _mm256_add_epi32(x6[55], x6[52]);
3617 12599000 : x7[53] = _mm256_sub_epi32(x6[54], x6[53]);
3618 12599000 : x7[54] = _mm256_add_epi32(x6[54], x6[53]);
3619 12599000 : x7[56] = _mm256_add_epi32(x6[56], x6[59]);
3620 12599000 : x7[59] = _mm256_sub_epi32(x6[56], x6[59]);
3621 12599000 : x7[57] = _mm256_add_epi32(x6[57], x6[58]);
3622 12599000 : x7[58] = _mm256_sub_epi32(x6[57], x6[58]);
3623 12599000 : x7[60] = _mm256_sub_epi32(x6[63], x6[60]);
3624 12599000 : x7[63] = _mm256_add_epi32(x6[63], x6[60]);
3625 12599000 : x7[61] = _mm256_sub_epi32(x6[62], x6[61]);
3626 12599000 : x7[62] = _mm256_add_epi32(x6[62], x6[61]);
3627 :
3628 : // stage 8
3629 : __m256i x8[64];
3630 12599000 : x8[0] = x7[0];
3631 12599000 : x8[1] = x7[1];
3632 12599000 : x8[2] = x7[2];
3633 12599000 : x8[3] = x7[3];
3634 12599000 : x8[4] = x7[4];
3635 12599000 : x8[5] = x7[5];
3636 12599000 : x8[6] = x7[6];
3637 12599000 : x8[7] = x7[7];
3638 :
3639 125990000 : btf_32_type1_avx2_new(cospi_p60, cospi_p04, x7[8], x7[15],
3640 : x8[8], x8[15], __rounding, cos_bit);
3641 125990000 : btf_32_type1_avx2_new(cospi_p28, cospi_p36, x7[9], x7[14],
3642 : x8[9], x8[14], __rounding, cos_bit);
3643 125990000 : btf_32_type1_avx2_new(cospi_p44, cospi_p20, x7[10], x7[13],
3644 : x8[10], x8[13], __rounding, cos_bit);
3645 125990000 : btf_32_type1_avx2_new(cospi_p12, cospi_p52, x7[11], x7[12],
3646 : x8[11], x8[12], __rounding, cos_bit);
3647 12599000 : x8[16] = _mm256_add_epi32(x7[16], x7[17]);
3648 12599000 : x8[17] = _mm256_sub_epi32(x7[16], x7[17]);
3649 12599000 : x8[18] = _mm256_sub_epi32(x7[19], x7[18]);
3650 12599000 : x8[19] = _mm256_add_epi32(x7[19], x7[18]);
3651 12599000 : x8[20] = _mm256_add_epi32(x7[20], x7[21]);
3652 12599000 : x8[21] = _mm256_sub_epi32(x7[20], x7[21]);
3653 12599000 : x8[22] = _mm256_sub_epi32(x7[23], x7[22]);
3654 12599000 : x8[23] = _mm256_add_epi32(x7[23], x7[22]);
3655 12599000 : x8[24] = _mm256_add_epi32(x7[24], x7[25]);
3656 12599000 : x8[25] = _mm256_sub_epi32(x7[24], x7[25]);
3657 12599000 : x8[26] = _mm256_sub_epi32(x7[27], x7[26]);
3658 12599000 : x8[27] = _mm256_add_epi32(x7[27], x7[26]);
3659 12599000 : x8[28] = _mm256_add_epi32(x7[28], x7[29]);
3660 12599000 : x8[29] = _mm256_sub_epi32(x7[28], x7[29]);
3661 12599000 : x8[30] = _mm256_sub_epi32(x7[31], x7[30]);
3662 12599000 : x8[31] = _mm256_add_epi32(x7[31], x7[30]);
3663 12599000 : x8[32] = x7[32];
3664 125990000 : btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62],
3665 : x8[33], x8[62], __rounding, cos_bit);
3666 125990000 : btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61],
3667 : x8[34], x8[61], __rounding, cos_bit);
3668 12599000 : x8[35] = x7[35];
3669 12599000 : x8[36] = x7[36];
3670 125990000 : btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58],
3671 : x8[37], x8[58], __rounding, cos_bit);
3672 125990000 : btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57],
3673 : x8[38], x8[57], __rounding, cos_bit);
3674 12599000 : x8[39] = x7[39];
3675 12599000 : x8[40] = x7[40];
3676 125990000 : btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54],
3677 : x8[41], x8[54], __rounding, cos_bit);
3678 125990000 : btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53],
3679 : x8[42], x8[53], __rounding, cos_bit);
3680 12599000 : x8[43] = x7[43];
3681 12599000 : x8[44] = x7[44];
3682 125990000 : btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50],
3683 : x8[45], x8[50], __rounding, cos_bit);
3684 125990000 : btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49],
3685 : x8[46], x8[49], __rounding, cos_bit);
3686 12599000 : x8[47] = x7[47];
3687 12599000 : x8[48] = x7[48];
3688 12599000 : x8[51] = x7[51];
3689 12599000 : x8[52] = x7[52];
3690 12599000 : x8[55] = x7[55];
3691 12599000 : x8[56] = x7[56];
3692 12599000 : x8[59] = x7[59];
3693 12599000 : x8[60] = x7[60];
3694 12599000 : x8[63] = x7[63];
3695 :
3696 : // stage 9
3697 : __m256i x9[64];
3698 12599000 : x9[0] = x8[0];
3699 12599000 : x9[1] = x8[1];
3700 12599000 : x9[2] = x8[2];
3701 12599000 : x9[3] = x8[3];
3702 12599000 : x9[4] = x8[4];
3703 12599000 : x9[5] = x8[5];
3704 12599000 : x9[6] = x8[6];
3705 12599000 : x9[7] = x8[7];
3706 12599000 : x9[8] = x8[8];
3707 12599000 : x9[9] = x8[9];
3708 12599000 : x9[10] = x8[10];
3709 12599000 : x9[11] = x8[11];
3710 12599000 : x9[12] = x8[12];
3711 12599000 : x9[13] = x8[13];
3712 12599000 : x9[14] = x8[14];
3713 12599000 : x9[15] = x8[15];
3714 125990000 : btf_32_type1_avx2_new(cospi_p62, cospi_p02, x8[16], x8[31],
3715 : x9[16], x9[31], __rounding, cos_bit);
3716 125990000 : btf_32_type1_avx2_new(cospi_p30, cospi_p34, x8[17], x8[30],
3717 : x9[17], x9[30], __rounding, cos_bit);
3718 125990000 : btf_32_type1_avx2_new(cospi_p46, cospi_p18, x8[18], x8[29],
3719 : x9[18], x9[29], __rounding, cos_bit);
3720 125990000 : btf_32_type1_avx2_new(cospi_p14, cospi_p50, x8[19], x8[28],
3721 : x9[19], x9[28], __rounding, cos_bit);
3722 125990000 : btf_32_type1_avx2_new(cospi_p54, cospi_p10, x8[20], x8[27],
3723 : x9[20], x9[27], __rounding, cos_bit);
3724 125990000 : btf_32_type1_avx2_new(cospi_p22, cospi_p42, x8[21], x8[26],
3725 : x9[21], x9[26], __rounding, cos_bit);
3726 125990000 : btf_32_type1_avx2_new(cospi_p38, cospi_p26, x8[22], x8[25],
3727 : x9[22], x9[25], __rounding, cos_bit);
3728 125990000 : btf_32_type1_avx2_new(cospi_p06, cospi_p58, x8[23], x8[24],
3729 : x9[23], x9[24], __rounding, cos_bit);
3730 12599000 : x9[32] = _mm256_add_epi32(x8[32], x8[33]);
3731 12599000 : x9[33] = _mm256_sub_epi32(x8[32], x8[33]);
3732 12599000 : x9[34] = _mm256_sub_epi32(x8[35], x8[34]);
3733 12599000 : x9[35] = _mm256_add_epi32(x8[35], x8[34]);
3734 12599000 : x9[36] = _mm256_add_epi32(x8[36], x8[37]);
3735 12599000 : x9[37] = _mm256_sub_epi32(x8[36], x8[37]);
3736 12599000 : x9[38] = _mm256_sub_epi32(x8[39], x8[38]);
3737 12599000 : x9[39] = _mm256_add_epi32(x8[39], x8[38]);
3738 12599000 : x9[40] = _mm256_add_epi32(x8[40], x8[41]);
3739 12599000 : x9[41] = _mm256_sub_epi32(x8[40], x8[41]);
3740 12599000 : x9[42] = _mm256_sub_epi32(x8[43], x8[42]);
3741 12599000 : x9[43] = _mm256_add_epi32(x8[43], x8[42]);
3742 12599000 : x9[44] = _mm256_add_epi32(x8[44], x8[45]);
3743 12599000 : x9[45] = _mm256_sub_epi32(x8[44], x8[45]);
3744 12599000 : x9[46] = _mm256_sub_epi32(x8[47], x8[46]);
3745 12599000 : x9[47] = _mm256_add_epi32(x8[47], x8[46]);
3746 12599000 : x9[48] = _mm256_add_epi32(x8[48], x8[49]);
3747 12599000 : x9[49] = _mm256_sub_epi32(x8[48], x8[49]);
3748 12599000 : x9[50] = _mm256_sub_epi32(x8[51], x8[50]);
3749 12599000 : x9[51] = _mm256_add_epi32(x8[51], x8[50]);
3750 12599000 : x9[52] = _mm256_add_epi32(x8[52], x8[53]);
3751 12599000 : x9[53] = _mm256_sub_epi32(x8[52], x8[53]);
3752 12599000 : x9[54] = _mm256_sub_epi32(x8[55], x8[54]);
3753 12599000 : x9[55] = _mm256_add_epi32(x8[55], x8[54]);
3754 12599000 : x9[56] = _mm256_add_epi32(x8[56], x8[57]);
3755 12599000 : x9[57] = _mm256_sub_epi32(x8[56], x8[57]);
3756 12599000 : x9[58] = _mm256_sub_epi32(x8[59], x8[58]);
3757 12599000 : x9[59] = _mm256_add_epi32(x8[59], x8[58]);
3758 12599000 : x9[60] = _mm256_add_epi32(x8[60], x8[61]);
3759 12599000 : x9[61] = _mm256_sub_epi32(x8[60], x8[61]);
3760 12599000 : x9[62] = _mm256_sub_epi32(x8[63], x8[62]);
3761 12599000 : x9[63] = _mm256_add_epi32(x8[63], x8[62]);
3762 :
3763 : // stage 10
3764 : __m256i x10[64];
3765 12599000 : out[0 * stride] = x9[0];
3766 12599000 : out[32 * stride] = x9[1];
3767 12599000 : out[16 * stride] = x9[2];
3768 12599000 : out[48 * stride] = x9[3];
3769 12599000 : out[8 * stride] = x9[4];
3770 12599000 : out[40 * stride] = x9[5];
3771 12599000 : out[24 * stride] = x9[6];
3772 12599000 : out[56 * stride] = x9[7];
3773 12599000 : out[4 * stride] = x9[8];
3774 12599000 : out[36 * stride] = x9[9];
3775 12599000 : out[20 * stride] = x9[10];
3776 12599000 : out[52 * stride] = x9[11];
3777 12599000 : out[12 * stride] = x9[12];
3778 12599000 : out[44 * stride] = x9[13];
3779 12599000 : out[28 * stride] = x9[14];
3780 12599000 : out[60 * stride] = x9[15];
3781 12599000 : out[2 * stride] = x9[16];
3782 12599000 : out[34 * stride] = x9[17];
3783 12599000 : out[18 * stride] = x9[18];
3784 12599000 : out[50 * stride] = x9[19];
3785 12599000 : out[10 * stride] = x9[20];
3786 12599000 : out[42 * stride] = x9[21];
3787 12599000 : out[26 * stride] = x9[22];
3788 12599000 : out[58 * stride] = x9[23];
3789 12599000 : out[6 * stride] = x9[24];
3790 12599000 : out[38 * stride] = x9[25];
3791 12599000 : out[22 * stride] = x9[26];
3792 12599000 : out[54 * stride] = x9[27];
3793 12599000 : out[14 * stride] = x9[28];
3794 12599000 : out[46 * stride] = x9[29];
3795 12599000 : out[30 * stride] = x9[30];
3796 12599000 : out[62 * stride] = x9[31];
3797 125990000 : btf_32_type1_avx2_new(cospi_p63, cospi_p01, x9[32], x9[63],
3798 : x10[32], x10[63], __rounding, cos_bit);
3799 125990000 : btf_32_type1_avx2_new(cospi_p31, cospi_p33, x9[33], x9[62],
3800 : x10[33], x10[62], __rounding, cos_bit);
3801 125990000 : btf_32_type1_avx2_new(cospi_p47, cospi_p17, x9[34], x9[61],
3802 : x10[34], x10[61], __rounding, cos_bit);
3803 125990000 : btf_32_type1_avx2_new(cospi_p15, cospi_p49, x9[35], x9[60],
3804 : x10[35], x10[60], __rounding, cos_bit);
3805 125990000 : btf_32_type1_avx2_new(cospi_p55, cospi_p09, x9[36], x9[59],
3806 : x10[36], x10[59], __rounding, cos_bit);
3807 125990000 : btf_32_type1_avx2_new(cospi_p23, cospi_p41, x9[37], x9[58],
3808 : x10[37], x10[58], __rounding, cos_bit);
3809 125990000 : btf_32_type1_avx2_new(cospi_p39, cospi_p25, x9[38], x9[57],
3810 : x10[38], x10[57], __rounding, cos_bit);
3811 125990000 : btf_32_type1_avx2_new(cospi_p07, cospi_p57, x9[39], x9[56],
3812 : x10[39], x10[56], __rounding, cos_bit);
3813 125990000 : btf_32_type1_avx2_new(cospi_p59, cospi_p05, x9[40], x9[55],
3814 : x10[40], x10[55], __rounding, cos_bit);
3815 125990000 : btf_32_type1_avx2_new(cospi_p27, cospi_p37, x9[41], x9[54],
3816 : x10[41], x10[54], __rounding, cos_bit);
3817 125990000 : btf_32_type1_avx2_new(cospi_p43, cospi_p21, x9[42], x9[53],
3818 : x10[42], x10[53], __rounding, cos_bit);
3819 125990000 : btf_32_type1_avx2_new(cospi_p11, cospi_p53, x9[43], x9[52],
3820 : x10[43], x10[52], __rounding, cos_bit);
3821 125990000 : btf_32_type1_avx2_new(cospi_p51, cospi_p13, x9[44], x9[51],
3822 : x10[44], x10[51], __rounding, cos_bit);
3823 125990000 : btf_32_type1_avx2_new(cospi_p19, cospi_p45, x9[45], x9[50],
3824 : x10[45], x10[50], __rounding, cos_bit);
3825 125990000 : btf_32_type1_avx2_new(cospi_p35, cospi_p29, x9[46], x9[49],
3826 : x10[46], x10[49], __rounding, cos_bit);
3827 125990000 : btf_32_type1_avx2_new(cospi_p03, cospi_p61, x9[47], x9[48],
3828 : x10[47], x10[48], __rounding, cos_bit);
3829 :
3830 : // stage 11
3831 12599000 : out[1 * stride] = x10[32];
3832 12599000 : out[3 * stride] = x10[48];
3833 12599000 : out[5 * stride] = x10[40];
3834 12599000 : out[7 * stride] = x10[56];
3835 12599000 : out[9 * stride] = x10[36];
3836 12599000 : out[11 * stride] = x10[52];
3837 12599000 : out[13 * stride] = x10[44];
3838 12599000 : out[15 * stride] = x10[60];
3839 12599000 : out[17 * stride] = x10[34];
3840 12599000 : out[19 * stride] = x10[50];
3841 12599000 : out[21 * stride] = x10[42];
3842 12599000 : out[23 * stride] = x10[58];
3843 12599000 : out[25 * stride] = x10[38];
3844 12599000 : out[27 * stride] = x10[54];
3845 12599000 : out[29 * stride] = x10[46];
3846 12599000 : out[31 * stride] = x10[62];
3847 12599000 : out[33 * stride] = x10[33];
3848 12599000 : out[35 * stride] = x10[49];
3849 12599000 : out[37 * stride] = x10[41];
3850 12599000 : out[39 * stride] = x10[57];
3851 12599000 : out[41 * stride] = x10[37];
3852 12599000 : out[43 * stride] = x10[53];
3853 12599000 : out[45 * stride] = x10[45];
3854 12599000 : out[47 * stride] = x10[61];
3855 12599000 : out[49 * stride] = x10[35];
3856 12599000 : out[51 * stride] = x10[51];
3857 12599000 : out[53 * stride] = x10[43];
3858 12599000 : out[55 * stride] = x10[59];
3859 12599000 : out[57 * stride] = x10[39];
3860 12599000 : out[59 * stride] = x10[55];
3861 12599000 : out[61 * stride] = x10[47];
3862 12599000 : out[63 * stride] = x10[63];
3863 : }
3864 3344570 : }
3865 :
3866 : typedef void(*TxfmFuncAVX2)(const __m256i *input, __m256i *output,
3867 : const int8_t cos_bit, const int8_t *stage_range);
3868 :
3869 3245160 : static INLINE void fdct32x32_avx2(const __m256i *input, __m256i *output,
3870 : const int8_t cos_bit, const int8_t *stage_range) {
3871 3245160 : const int32_t txfm_size = 32;
3872 3245160 : const int32_t num_per_256 = 8;
3873 3245160 : int32_t col_num = txfm_size / num_per_256;
3874 : (void)stage_range;
3875 3245160 : av1_fdct32_new_avx2(input, output, cos_bit, txfm_size, col_num);
3876 3245290 : }
3877 :
3878 628650 : static INLINE void fdct64x64_avx2(const __m256i *input, __m256i *output,
3879 : const int8_t cos_bit) {
3880 628650 : const int32_t txfm_size = 64;
3881 628650 : const int32_t num_per_256 = 8;
3882 628650 : int32_t col_num = txfm_size / num_per_256;
3883 628650 : av1_fdct64_new_avx2(input, output, cos_bit, txfm_size, col_num);
3884 628655 : }
3885 :
3886 1827500 : static INLINE void fidtx4x8_row_avx2(__m256i *input, __m256i *output, int32_t bit, int32_t col_num) {
3887 : (void)bit;
3888 : __m256i in[4];
3889 : __m256i out[4];
3890 1827500 : __m256i fact = _mm256_set1_epi32(NewSqrt2);
3891 1827500 : __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
3892 : __m256i a_low;
3893 : __m256i v[4];
3894 :
3895 1827500 : in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
3896 1827500 : in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
3897 1827500 : in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
3898 1827500 : in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
3899 :
3900 9137420 : for (int32_t i = 0; i < 4; i++) {
3901 14619800 : a_low = _mm256_mullo_epi32(in[i * col_num], fact);
3902 7309920 : a_low = _mm256_add_epi32(a_low, offset);
3903 14619800 : out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits);
3904 : }
3905 :
3906 : // Transpose for 4x4
3907 1827500 : v[0] = _mm256_unpacklo_epi32(out[0], out[1]);
3908 1827500 : v[1] = _mm256_unpackhi_epi32(out[0], out[1]);
3909 1827500 : v[2] = _mm256_unpacklo_epi32(out[2], out[3]);
3910 1827500 : v[3] = _mm256_unpackhi_epi32(out[2], out[3]);
3911 :
3912 1827500 : out[0] = _mm256_unpacklo_epi64(v[0], v[2]);
3913 1827500 : out[1] = _mm256_unpackhi_epi64(v[0], v[2]);
3914 1827500 : out[2] = _mm256_unpacklo_epi64(v[1], v[3]);
3915 1827500 : out[3] = _mm256_unpackhi_epi64(v[1], v[3]);
3916 :
3917 1827500 : output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
3918 1827500 : output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
3919 1827500 : output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
3920 1827500 : output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
3921 1827500 : }
3922 :
3923 1734870 : static INLINE void fidtx4x8_col_avx2(__m256i *in, __m256i *output, int32_t bit, int32_t col_num) {
3924 : (void)bit;
3925 : __m256i out[4];
3926 1734870 : __m256i fact = _mm256_set1_epi32(NewSqrt2);
3927 1734870 : __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
3928 : __m256i a_low;
3929 : __m256i v[4];
3930 :
3931 8674210 : for (int32_t i = 0; i < 4; i++) {
3932 13878700 : a_low = _mm256_mullo_epi32(in[i * col_num], fact);
3933 6939340 : a_low = _mm256_add_epi32(a_low, offset);
3934 13878700 : out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits);
3935 : }
3936 :
3937 : // Transpose for 4x4
3938 1734870 : v[0] = _mm256_unpacklo_epi32(out[0], out[1]);
3939 1734870 : v[1] = _mm256_unpackhi_epi32(out[0], out[1]);
3940 1734870 : v[2] = _mm256_unpacklo_epi32(out[2], out[3]);
3941 1734870 : v[3] = _mm256_unpackhi_epi32(out[2], out[3]);
3942 :
3943 1734870 : out[0] = _mm256_unpacklo_epi64(v[0], v[2]);
3944 1734870 : out[1] = _mm256_unpackhi_epi64(v[0], v[2]);
3945 1734870 : out[2] = _mm256_unpacklo_epi64(v[1], v[3]);
3946 1734870 : out[3] = _mm256_unpackhi_epi64(v[1], v[3]);
3947 :
3948 1734870 : output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
3949 1734870 : output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
3950 1734870 : output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
3951 1734870 : output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
3952 1734870 : }
3953 :
3954 1038570 : static INLINE void fidtx8x4_avx2(__m256i *in, __m256i *out, int32_t bit) {
3955 : (void)bit;
3956 :
3957 1038570 : out[0] = _mm256_add_epi32(in[0], in[0]);
3958 1038570 : out[1] = _mm256_add_epi32(in[1], in[1]);
3959 1038570 : out[2] = _mm256_add_epi32(in[2], in[2]);
3960 1038570 : out[3] = _mm256_add_epi32(in[3], in[3]);
3961 1038570 : }
3962 :
3963 1249020 : void av1_idtx32_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
3964 : const int32_t col_num) {
3965 : (void)cos_bit;
3966 41214400 : for (int32_t i = 0; i < 32; i++)
3967 79930700 : output[i * col_num] = _mm256_slli_epi32(input[i * col_num], 2);
3968 1249020 : }
3969 :
3970 182030 : static void fidtx32x32_avx2(const __m256i *input, __m256i *output,
3971 : const int8_t cos_bit, const int8_t *stage_range) {
3972 : (void)stage_range;
3973 :
3974 910129 : for (int32_t i = 0; i < 4; i++)
3975 728099 : av1_idtx32_new_avx2(&input[i * 32], &output[i * 32], cos_bit, 1);
3976 182030 : }
3977 :
3978 701042 : static void fidtx32x8_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
3979 : (void)bit;
3980 : (void)col_num;
3981 701042 : out[4 * 0] = _mm256_slli_epi32(in[4 * 0], 1);
3982 701042 : out[4 * 1] = _mm256_slli_epi32(in[4 * 1], 1);
3983 701042 : out[4 * 2] = _mm256_slli_epi32(in[4 * 2], 1);
3984 701042 : out[4 * 3] = _mm256_slli_epi32(in[4 * 3], 1);
3985 701042 : out[4 * 4] = _mm256_slli_epi32(in[4 * 4], 1);
3986 701042 : out[4 * 5] = _mm256_slli_epi32(in[4 * 5], 1);
3987 701042 : out[4 * 6] = _mm256_slli_epi32(in[4 * 6], 1);
3988 701042 : out[4 * 7] = _mm256_slli_epi32(in[4 * 7], 1);
3989 701042 : }
3990 :
3991 0 : static void fidtx64x64_avx2(const __m256i *input, __m256i *output) {
3992 0 : const int32_t bits = 12; // NewSqrt2Bits = 12
3993 0 : const int32_t sqrt = 4 * 5793; // 4 * NewSqrt2
3994 0 : const int32_t col_num = 8;
3995 0 : const __m256i newsqrt = _mm256_set1_epi32(sqrt);
3996 0 : const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
3997 :
3998 : __m256i temp;
3999 0 : int32_t num_iters = 64 * col_num;
4000 0 : for (int32_t i = 0; i < num_iters; i++) {
4001 0 : temp = _mm256_mullo_epi32(input[i], newsqrt);
4002 0 : temp = _mm256_add_epi32(temp, rounding);
4003 0 : output[i] = _mm256_srai_epi32(temp, bits);
4004 : }
4005 0 : }
4006 :
4007 3427250 : static INLINE TxfmFuncAVX2 fwd_txfm_type_to_func(TxfmType TxfmType) {
4008 3427250 : switch (TxfmType) {
4009 3245250 : case TXFM_TYPE_DCT32: return fdct32x32_avx2; break;
4010 182027 : case TXFM_TYPE_IDENTITY32: return fidtx32x32_avx2; break;
4011 0 : default: assert(0);
4012 : }
4013 : return NULL;
4014 : }
4015 :
4016 1713650 : static INLINE void load_buffer_32x32_avx2(const int16_t *input,
4017 : __m256i *output, int32_t stride) {
4018 : __m128i temp[4];
4019 : int32_t i;
4020 :
4021 56527200 : for (i = 0; i < 32; ++i) {
4022 54813600 : temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
4023 54813600 : temp[1] = _mm_load_si128((const __m128i *)(input + 1 * 8));
4024 54813600 : temp[2] = _mm_load_si128((const __m128i *)(input + 2 * 8));
4025 54813600 : temp[3] = _mm_load_si128((const __m128i *)(input + 3 * 8));
4026 :
4027 54813600 : output[0] = _mm256_cvtepi16_epi32(temp[0]);
4028 54813600 : output[1] = _mm256_cvtepi16_epi32(temp[1]);
4029 54813600 : output[2] = _mm256_cvtepi16_epi32(temp[2]);
4030 54813600 : output[3] = _mm256_cvtepi16_epi32(temp[3]);
4031 54813600 : input += stride;
4032 54813600 : output += 4;
4033 : }
4034 1713650 : }
4035 :
4036 1713640 : static INLINE void fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
4037 : const int32_t stride,
4038 : const Txfm2DFlipCfg *cfg,
4039 : int32_t *txfm_buf) {
4040 1713640 : assert(cfg->tx_size < TX_SIZES);
4041 1713640 : const int32_t txfm_size = tx_size_wide[cfg->tx_size];
4042 1713640 : const int8_t *shift = cfg->shift;
4043 1713640 : const int8_t *stage_range_col = cfg->stage_range_col;
4044 1713640 : const int8_t *stage_range_row = cfg->stage_range_row;
4045 1713640 : const int8_t cos_bit_col = cfg->cos_bit_col;
4046 1713640 : const int8_t cos_bit_row = cfg->cos_bit_row;
4047 1713640 : const TxfmFuncAVX2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
4048 1713640 : const TxfmFuncAVX2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
4049 : ASSERT(txfm_func_col);
4050 : ASSERT(txfm_func_row);
4051 1713660 : __m256i *buf_256 = (__m256i *)txfm_buf;
4052 1713660 : __m256i *out_256 = (__m256i *)output;
4053 1713660 : int32_t num_per_256 = 8;
4054 1713660 : int32_t txfm2d_size_256 = txfm_size * txfm_size / num_per_256;
4055 :
4056 1713660 : load_buffer_32x32_avx2(input, buf_256, stride);
4057 1713720 : av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256, -shift[0]);
4058 1713690 : txfm_func_col(out_256, buf_256, cos_bit_col, stage_range_col);
4059 1713710 : av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256, -shift[1]);
4060 1713710 : transpose_32_avx2(txfm_size, out_256, buf_256);
4061 1713710 : txfm_func_row(buf_256, out_256, cos_bit_row, stage_range_row);
4062 1713700 : av1_round_shift_array_32_avx2(out_256, buf_256, txfm2d_size_256, -shift[2]);
4063 1713700 : transpose_32_avx2(txfm_size, buf_256, out_256);
4064 1713710 : }
4065 :
4066 1713660 : void eb_av1_fwd_txfm2d_32x32_avx2(int16_t *input, int32_t *output,
4067 : uint32_t stride, TxType tx_type, uint8_t bd)
4068 : {
4069 : DECLARE_ALIGNED(32, int32_t, txfm_buf[1024]);
4070 : Txfm2DFlipCfg cfg;
4071 1713660 : Av1TransformConfig(tx_type, TX_32X32, &cfg);
4072 : (void)bd;
4073 1713660 : fwd_txfm2d_32x32_avx2(input, output, stride, &cfg, txfm_buf);
4074 1713700 : }
4075 :
4076 314330 : static INLINE void load_buffer_64x64_avx2(const int16_t *input,
4077 : int32_t stride, __m256i *output) {
4078 : __m128i x0, x1, x2, x3, x4, x5, x6, x7;
4079 : __m256i v0, v1, v2, v3, v4, v5, v6, v7;
4080 : int32_t i;
4081 :
4082 20413100 : for (i = 0; i < 64; ++i) {
4083 20098800 : x0 = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
4084 20098800 : x1 = _mm_loadu_si128((const __m128i *)(input + 1 * 8));
4085 20098800 : x2 = _mm_loadu_si128((const __m128i *)(input + 2 * 8));
4086 20098800 : x3 = _mm_loadu_si128((const __m128i *)(input + 3 * 8));
4087 20098800 : x4 = _mm_loadu_si128((const __m128i *)(input + 4 * 8));
4088 20098800 : x5 = _mm_loadu_si128((const __m128i *)(input + 5 * 8));
4089 20098800 : x6 = _mm_loadu_si128((const __m128i *)(input + 6 * 8));
4090 40197500 : x7 = _mm_loadu_si128((const __m128i *)(input + 7 * 8));
4091 :
4092 20098800 : v0 = _mm256_cvtepi16_epi32(x0);
4093 20098800 : v1 = _mm256_cvtepi16_epi32(x1);
4094 20098800 : v2 = _mm256_cvtepi16_epi32(x2);
4095 20098800 : v3 = _mm256_cvtepi16_epi32(x3);
4096 20098800 : v4 = _mm256_cvtepi16_epi32(x4);
4097 20098800 : v5 = _mm256_cvtepi16_epi32(x5);
4098 20098800 : v6 = _mm256_cvtepi16_epi32(x6);
4099 20098800 : v7 = _mm256_cvtepi16_epi32(x7);
4100 :
4101 : _mm256_storeu_si256(output + 0, v0);
4102 20098800 : _mm256_storeu_si256(output + 1, v1);
4103 20098800 : _mm256_storeu_si256(output + 2, v2);
4104 20098800 : _mm256_storeu_si256(output + 3, v3);
4105 20098800 : _mm256_storeu_si256(output + 4, v4);
4106 20098800 : _mm256_storeu_si256(output + 5, v5);
4107 20098800 : _mm256_storeu_si256(output + 6, v6);
4108 20098800 : _mm256_storeu_si256(output + 7, v7);
4109 :
4110 20098800 : input += stride;
4111 20098800 : output += 8;
4112 : }
4113 314330 : }
4114 :
4115 314332 : void eb_av1_fwd_txfm2d_64x64_avx2(int16_t *input, int32_t *output,
4116 : uint32_t stride, TxType tx_type, uint8_t bd) {
4117 : (void)bd;
4118 : __m256i in[512];
4119 314332 : __m256i *out = (__m256i *)output;
4120 314332 : const int32_t txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
4121 314332 : const int32_t txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
4122 314332 : const int8_t *shift = fwd_txfm_shift_ls[TX_64X64];
4123 :
4124 314332 : switch (tx_type) {
4125 0 : case IDTX:
4126 0 : load_buffer_64x64_avx2(input, stride, out);
4127 0 : fidtx64x64_avx2(out, in);
4128 0 : av1_round_shift_array_32_avx2(in, out, 512, -shift[1]);
4129 0 : transpose_8nx8n(out, in, 64, 64);
4130 :
4131 : /*row wise transform*/
4132 0 : fidtx64x64_avx2(in, out);
4133 0 : av1_round_shift_array_32_avx2(out, in, 512, -shift[2]);
4134 0 : transpose_8nx8n(in, out, 64, 64);
4135 0 : break;
4136 314330 : case DCT_DCT:
4137 314330 : load_buffer_64x64_avx2(input, stride, out);
4138 314333 : fdct64x64_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx]);
4139 314330 : av1_round_shift_array_32_avx2(in, out, 512, -shift[1]);
4140 314330 : transpose_8nx8n(out, in, 64, 64);
4141 :
4142 : /*row wise transform*/
4143 314333 : fdct64x64_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
4144 314332 : av1_round_shift_array_32_avx2(out, in, 512, -shift[2]);
4145 314336 : transpose_8nx8n(in, out, 64, 64);
4146 314333 : break;
4147 2 : default: assert(0);
4148 : }
4149 314333 : }
4150 :
4151 90076600 : static INLINE void load_buffer_32_avx2(const int16_t *input, __m256i *in,
4152 : int32_t stride, int32_t flipud, int32_t fliplr,
4153 : int32_t shift) {
4154 : __m128i temp[4];
4155 90076600 : if (!flipud) {
4156 90077400 : temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
4157 90077400 : temp[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
4158 90077400 : temp[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
4159 180155000 : temp[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
4160 : }
4161 : else {
4162 0 : temp[0] = _mm_load_si128((const __m128i *)(input + 3 * stride));
4163 0 : temp[1] = _mm_load_si128((const __m128i *)(input + 2 * stride));
4164 0 : temp[2] = _mm_load_si128((const __m128i *)(input + 1 * stride));
4165 0 : temp[3] = _mm_load_si128((const __m128i *)(input + 0 * stride));
4166 : }
4167 :
4168 90076600 : if (fliplr) {
4169 0 : temp[0] = mm_reverse_epi16(temp[0]);
4170 0 : temp[1] = mm_reverse_epi16(temp[1]);
4171 0 : temp[2] = mm_reverse_epi16(temp[2]);
4172 0 : temp[3] = mm_reverse_epi16(temp[3]);
4173 : }
4174 :
4175 90074900 : in[0] = _mm256_cvtepi16_epi32(temp[0]);
4176 90074900 : in[1] = _mm256_cvtepi16_epi32(temp[1]);
4177 90074900 : in[2] = _mm256_cvtepi16_epi32(temp[2]);
4178 90074900 : in[3] = _mm256_cvtepi16_epi32(temp[3]);
4179 :
4180 90074900 : in[0] = _mm256_slli_epi32(in[0], shift);
4181 90074900 : in[1] = _mm256_slli_epi32(in[1], shift);
4182 90074900 : in[2] = _mm256_slli_epi32(in[2], shift);
4183 90074900 : in[3] = _mm256_slli_epi32(in[3], shift);
4184 90074900 : }
4185 :
4186 105216000 : static INLINE void load_buffer_16_avx2(const int16_t *input, __m256i *in,
4187 : int32_t stride, int32_t flipud, int32_t fliplr,
4188 : int32_t shift) {
4189 : __m128i temp[2];
4190 105216000 : if (!flipud) {
4191 105221000 : temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
4192 210442000 : temp[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
4193 : }
4194 : else {
4195 0 : temp[0] = _mm_load_si128((const __m128i *)(input + 1 * stride));
4196 0 : temp[1] = _mm_load_si128((const __m128i *)(input + 0 * stride));
4197 : }
4198 :
4199 105216000 : if (fliplr) {
4200 0 : temp[0] = mm_reverse_epi16(temp[0]);
4201 0 : temp[1] = mm_reverse_epi16(temp[1]);
4202 : }
4203 :
4204 105226000 : in[0] = _mm256_cvtepi16_epi32(temp[0]);
4205 105226000 : in[1] = _mm256_cvtepi16_epi32(temp[1]);
4206 :
4207 105226000 : in[0] = _mm256_slli_epi32(in[0], shift);
4208 105226000 : in[1] = _mm256_slli_epi32(in[1], shift);
4209 105226000 : }
4210 :
4211 3177980 : static INLINE void load_buffer_32x8n(const int16_t *input, __m256i *out,
4212 : int32_t stride, int32_t flipud, int32_t fliplr,
4213 : int32_t shift, const int32_t height) {
4214 3177980 : const int16_t *in = input;
4215 3177980 : __m256i *output = out;
4216 60453900 : for (int32_t col = 0; col < height; col++) {
4217 57276200 : in = input + col * stride;
4218 57276200 : output = out + col * 4;
4219 57276200 : load_buffer_32_avx2(in, output, 8, flipud, fliplr, shift);
4220 : }
4221 3177780 : }
4222 :
4223 13900600 : static INLINE void load_buffer_8x16(const int16_t *input, __m256i *out,
4224 : int32_t stride, int32_t flipud, int32_t fliplr,
4225 : int32_t shift) {
4226 13900600 : const int16_t *topL = input;
4227 13900600 : const int16_t *botL = input + 8 * stride;
4228 :
4229 : const int16_t *tmp;
4230 :
4231 13900600 : if (flipud) {
4232 590827 : tmp = topL;
4233 590827 : topL = botL;
4234 590827 : botL = tmp;
4235 : }
4236 :
4237 13900600 : load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
4238 13903100 : load_buffer_8x8(botL, out + 8, stride, flipud, fliplr, shift);
4239 13903900 : }
4240 :
4241 15041400 : static INLINE void col_txfm_8x4_rounding(__m256i *in, int32_t shift) {
4242 15041400 : const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
4243 :
4244 15041400 : in[0] = _mm256_add_epi32(in[0], rounding);
4245 15041400 : in[1] = _mm256_add_epi32(in[1], rounding);
4246 15041400 : in[2] = _mm256_add_epi32(in[2], rounding);
4247 15041400 : in[3] = _mm256_add_epi32(in[3], rounding);
4248 :
4249 15041400 : in[0] = _mm256_srai_epi32(in[0], shift);
4250 15041400 : in[1] = _mm256_srai_epi32(in[1], shift);
4251 15041400 : in[2] = _mm256_srai_epi32(in[2], shift);
4252 15041400 : in[3] = _mm256_srai_epi32(in[3], shift);
4253 15041400 : }
4254 :
4255 8829060 : static INLINE void col_txfm_8x16_rounding(__m256i *in, int32_t shift) {
4256 8829060 : col_txfm_8x8_rounding(&in[0], shift);
4257 8829170 : col_txfm_8x8_rounding(&in[8], shift);
4258 8829210 : }
4259 :
4260 14563300 : static INLINE void write_buffer_16x8_avx2(const __m256i *res, int32_t *output,
4261 : const int32_t stride) {
4262 14563300 : _mm256_storeu_si256((__m256i *)(output), res[0]);
4263 14563300 : _mm256_storeu_si256((__m256i *)(output + stride), res[1]);
4264 14563300 : _mm256_storeu_si256((__m256i *)(output + (stride * 2)), res[2]);
4265 14563300 : _mm256_storeu_si256((__m256i *)(output + (stride * 3)), res[3]);
4266 14563300 : _mm256_storeu_si256((__m256i *)(output + (stride * 4)), res[4]);
4267 14563300 : _mm256_storeu_si256((__m256i *)(output + (stride * 5)), res[5]);
4268 14563300 : _mm256_storeu_si256((__m256i *)(output + (stride * 6)), res[6]);
4269 14563300 : _mm256_storeu_si256((__m256i *)(output + (stride * 7)), res[7]);
4270 14563300 : }
4271 :
4272 557018 : void eb_av1_fwd_txfm2d_32x64_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t bd)
4273 : {
4274 : (void)tx_type;
4275 : __m256i in[256];
4276 557018 : __m256i *outcoef256 = (__m256i *)output;
4277 557018 : const int8_t *shift = fwd_txfm_shift_ls[TX_32X64];
4278 557018 : const int32_t txw_idx = get_txw_idx(TX_32X64);
4279 557016 : const int32_t txh_idx = get_txh_idx(TX_32X64);
4280 557018 : const int32_t txfm_size_col = tx_size_wide[TX_32X64];
4281 557018 : const int32_t txfm_size_row = tx_size_high[TX_32X64];
4282 557018 : int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4283 557018 : int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4284 557018 : const int32_t num_row = txfm_size_row >> 3;
4285 557018 : const int32_t num_col = txfm_size_col >> 3;
4286 :
4287 : // column transform
4288 557018 : load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
4289 557019 : av1_fdct64_new_avx2(in, in, bitcol, txfm_size_col, num_col);
4290 :
4291 5012880 : for (int32_t i = 0; i < num_row; i++)
4292 4455860 : col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]);
4293 557018 : transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4294 :
4295 : // row transform
4296 557016 : av1_fdct32_new_avx2(outcoef256, in, bitrow, txfm_size_row, num_row);
4297 557021 : transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4298 557019 : av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 256, -shift[2],
4299 : NewSqrt2);
4300 : (void)bd;
4301 557015 : }
4302 :
4303 513578 : void eb_av1_fwd_txfm2d_64x32_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t bd)
4304 : {
4305 : (void)tx_type;
4306 : __m256i in[256];
4307 513578 : __m256i *outcoef256 = (__m256i *)output;
4308 513578 : const int8_t *shift = fwd_txfm_shift_ls[TX_64X32];
4309 513578 : const int32_t txw_idx = get_txw_idx(TX_64X32);
4310 513577 : const int32_t txh_idx = get_txh_idx(TX_64X32);
4311 513576 : const int32_t txfm_size_col = tx_size_wide[TX_64X32];
4312 513576 : const int32_t txfm_size_row = tx_size_high[TX_64X32];
4313 513576 : int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4314 513576 : int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4315 513576 : const int32_t num_row = txfm_size_row >> 3;
4316 513576 : const int32_t num_col = txfm_size_col >> 3;
4317 :
4318 : // column transform
4319 16943300 : for (int32_t i = 0; i < 32; i++) {
4320 16429700 : load_buffer_32_avx2(input + 0 + i * stride, in + 0 + i * 8, 8, 0, 0, shift[0]);
4321 16429700 : load_buffer_32_avx2(input + 32 + i * stride, in + 4 + i * 8, 8, 0, 0, shift[0]);
4322 : }
4323 :
4324 513592 : av1_fdct32_new_avx2(in, in, bitcol, txfm_size_col, num_col);
4325 :
4326 4622060 : for (int32_t i = 0; i < num_col; i++)
4327 4108420 : col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
4328 513641 : transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4329 :
4330 : // row transform
4331 513592 : av1_fdct64_new_avx2(outcoef256, in, bitrow, txfm_size_row, num_row);
4332 513583 : transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4333 513591 : av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 256,
4334 513591 : -shift[2], NewSqrt2);
4335 : (void)bd;
4336 513589 : }
4337 :
4338 855371 : void eb_av1_fwd_txfm2d_16x64_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t bd)
4339 : {
4340 : __m256i in[128];
4341 855371 : __m256i *outcoeff256 = (__m256i *)output;
4342 855371 : const int8_t *shift = fwd_txfm_shift_ls[TX_16X64];
4343 855371 : const int32_t txw_idx = get_txw_idx(TX_16X64);
4344 855363 : const int32_t txh_idx = get_txh_idx(TX_16X64);
4345 855367 : const int32_t txfm_size_col = tx_size_wide[TX_16X64];
4346 855367 : const int32_t txfm_size_row = tx_size_high[TX_16X64];
4347 855367 : int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4348 855367 : int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4349 : int32_t ud_flip, lr_flip;
4350 855367 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4351 855370 : const int32_t num_row = txfm_size_row >> 3;
4352 855370 : const int32_t num_col = txfm_size_col >> 3;
4353 : // col tranform
4354 28214300 : for (int32_t i = 0; i < txfm_size_row; i += num_col) {
4355 27358900 : load_buffer_16_avx2(input + (i + 0) * stride, in + (i + 0) * num_col, 8,
4356 27358900 : ud_flip, lr_flip, shift[0]);
4357 27359200 : load_buffer_16_avx2(input + (i + 1) * stride, in + (i + 1) * num_col, 8,
4358 27359200 : ud_flip, lr_flip, shift[0]);
4359 : }
4360 :
4361 855375 : av1_fdct64_new_avx2(in, outcoeff256, bitcol, txfm_size_col, num_col);
4362 :
4363 855374 : col_txfm_16x16_rounding(outcoeff256, -shift[1]);
4364 855379 : col_txfm_16x16_rounding(outcoeff256 + 32, -shift[1]);
4365 855383 : col_txfm_16x16_rounding(outcoeff256 + 64, -shift[1]);
4366 855378 : col_txfm_16x16_rounding(outcoeff256 + 96, -shift[1]);
4367 855379 : transpose_8nx8n(outcoeff256, in, txfm_size_col, txfm_size_row);
4368 : // row tranform
4369 855380 : fdct16x16_avx2(in, in, bitrow, num_row);
4370 855375 : transpose_8nx8n(in, outcoeff256, txfm_size_row, txfm_size_col);
4371 : (void)bd;
4372 855376 : }
4373 :
4374 790600 : void eb_av1_fwd_txfm2d_64x16_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t bd)
4375 : {
4376 : __m256i in[128];
4377 790600 : __m256i *outcoeff256 = (__m256i *)output;
4378 790600 : const int8_t *shift = fwd_txfm_shift_ls[TX_64X16];
4379 790600 : const int32_t txw_idx = get_txw_idx(TX_64X16);
4380 790597 : const int32_t txh_idx = get_txh_idx(TX_64X16);
4381 790600 : const int32_t txfm_size_col = tx_size_wide[TX_64X16];
4382 790600 : const int32_t txfm_size_row = tx_size_high[TX_64X16];
4383 790600 : int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4384 790600 : int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4385 : int32_t ud_flip, lr_flip;
4386 790600 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4387 790599 : const int32_t num_row = txfm_size_row >> 3;
4388 790599 : const int32_t num_col = txfm_size_col >> 3;
4389 : // col tranform
4390 13437600 : for (int32_t i = 0; i < txfm_size_row; i++) {
4391 12647000 : load_buffer_16_avx2(input + 0 + i * stride, in + 0 + i * 8, 8,
4392 12647000 : ud_flip, lr_flip, shift[0]);
4393 12647200 : load_buffer_16_avx2(input + 16 + i * stride, in + 2 + i * 8, 8,
4394 12647200 : ud_flip, lr_flip, shift[0]);
4395 12647100 : load_buffer_16_avx2(input + 32 + i * stride, in + 4 + i * 8, 8,
4396 12647100 : ud_flip, lr_flip, shift[0]);
4397 12647100 : load_buffer_16_avx2(input + 48 + i * stride, in + 6 + i * 8, 8,
4398 12647100 : ud_flip, lr_flip, shift[0]);
4399 : }
4400 :
4401 790619 : fdct16x16_avx2(in, outcoeff256, bitcol, num_col);
4402 790614 : col_txfm_16x16_rounding(outcoeff256, -shift[1]);
4403 790615 : col_txfm_16x16_rounding(outcoeff256 + 32, -shift[1]);
4404 790617 : col_txfm_16x16_rounding(outcoeff256 + 64, -shift[1]);
4405 790616 : col_txfm_16x16_rounding(outcoeff256 + 96, -shift[1]);
4406 790612 : transpose_8nx8n(outcoeff256, in, txfm_size_col, txfm_size_row);
4407 : // row tranform
4408 790620 : av1_fdct64_new_avx2(in, in, bitrow, txfm_size_row, num_row);
4409 790607 : transpose_8nx8n(in, outcoeff256, txfm_size_row, txfm_size_col);
4410 : (void)bd;
4411 790611 : }
4412 :
4413 : static const fwd_transform_1d_avx2 col_fwdtxfm_8x32_arr[TX_TYPES] = {
4414 : av1_fdct32_new_line_wraper_avx2,// DCT_DCT
4415 : NULL, // ADST_DCT
4416 : NULL, // DCT_ADST
4417 : NULL, // ADST_ADST
4418 : NULL, // FLIPADST_DCT
4419 : NULL, // DCT_FLIPADST
4420 : NULL, // FLIPADST_FLIPADST
4421 : NULL, // ADST_FLIPADST
4422 : NULL, // FLIPADST_ADST
4423 : av1_idtx32_new_avx2, // IDTX
4424 : NULL, // V_DCT
4425 : NULL, // H_DCT
4426 : NULL, // V_ADST
4427 : NULL, // H_ADST
4428 : NULL, // V_FLIPADST
4429 : NULL // H_FLIPADST
4430 : };
4431 :
4432 : static const fwd_transform_1d_avx2 row_fwdtxfm_8x32_arr[TX_TYPES] = {
4433 : fdct16x16_avx2, // DCT_DCT
4434 : NULL, // ADST_DCT
4435 : NULL, // DCT_ADST
4436 : NULL, // ADST_ADST
4437 : NULL, // FLIPADST_DCT
4438 : NULL, // DCT_FLIPADST
4439 : NULL, // FLIPADST_FLIPADST
4440 : NULL, // ADST_FLIPADST
4441 : NULL, // FLIPADST_ADST
4442 : fidtx16x16_avx2, // IDTX
4443 : NULL, // V_DCT
4444 : NULL, // H_DCT
4445 : NULL, // V_ADST
4446 : NULL, // H_ADST
4447 : NULL, // V_FLIPADST
4448 : NULL // H_FLIPADST
4449 : };
4450 :
4451 : static const fwd_transform_1d_avx2 row_fwdtxfm_32x8_arr[TX_TYPES] = {
4452 : fdct8x8_avx2, // DCT_DCT
4453 : NULL, // ADST_DCT
4454 : NULL, // DCT_ADST
4455 : NULL, // ADST_ADST
4456 : NULL, // FLIPADST_DCT
4457 : NULL, // DCT_FLIPADST
4458 : NULL, // FLIPADST_FLIPADST
4459 : NULL, // ADST_FLIPADST
4460 : NULL, // FLIPADST-ADST
4461 : fidtx32x8_avx2, // IDTX
4462 : NULL, // V_DCT
4463 : NULL, // H_DCT
4464 : NULL, // V_ADST
4465 : NULL, // H_ADST
4466 : NULL, // V_FLIPADST
4467 : NULL, // H_FLIPADST
4468 : };
4469 :
4470 : static const fwd_transform_1d_avx2 col_fwdtxfm_8x16_arr[TX_TYPES] = {
4471 : fdct16x16_avx2, // DCT_DCT
4472 : fadst16x16_avx2, // ADST_DCT
4473 : fdct16x16_avx2, // DCT_ADST
4474 : fadst16x16_avx2, // ADST_ADST
4475 : fadst16x16_avx2, // FLIPADST_DCT
4476 : fdct16x16_avx2, // DCT_FLIPADST
4477 : fadst16x16_avx2, // FLIPADST_FLIPADST
4478 : fadst16x16_avx2, // ADST_FLIPADST
4479 : fadst16x16_avx2, // FLIPADST_ADST
4480 : fidtx16x16_avx2, // IDTX
4481 : fdct16x16_avx2, // V_DCT
4482 : fidtx16x16_avx2, // H_DCT
4483 : fadst16x16_avx2, // V_ADST
4484 : fidtx16x16_avx2, // H_ADST
4485 : fadst16x16_avx2, // V_FLIPADST
4486 : fidtx16x16_avx2 // H_FLIPADST
4487 : };
4488 :
4489 : static const fwd_transform_1d_avx2 row_fwdtxfm_8x8_arr[TX_TYPES] = {
4490 : fdct8x8_avx2, // DCT_DCT
4491 : fdct8x8_avx2, // ADST_DCT
4492 : fadst8x8_avx2, // DCT_ADST
4493 : fadst8x8_avx2, // ADST_ADST
4494 : fdct8x8_avx2, // FLIPADST_DCT
4495 : fadst8x8_avx2, // DCT_FLIPADST
4496 : fadst8x8_avx2, // FLIPADST_FLIPADST
4497 : fadst8x8_avx2, // ADST_FLIPADST
4498 : fadst8x8_avx2, // FLIPADST_ADST
4499 : fidtx8x8_avx2, // IDTX
4500 : fidtx8x8_avx2, // V_DCT
4501 : fdct8x8_avx2, // H_DCT
4502 : fidtx8x8_avx2, // V_ADST
4503 : fadst8x8_avx2, // H_ADST
4504 : fidtx8x8_avx2, // V_FLIPADST
4505 : fadst8x8_avx2 // H_FLIPADST
4506 : };
4507 :
4508 : static const fwd_transform_1d_avx2 col_fwdtxfm_8x8_arr[TX_TYPES] = {
4509 : fdct8x8_avx2, // DCT_DCT
4510 : fadst8x8_avx2, // ADST_DCT
4511 : fdct8x8_avx2, // DCT_ADST
4512 : fadst8x8_avx2, // ADST_ADST
4513 : fadst8x8_avx2, // FLIPADST_DCT
4514 : fdct8x8_avx2, // DCT_FLIPADST
4515 : fadst8x8_avx2, // FLIPADST_FLIPADST
4516 : fadst8x8_avx2, // ADST_FLIPADST
4517 : fadst8x8_avx2, // FLIPADST_ADST
4518 : fidtx8x8_avx2, // IDTX
4519 : fdct8x8_avx2, // V_DCT
4520 : fidtx8x8_avx2, // H_DCT
4521 : fadst8x8_avx2, // V_ADST
4522 : fidtx8x8_avx2, // H_ADST
4523 : fadst8x8_avx2, // V_FLIPADST
4524 : fidtx8x8_avx2 // H_FLIPADST
4525 : };
4526 :
4527 : static const fwd_transform_1d_avx2 row_fwdtxfm_8x16_arr[TX_TYPES] = {
4528 : fdct16x16_avx2, // DCT_DCT
4529 : fdct16x16_avx2, // ADST_DCT
4530 : fadst16x16_avx2, // DCT_ADST
4531 : fadst16x16_avx2, // ADST_ADST
4532 : fdct16x16_avx2, // FLIPADST_DCT
4533 : fadst16x16_avx2, // DCT_FLIPADST
4534 : fadst16x16_avx2, // FLIPADST_FLIPADST
4535 : fadst16x16_avx2, // ADST_FLIPADST
4536 : fadst16x16_avx2, // FLIPADST_ADST
4537 : fidtx16x16_avx2, // IDTX
4538 : fidtx16x16_avx2, // V_DCT
4539 : fdct16x16_avx2, // H_DCT
4540 : fidtx16x16_avx2, // V_ADST
4541 : fadst16x16_avx2, // H_ADST
4542 : fidtx16x16_avx2, // V_FLIPADST
4543 : fadst16x16_avx2 // H_FLIPADST
4544 : };
4545 :
4546 : /* call this function only for DCT_DCT, IDTX */
4547 2006840 : void eb_av1_fwd_txfm2d_16x32_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t bd)
4548 : {
4549 : __m256i in[64];
4550 2006840 : __m256i *outcoef256 = (__m256i *)output;
4551 2006840 : const int8_t *shift = fwd_txfm_shift_ls[TX_16X32];
4552 2006840 : const int32_t txw_idx = get_txw_idx(TX_16X32);
4553 2006820 : const int32_t txh_idx = get_txh_idx(TX_16X32);
4554 2006850 : const fwd_transform_1d_avx2 col_txfm = col_fwdtxfm_8x32_arr[tx_type];
4555 2006850 : const fwd_transform_1d_avx2 row_txfm = row_fwdtxfm_8x32_arr[tx_type];
4556 2006850 : int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4557 2006850 : int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4558 2006850 : const int32_t txfm_size_col = tx_size_wide[TX_16X32];
4559 2006850 : const int32_t txfm_size_row = tx_size_high[TX_16X32];
4560 2006850 : const int32_t num_row = txfm_size_row >> 3;
4561 2006850 : const int32_t num_col = txfm_size_col >> 3;
4562 :
4563 : // column transform
4564 2006850 : load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
4565 2006910 : load_buffer_16x16(input + 16 * stride, in + 32, stride, 0, 0, shift[0]);
4566 :
4567 6020630 : for (int32_t i = 0; i < num_col; i++)
4568 4013700 : col_txfm((in + i), (in + i), bitcol, num_col);
4569 2006930 : col_txfm_16x16_rounding(&in[0], -shift[1]);
4570 2006930 : col_txfm_16x16_rounding(&in[32], -shift[1]);
4571 2006940 : transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4572 :
4573 : // row transform
4574 2006950 : row_txfm(outcoef256, in, bitrow, num_row);
4575 2006910 : transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4576 2006940 : av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 64, -shift[2],
4577 : NewSqrt2);
4578 : (void)bd;
4579 2006880 : }
4580 :
4581 : /* call this function only for IDTX */
4582 87136 : void eb_av1_fwd_txfm2d_32x16_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t bd)
4583 : {
4584 : __m256i in[64];
4585 87136 : __m256i *outcoef256 = (__m256i *)output;
4586 87136 : const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
4587 87136 : const int32_t txw_idx = get_txw_idx(TX_32X16);
4588 87135 : const int32_t txh_idx = get_txh_idx(TX_32X16);
4589 87135 : const fwd_transform_1d_avx2 col_txfm = row_fwdtxfm_8x32_arr[tx_type];
4590 87135 : const fwd_transform_1d_avx2 row_txfm = col_fwdtxfm_8x32_arr[tx_type];
4591 87135 : int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4592 87135 : int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4593 87135 : const int32_t txfm_size_col = tx_size_wide[TX_32X16];
4594 87135 : const int32_t txfm_size_row = tx_size_high[TX_32X16];
4595 87135 : const int32_t num_row = txfm_size_row >> 3;
4596 87135 : const int32_t num_col = txfm_size_col >> 3;
4597 :
4598 : // column transform
4599 87135 : load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
4600 87138 : col_txfm(in, in, bitcol, num_col);
4601 87139 : col_txfm_16x16_rounding(&in[0], -shift[1]);
4602 87139 : col_txfm_16x16_rounding(&in[32], -shift[1]);
4603 87139 : transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4604 :
4605 : // row transform
4606 261417 : for (int32_t i = 0; i < num_row; i++)
4607 174278 : row_txfm((outcoef256 + i), (in + i), bitrow, num_row);
4608 87139 : transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4609 87139 : av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 64, -shift[2],
4610 : NewSqrt2);
4611 : (void)bd;
4612 87138 : }
4613 :
4614 : /* call this function only for DCT_DCT, IDTX */
4615 2537690 : void eb_av1_fwd_txfm2d_8x32_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t bd)
4616 : {
4617 : __m256i in[32];
4618 2537690 : __m256i *outcoef256 = (__m256i *)output;
4619 2537690 : const int8_t *shift = fwd_txfm_shift_ls[TX_8X32];
4620 2537690 : const int32_t txw_idx = get_txw_idx(TX_8X32);
4621 2537660 : const int32_t txh_idx = get_txh_idx(TX_8X32);
4622 2537710 : const fwd_transform_1d_avx2 col_txfm = col_fwdtxfm_8x32_arr[tx_type];
4623 2537710 : const fwd_transform_1d_avx2 row_txfm = row_fwdtxfm_32x8_arr[tx_type];
4624 2537710 : int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4625 2537710 : int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4626 :
4627 2537710 : const int32_t txfm_size_col = tx_size_wide[TX_8X32];
4628 2537710 : const int32_t txfm_size_row = tx_size_high[TX_8X32];
4629 2537710 : const int32_t num_row = txfm_size_row >> 3;
4630 2537710 : const int32_t num_col = txfm_size_col >> 3;
4631 :
4632 : // column transform
4633 2537710 : load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
4634 2537830 : load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + 16,
4635 2537830 : stride, 0, 0, shift[0]);
4636 :
4637 2537810 : col_txfm(in, in, bitcol, num_col);
4638 2537770 : col_txfm_16x16_rounding(in, -shift[1]);
4639 2537820 : transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4640 :
4641 : // row transform
4642 12688000 : for (int32_t i = 0; i < num_row; i++)
4643 10150200 : row_txfm((outcoef256 + i), (in + i), bitrow, num_row);
4644 2537840 : transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4645 : (void)bd;
4646 2537860 : }
4647 :
4648 : /* call this function only for DCT_DCT, IDTX */
4649 2533910 : void eb_av1_fwd_txfm2d_32x8_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t bd)
4650 : {
4651 : __m256i in[32];
4652 2533910 : __m256i *outcoef256 = (__m256i *)output;
4653 2533910 : const int8_t *shift = fwd_txfm_shift_ls[TX_32X8];
4654 2533910 : const int32_t txw_idx = get_txw_idx(TX_32X8);
4655 2533860 : const int32_t txh_idx = get_txh_idx(TX_32X8);
4656 2533920 : const fwd_transform_1d_avx2 col_txfm = row_fwdtxfm_32x8_arr[tx_type];
4657 2533920 : const fwd_transform_1d_avx2 row_txfm = col_fwdtxfm_8x32_arr[tx_type];
4658 2533920 : int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4659 2533920 : int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4660 :
4661 2533920 : const int32_t txfm_size_col = tx_size_wide[TX_32X8];
4662 2533920 : const int32_t txfm_size_row = tx_size_high[TX_32X8];
4663 2533920 : const int32_t num_row = txfm_size_row >> 3;
4664 2533920 : const int32_t num_col = txfm_size_col >> 3;
4665 :
4666 : // column transform
4667 2533920 : load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
4668 12669100 : for (int32_t i = 0; i < num_col; i++)
4669 10135100 : col_txfm((in + i), (in + i), bitcol, num_col);
4670 2534070 : col_txfm_16x16_rounding(&in[0], -shift[1]);
4671 2534070 : transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4672 :
4673 : // row transform
4674 2534090 : row_txfm(outcoef256, in, bitrow, num_row);
4675 :
4676 2534040 : transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4677 : (void)bd;
4678 2534100 : }
4679 :
4680 : /* call this function for all 16 transform types */
4681 8828150 : void eb_av1_fwd_txfm2d_8x16_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t bd)
4682 : {
4683 : __m256i in[16], out[16];
4684 8828150 : const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
4685 8828150 : const int32_t txw_idx = get_txw_idx(TX_8X16);
4686 8827910 : const int32_t txh_idx = get_txh_idx(TX_8X16);
4687 8828000 : const fwd_transform_1d_avx2 col_txfm = col_fwdtxfm_8x16_arr[tx_type];
4688 8828000 : const fwd_transform_1d_avx2 row_txfm = row_fwdtxfm_8x8_arr[tx_type];
4689 8828000 : int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4690 8828000 : int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4691 : int32_t ud_flip, lr_flip;
4692 8828000 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4693 8828040 : const int32_t txfm_size_col = tx_size_wide[TX_8X16];
4694 8828040 : const int32_t txfm_size_row = tx_size_high[TX_8X16];
4695 8828040 : const int32_t num_row = txfm_size_row >> 3;
4696 8828040 : const int32_t num_col = txfm_size_col >> 3;
4697 :
4698 8828040 : load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
4699 : // column transform
4700 8829600 : col_txfm(in, in, bitcol, num_col);
4701 8829190 : col_txfm_8x16_rounding(in, -shift[1]);
4702 8829310 : transpose_8x8_avx2(in, out);
4703 8829390 : transpose_8x8_avx2(in + 8, out + 8);
4704 :
4705 : // row transform
4706 26485300 : for (int32_t i = 0; i < num_row; i++) {
4707 17655700 : row_txfm(out + i * 8, out, bitrow, 1);
4708 17656400 : transpose_8x8_avx2(out, in);
4709 17657100 : av1_round_shift_rect_array_32_avx2(in, in, 8, -shift[2], NewSqrt2);
4710 17656700 : write_buffer_8x8(in, output + i * 64);
4711 : }
4712 : (void)bd;
4713 8829630 : }
4714 :
4715 : /* call this function for all 16 transform types */
4716 7281340 : void eb_av1_fwd_txfm2d_16x8_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t bd)
4717 : {
4718 7281340 : __m256i in[16], out[16]={0};
4719 7281340 : const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
4720 7281340 : const int32_t txw_idx = get_txw_idx(TX_16X8);
4721 7281320 : const int32_t txh_idx = get_txh_idx(TX_16X8);
4722 7281520 : const fwd_transform_1d_avx2 col_txfm = col_fwdtxfm_8x8_arr[tx_type];
4723 7281520 : const fwd_transform_1d_avx2 row_txfm = row_fwdtxfm_8x16_arr[tx_type];
4724 7281520 : int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4725 7281520 : int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4726 : int32_t ud_flip, lr_flip;
4727 7281520 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4728 7281460 : const int32_t txfm_size_col = tx_size_wide[TX_16X8];
4729 7281460 : const int32_t txfm_size_row = tx_size_high[TX_16X8];
4730 7281460 : const int32_t num_row = txfm_size_row >> 3;
4731 7281460 : const int32_t num_col = txfm_size_col >> 3;
4732 :
4733 : // column transform
4734 21843500 : for (int32_t i = 0; i < num_col; i++) {
4735 14561100 : load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
4736 14562800 : col_txfm(in, in, bitcol, 1);
4737 14563300 : col_txfm_8x8_rounding(in, -shift[1]);
4738 14563200 : transpose_8x8_avx2(in, out + i * 8);
4739 : }
4740 :
4741 : // row transform
4742 7282450 : if (lr_flip) {
4743 8039470 : for (int32_t i = 0; i < 16; i++)
4744 7566550 : in[16 - i - 1] = out[i];
4745 472917 : row_txfm(in, out, bitrow, num_row);
4746 : }
4747 : else
4748 6809530 : row_txfm(out, out, bitrow, num_row);
4749 :
4750 21844100 : for (int32_t i = 0; i < num_col; i++) {
4751 14561700 : transpose_8x8_avx2(out + i * 8, in);
4752 14563100 : av1_round_shift_rect_array_32_avx2(in, in, 8, -shift[2], NewSqrt2);
4753 14562900 : write_buffer_16x8_avx2(in, output + i * 8, 16);
4754 : }
4755 : (void)bd;
4756 7282440 : }
4757 :
4758 7683900 : void eb_av1_fwd_txfm2d_4x8_avx2(int16_t *input, int32_t *output, uint32_t stride,
4759 : TxType tx_type, uint8_t bd)
4760 : {
4761 : __m256i in[4];
4762 : __m256i outcoeff256[4];
4763 :
4764 7683900 : const int8_t *shift = fwd_txfm_shift_ls[TX_4X8];
4765 7683900 : const int32_t txw_idx = get_txw_idx(TX_4X8);
4766 7683730 : const int32_t txh_idx = get_txh_idx(TX_4X8);
4767 7685200 : int32_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4768 7685200 : int32_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4769 :
4770 7685200 : switch (tx_type) {
4771 4507820 : case DCT_DCT:
4772 4507820 : load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
4773 4508390 : fdct4x8_avx2(in, in, bitcol);
4774 4508500 : col_txfm_8x4_rounding(in, -shift[1]);
4775 4508440 : transpose_4x8_avx2(in, outcoeff256);
4776 4508160 : fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
4777 4508310 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4778 4508280 : write_buffer_4x8(outcoeff256, output);
4779 4508180 : break;
4780 729726 : case ADST_DCT:
4781 729726 : load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
4782 729749 : fadst8x4_avx2(in, in, bitcol, 1);
4783 729752 : col_txfm_8x4_rounding(in, -shift[1]);
4784 729751 : transpose_4x8_avx2(in, outcoeff256);
4785 729754 : fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
4786 729753 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4787 729747 : write_buffer_4x8(outcoeff256, output);
4788 729746 : break;
4789 746998 : case DCT_ADST:
4790 746998 : load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
4791 747019 : fdct4x8_avx2(in, in, bitcol);
4792 747024 : col_txfm_8x4_rounding(in, -shift[1]);
4793 747024 : transpose_4x8_avx2(in, outcoeff256);
4794 747013 : fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
4795 747024 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4796 747025 : write_buffer_4x8(outcoeff256, output);
4797 747023 : break;
4798 430724 : case ADST_ADST:
4799 430724 : load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
4800 430735 : fadst8x4_avx2(in, in, bitcol, 1);
4801 430734 : col_txfm_8x4_rounding(in, -shift[1]);
4802 430731 : transpose_4x8_avx2(in, outcoeff256);
4803 430729 : fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
4804 430728 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4805 430727 : write_buffer_4x8(outcoeff256, output);
4806 430726 : break;
4807 81813 : case FLIPADST_DCT:
4808 81813 : load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
4809 81813 : fadst8x4_avx2(in, in, bitcol, 1);
4810 81813 : col_txfm_8x4_rounding(in, -shift[1]);
4811 81813 : transpose_4x8_avx2(in, outcoeff256);
4812 81813 : fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
4813 81813 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4814 81813 : write_buffer_4x8(outcoeff256, output);
4815 81813 : break;
4816 81707 : case DCT_FLIPADST:
4817 81707 : load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
4818 81707 : fdct4x8_avx2(in, in, bitcol);
4819 81707 : col_txfm_8x4_rounding(in, -shift[1]);
4820 81707 : transpose_4x8_avx2(in, outcoeff256);
4821 81707 : fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
4822 81707 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4823 81707 : write_buffer_4x8(outcoeff256, output);
4824 81707 : break;
4825 82687 : case FLIPADST_FLIPADST:
4826 82687 : load_buffer_4x8_avx2(input, in, stride, 1, 1, shift[0]);
4827 82687 : fadst8x4_avx2(in, in, bitcol, 1);
4828 82688 : col_txfm_8x4_rounding(in, -shift[1]);
4829 82688 : transpose_4x8_avx2(in, outcoeff256);
4830 82688 : fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
4831 82687 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4832 82688 : write_buffer_4x8(outcoeff256, output);
4833 82688 : break;
4834 83243 : case ADST_FLIPADST:
4835 83243 : load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
4836 83244 : fadst8x4_avx2(in, in, bitcol, 1);
4837 83244 : col_txfm_8x4_rounding(in, -shift[1]);
4838 83244 : transpose_4x8_avx2(in, outcoeff256);
4839 83244 : fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
4840 83244 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4841 83243 : write_buffer_4x8(outcoeff256, output);
4842 83243 : break;
4843 84355 : case FLIPADST_ADST:
4844 84355 : load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
4845 84356 : fadst8x4_avx2(in, in, bitcol, 1);
4846 84356 : col_txfm_8x4_rounding(in, -shift[1]);
4847 84356 : transpose_4x8_avx2(in, outcoeff256);
4848 84356 : fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
4849 84356 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4850 84356 : write_buffer_4x8(outcoeff256, output);
4851 84356 : break;
4852 168663 : case IDTX:
4853 168663 : load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
4854 168663 : fidtx8x4_avx2(in, in, bitcol);
4855 168663 : col_txfm_8x4_rounding(in, -shift[1]);
4856 168663 : transpose_4x8_avx2(in, outcoeff256);
4857 168663 : fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
4858 168665 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4859 168664 : write_buffer_4x8(outcoeff256, output);
4860 168664 : break;
4861 169658 : case V_DCT:
4862 169658 : load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
4863 169661 : fdct4x8_avx2(in, in, bitcol);
4864 169660 : col_txfm_8x4_rounding(in, -shift[1]);
4865 169658 : transpose_4x8_avx2(in, outcoeff256);
4866 169658 : fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
4867 169659 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4868 169659 : write_buffer_4x8(outcoeff256, output);
4869 169659 : break;
4870 180298 : case H_DCT:
4871 180298 : load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
4872 180300 : fidtx8x4_avx2(in, in, bitcol);
4873 180300 : col_txfm_8x4_rounding(in, -shift[1]);
4874 180300 : transpose_4x8_avx2(in, outcoeff256);
4875 180299 : fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
4876 180298 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4877 180299 : write_buffer_4x8(outcoeff256, output);
4878 180299 : break;
4879 82169 : case V_ADST:
4880 82169 : load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
4881 82169 : fadst8x4_avx2(in, in, bitcol, 1);
4882 82169 : col_txfm_8x4_rounding(in, -shift[1]);
4883 82169 : transpose_4x8_avx2(in, outcoeff256);
4884 82169 : fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
4885 82169 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4886 82169 : write_buffer_4x8(outcoeff256, output);
4887 82169 : break;
4888 88118 : case H_ADST:
4889 88118 : load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
4890 88118 : fidtx8x4_avx2(in, in, bitcol);
4891 88118 : col_txfm_8x4_rounding(in, -shift[1]);
4892 88118 : transpose_4x8_avx2(in, outcoeff256);
4893 88118 : fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
4894 88118 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4895 88118 : write_buffer_4x8(outcoeff256, output);
4896 88117 : break;
4897 80870 : case V_FLIPADST:
4898 80870 : load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
4899 80870 : fadst8x4_avx2(in, in, bitcol, 1);
4900 80870 : col_txfm_8x4_rounding(in, -shift[1]);
4901 80870 : transpose_4x8_avx2(in, outcoeff256);
4902 80870 : fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
4903 80869 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4904 80870 : write_buffer_4x8(outcoeff256, output);
4905 80870 : break;
4906 86352 : case H_FLIPADST:
4907 86352 : load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
4908 86352 : fidtx8x4_avx2(in, in, bitcol);
4909 86352 : col_txfm_8x4_rounding(in, -shift[1]);
4910 86352 : transpose_4x8_avx2(in, outcoeff256);
4911 86352 : fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
4912 86352 : av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
4913 86352 : write_buffer_4x8(outcoeff256, output);
4914 86352 : break;
4915 0 : default: assert(0);
4916 : }
4917 : (void)bd;
4918 7685610 : }
4919 :
4920 7357290 : void eb_av1_fwd_txfm2d_8x4_avx2(int16_t *input, int32_t *output, uint32_t stride,
4921 : TxType tx_type, uint8_t bd)
4922 : {
4923 : __m256i in[4];
4924 7357290 : __m256i *outcoeff256 = (__m256i *)output;
4925 7357290 : const int8_t *shift = fwd_txfm_shift_ls[TX_8X4];
4926 7357290 : const int32_t txw_idx = get_txw_idx(TX_8X4);
4927 7357170 : const int32_t txh_idx = get_txh_idx(TX_8X4);
4928 7358570 : int32_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
4929 7358570 : int32_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
4930 :
4931 7358570 : switch (tx_type) {
4932 4264560 : case DCT_DCT:
4933 4264560 : load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
4934 4265090 : fdct4x8_row_avx2(in, in, bitcol, 1);
4935 4265120 : col_txfm_8x4_rounding(in, -shift[1]);
4936 4265080 : fdct4x8_avx2(in, outcoeff256, bitrow);
4937 4265130 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
4938 : NewSqrt2);
4939 4264980 : transpose_4x8_avx2(in, outcoeff256);
4940 4264940 : break;
4941 697618 : case ADST_DCT:
4942 697618 : load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
4943 697645 : fadst4x8_row_avx2(in, in, bitcol, 1);
4944 697642 : col_txfm_8x4_rounding(in, -shift[1]);
4945 697638 : fdct4x8_avx2(in, outcoeff256, bitrow);
4946 697648 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
4947 : NewSqrt2);
4948 697641 : transpose_4x8_avx2(in, outcoeff256);
4949 697639 : break;
4950 695527 : case DCT_ADST:
4951 695527 : load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
4952 695550 : fdct4x8_row_avx2(in, in, bitcol, 1);
4953 695545 : col_txfm_8x4_rounding(in, -shift[1]);
4954 695544 : fadst8x4_avx2(in, outcoeff256, bitrow, 1);
4955 695547 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
4956 : NewSqrt2);
4957 695544 : transpose_4x8_avx2(in, outcoeff256);
4958 695545 : break;
4959 415524 : case ADST_ADST:
4960 415524 : load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
4961 415531 : fadst4x8_row_avx2(in, in, bitcol, 1);
4962 415535 : col_txfm_8x4_rounding(in, -shift[1]);
4963 415533 : fadst8x4_avx2(in, outcoeff256, bitrow, 1);
4964 415536 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
4965 : NewSqrt2);
4966 415533 : transpose_4x8_avx2(in, outcoeff256);
4967 415532 : break;
4968 82951 : case FLIPADST_DCT:
4969 82951 : load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
4970 82951 : fadst4x8_row_avx2(in, in, bitcol, 1);
4971 82951 : col_txfm_8x4_rounding(in, -shift[1]);
4972 82951 : fdct4x8_avx2(in, outcoeff256, bitrow);
4973 82951 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
4974 : NewSqrt2);
4975 82951 : transpose_4x8_avx2(in, outcoeff256);
4976 82951 : break;
4977 83444 : case DCT_FLIPADST:
4978 83444 : load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
4979 83443 : fdct4x8_row_avx2(in, in, bitcol, 1);
4980 83444 : col_txfm_8x4_rounding(in, -shift[1]);
4981 83444 : fadst8x4_avx2(in, outcoeff256, bitrow, 1);
4982 83444 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
4983 : NewSqrt2);
4984 83444 : transpose_4x8_avx2(in, outcoeff256);
4985 83444 : break;
4986 84209 : case FLIPADST_FLIPADST:
4987 84209 : load_buffer_8x4_avx2(input, in, stride, 1, 1, shift[0]);
4988 84210 : fadst4x8_row_avx2(in, in, bitcol, 1);
4989 84210 : col_txfm_8x4_rounding(in, -shift[1]);
4990 84210 : fadst8x4_avx2(in, outcoeff256, bitrow, 1);
4991 84209 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
4992 : NewSqrt2);
4993 84210 : transpose_4x8_avx2(in, outcoeff256);
4994 84210 : break;
4995 85842 : case ADST_FLIPADST:
4996 85842 : load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
4997 85842 : fadst4x8_row_avx2(in, in, bitcol, 1);
4998 85842 : col_txfm_8x4_rounding(in, -shift[1]);
4999 85842 : fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5000 85842 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
5001 : NewSqrt2);
5002 85842 : transpose_4x8_avx2(in, outcoeff256);
5003 85842 : break;
5004 84843 : case FLIPADST_ADST:
5005 84843 : load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
5006 84842 : fadst4x8_row_avx2(in, in, bitcol, 1);
5007 84843 : col_txfm_8x4_rounding(in, -shift[1]);
5008 84843 : fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5009 84843 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
5010 : NewSqrt2);
5011 84843 : transpose_4x8_avx2(in, outcoeff256);
5012 84843 : break;
5013 171650 : case IDTX:
5014 171650 : load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5015 171653 : fidtx4x8_row_avx2(in, in, bitcol, 1);
5016 171653 : col_txfm_8x4_rounding(in, -shift[1]);
5017 171653 : fidtx8x4_avx2(in, outcoeff256, bitrow);
5018 171653 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
5019 : NewSqrt2);
5020 171652 : transpose_4x8_avx2(in, outcoeff256);
5021 171652 : break;
5022 174260 : case V_DCT:
5023 174260 : load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5024 174262 : fdct4x8_row_avx2(in, in, bitcol, 1);
5025 174262 : col_txfm_8x4_rounding(in, -shift[1]);
5026 174261 : fidtx8x4_avx2(in, outcoeff256, bitrow);
5027 174261 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
5028 : NewSqrt2);
5029 174260 : transpose_4x8_avx2(in, outcoeff256);
5030 174260 : break;
5031 178111 : case H_DCT:
5032 178111 : load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5033 178112 : fidtx4x8_row_avx2(in, in, bitcol, 1);
5034 178112 : col_txfm_8x4_rounding(in, -shift[1]);
5035 178113 : fdct4x8_avx2(in, outcoeff256, bitrow);
5036 178113 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
5037 : NewSqrt2);
5038 178112 : transpose_4x8_avx2(in, outcoeff256);
5039 178112 : break;
5040 85755 : case V_ADST:
5041 85755 : load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5042 85755 : fadst4x8_row_avx2(in, in, bitcol, 1);
5043 85755 : col_txfm_8x4_rounding(in, -shift[1]);
5044 85755 : fidtx8x4_avx2(in, outcoeff256, bitrow);
5045 85755 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
5046 : NewSqrt2);
5047 85755 : transpose_4x8_avx2(in, outcoeff256);
5048 85755 : break;
5049 85567 : case H_ADST:
5050 85567 : load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5051 85567 : fidtx4x8_row_avx2(in, in, bitcol, 1);
5052 85567 : col_txfm_8x4_rounding(in, -shift[1]);
5053 85567 : fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5054 85567 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
5055 : NewSqrt2);
5056 85567 : transpose_4x8_avx2(in, outcoeff256);
5057 85567 : break;
5058 83485 : case V_FLIPADST:
5059 83485 : load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
5060 83485 : fadst4x8_row_avx2(in, in, bitcol, 1);
5061 83485 : col_txfm_8x4_rounding(in, -shift[1]);
5062 83485 : fidtx8x4_avx2(in, outcoeff256, bitrow);
5063 83485 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
5064 : NewSqrt2);
5065 83485 : transpose_4x8_avx2(in, outcoeff256);
5066 83485 : break;
5067 85222 : case H_FLIPADST:
5068 85222 : load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
5069 85222 : fidtx4x8_row_avx2(in, in, bitcol, 1);
5070 85222 : col_txfm_8x4_rounding(in, -shift[1]);
5071 85222 : fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5072 85222 : av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
5073 : NewSqrt2);
5074 85222 : transpose_4x8_avx2(in, outcoeff256);
5075 85222 : break;
5076 0 : default: assert(0);
5077 : }
5078 : (void)bd;
5079 7359000 : }
5080 :
5081 5564310 : void eb_av1_fwd_txfm2d_4x16_avx2(int16_t *input, int32_t *output, uint32_t stride,
5082 : TxType tx_type, uint8_t bd)
5083 : {
5084 : __m256i in[8];
5085 : __m256i outcoeff256[8];
5086 5564310 : const int8_t *shift = fwd_txfm_shift_ls[TX_4X16];
5087 5564310 : const int32_t txw_idx = get_txw_idx(TX_4X16);
5088 5564260 : const int32_t txh_idx = get_txh_idx(TX_4X16);
5089 5564950 : int32_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
5090 5564950 : int32_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
5091 :
5092 5564950 : switch (tx_type) {
5093 2951090 : case DCT_DCT:
5094 2951090 : load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5095 2951320 : fdct16x4_avx2(in, outcoeff256, bitcol);
5096 2951280 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5097 2951270 : transpose_4x16_avx2(outcoeff256, in);
5098 8853730 : for (int32_t i = 0; i < 2; i++)
5099 5902410 : fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5100 2951320 : write_buffer_8x8(outcoeff256, output);
5101 2951250 : break;
5102 342812 : case ADST_DCT:
5103 342812 : load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5104 342821 : fadst16x4_avx2(in, outcoeff256, bitcol);
5105 342817 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5106 342819 : transpose_4x16_avx2(outcoeff256, in);
5107 1028460 : for (int32_t i = 0; i < 2; i++)
5108 685636 : fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5109 342821 : write_buffer_8x8(outcoeff256, output);
5110 342820 : break;
5111 354921 : case DCT_ADST:
5112 354921 : load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5113 354925 : fdct16x4_avx2(in, outcoeff256, bitcol);
5114 354926 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5115 354927 : transpose_4x16_avx2(outcoeff256, in);
5116 1064760 : for (int32_t i = 0; i < 2; i++)
5117 709839 : fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5118 354926 : write_buffer_8x8(outcoeff256, output);
5119 354924 : break;
5120 267241 : case ADST_ADST:
5121 267241 : load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5122 267244 : fadst16x4_avx2(in, outcoeff256, bitcol);
5123 267242 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5124 267244 : transpose_4x16_avx2(outcoeff256, in);
5125 801731 : for (int32_t i = 0; i < 2; i++)
5126 534487 : fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5127 267244 : write_buffer_8x8(outcoeff256, output);
5128 267242 : break;
5129 118347 : case FLIPADST_DCT:
5130 118347 : load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
5131 118347 : fadst16x4_avx2(in, outcoeff256, bitcol);
5132 118348 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5133 118347 : transpose_4x16_avx2(outcoeff256, in);
5134 355043 : for (int32_t i = 0; i < 2; i++)
5135 236695 : fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5136 118348 : write_buffer_8x8(outcoeff256, output);
5137 118348 : break;
5138 118830 : case DCT_FLIPADST:
5139 118830 : load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
5140 118830 : fdct16x4_avx2(in, outcoeff256, bitcol);
5141 118830 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5142 118830 : transpose_4x16_avx2(outcoeff256, in);
5143 356490 : for (int32_t i = 0; i < 2; i++)
5144 237660 : fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5145 118830 : write_buffer_8x8(outcoeff256, output);
5146 118830 : break;
5147 118952 : case FLIPADST_FLIPADST:
5148 118952 : load_buffer_4x16_avx2(input, in, stride, 1, 1, shift[0]);
5149 118952 : fadst16x4_avx2(in, outcoeff256, bitcol);
5150 118952 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5151 118952 : transpose_4x16_avx2(outcoeff256, in);
5152 356852 : for (int32_t i = 0; i < 2; i++)
5153 237902 : fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5154 118950 : write_buffer_8x8(outcoeff256, output);
5155 118951 : break;
5156 119179 : case ADST_FLIPADST:
5157 119179 : load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
5158 119179 : fadst16x4_avx2(in, outcoeff256, bitcol);
5159 119179 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5160 119179 : transpose_4x16_avx2(outcoeff256, in);
5161 357537 : for (int32_t i = 0; i < 2; i++)
5162 238358 : fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5163 119179 : write_buffer_8x8(outcoeff256, output);
5164 119179 : break;
5165 119410 : case FLIPADST_ADST:
5166 119410 : load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
5167 119410 : fadst16x4_avx2(in, outcoeff256, bitcol);
5168 119410 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5169 119410 : transpose_4x16_avx2(outcoeff256, in);
5170 358230 : for (int32_t i = 0; i < 2; i++)
5171 238820 : fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5172 119410 : write_buffer_8x8(outcoeff256, output);
5173 119410 : break;
5174 193807 : case IDTX:
5175 193807 : load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5176 193809 : fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
5177 193809 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5178 193809 : transpose_4x16_avx2(outcoeff256, in);
5179 581427 : for (int32_t i = 0; i < 2; i++)
5180 387618 : fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5181 193809 : write_buffer_8x8(outcoeff256, output);
5182 193809 : break;
5183 185717 : case V_DCT:
5184 185717 : load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5185 185718 : fdct16x4_avx2(in, outcoeff256, bitcol);
5186 185717 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5187 185717 : transpose_4x16_avx2(outcoeff256, in);
5188 557149 : for (int32_t i = 0; i < 2; i++)
5189 371433 : fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5190 185716 : write_buffer_8x8(outcoeff256, output);
5191 185718 : break;
5192 197130 : case H_DCT:
5193 197130 : load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5194 197131 : fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
5195 197130 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5196 197130 : transpose_4x16_avx2(outcoeff256, in);
5197 591390 : for (int32_t i = 0; i < 2; i++)
5198 394259 : fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5199 197131 : write_buffer_8x8(outcoeff256, output);
5200 197131 : break;
5201 118780 : case V_ADST:
5202 118780 : load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5203 118780 : fadst16x4_avx2(in, outcoeff256, bitcol);
5204 118780 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5205 118780 : transpose_4x16_avx2(outcoeff256, in);
5206 356340 : for (int32_t i = 0; i < 2; i++)
5207 237560 : fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5208 118780 : write_buffer_8x8(outcoeff256, output);
5209 118780 : break;
5210 120282 : case H_ADST:
5211 120282 : load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5212 120282 : fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
5213 120282 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5214 120282 : transpose_4x16_avx2(outcoeff256, in);
5215 360846 : for (int32_t i = 0; i < 2; i++)
5216 240564 : fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5217 120282 : write_buffer_8x8(outcoeff256, output);
5218 120282 : break;
5219 118469 : case V_FLIPADST:
5220 118469 : load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
5221 118469 : fadst16x4_avx2(in, outcoeff256, bitcol);
5222 118468 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5223 118468 : transpose_4x16_avx2(outcoeff256, in);
5224 355406 : for (int32_t i = 0; i < 2; i++)
5225 236937 : fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5226 118469 : write_buffer_8x8(outcoeff256, output);
5227 118469 : break;
5228 119980 : case H_FLIPADST:
5229 119980 : load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
5230 119981 : fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
5231 119981 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5232 119981 : transpose_4x16_avx2(outcoeff256, in);
5233 359943 : for (int32_t i = 0; i < 2; i++)
5234 239962 : fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5235 119981 : write_buffer_8x8(outcoeff256, output);
5236 119981 : break;
5237 0 : default: assert(0);
5238 : }
5239 : (void)bd;
5240 5565120 : }
5241 :
5242 5777380 : void eb_av1_fwd_txfm2d_16x4_avx2(int16_t *input, int32_t *output, uint32_t stride,
5243 : TxType tx_type, uint8_t bd) {
5244 : __m256i in[8];
5245 5777380 : __m256i *outcoeff256 = (__m256i *)output;
5246 5777380 : const int8_t *shift = fwd_shift_16x4;
5247 5777380 : const int32_t txw_idx = get_txw_idx(TX_16X4);
5248 5777320 : const int32_t txh_idx = get_txh_idx(TX_16X4);
5249 5778010 : int32_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
5250 5778010 : int32_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
5251 :
5252 5778010 : switch (tx_type) {
5253 3049470 : case DCT_DCT:
5254 3049470 : load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5255 9148650 : for (int32_t i = 0; i < 2; i++)
5256 6098900 : fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5257 3049750 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5258 3049630 : fdct16x4_avx2(outcoeff256, in, bitrow);
5259 3049600 : transpose_4x16_avx2(in, outcoeff256);
5260 3049610 : break;
5261 363028 : case ADST_DCT:
5262 363028 : load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5263 1089090 : for (int32_t i = 0; i < 2; i++)
5264 726053 : fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5265 363035 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5266 363027 : fdct16x4_avx2(outcoeff256, in, bitrow);
5267 363028 : transpose_4x16_avx2(in, outcoeff256);
5268 363028 : break;
5269 356091 : case DCT_ADST:
5270 356091 : load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5271 1068270 : for (int32_t i = 0; i < 2; i++)
5272 712181 : fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5273 356091 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5274 356090 : fadst16x4_avx2(outcoeff256, in, bitrow);
5275 356092 : transpose_4x16_avx2(in, outcoeff256);
5276 356091 : break;
5277 277087 : case ADST_ADST:
5278 277087 : load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5279 831259 : for (int32_t i = 0; i < 2; i++)
5280 554174 : fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5281 277085 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5282 277088 : fadst16x4_avx2(outcoeff256, in, bitrow);
5283 277087 : transpose_4x16_avx2(in, outcoeff256);
5284 277087 : break;
5285 125337 : case FLIPADST_DCT:
5286 125337 : load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
5287 376010 : for (int32_t i = 0; i < 2; i++)
5288 250670 : fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5289 125340 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5290 125337 : fdct16x4_avx2(outcoeff256, in, bitrow);
5291 125337 : transpose_4x16_avx2(in, outcoeff256);
5292 125337 : break;
5293 125096 : case DCT_FLIPADST:
5294 125096 : load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
5295 375288 : for (int32_t i = 0; i < 2; i++)
5296 250192 : fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5297 125096 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5298 125096 : fadst16x4_avx2(outcoeff256, in, bitrow);
5299 125095 : transpose_4x16_avx2(in, outcoeff256);
5300 125096 : break;
5301 125734 : case FLIPADST_FLIPADST:
5302 125734 : load_buffer_16x4_avx2(input, in, stride, 1, 1, shift[0]);
5303 377206 : for (int32_t i = 0; i < 2; i++)
5304 251471 : fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5305 125735 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5306 125734 : fadst16x4_avx2(outcoeff256, in, bitrow);
5307 125736 : transpose_4x16_avx2(in, outcoeff256);
5308 125737 : break;
5309 125952 : case ADST_FLIPADST:
5310 125952 : load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
5311 377857 : for (int32_t i = 0; i < 2; i++)
5312 251905 : fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5313 125952 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5314 125953 : fadst16x4_avx2(outcoeff256, in, bitrow);
5315 125952 : transpose_4x16_avx2(in, outcoeff256);
5316 125952 : break;
5317 125992 : case FLIPADST_ADST:
5318 125992 : load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
5319 377976 : for (int32_t i = 0; i < 2; i++)
5320 251984 : fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5321 125992 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5322 125992 : fadst16x4_avx2(outcoeff256, in, bitrow);
5323 125992 : transpose_4x16_avx2(in, outcoeff256);
5324 125992 : break;
5325 204819 : case IDTX:
5326 204819 : load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5327 614459 : for (int32_t i = 0; i < 2; i++)
5328 409638 : fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5329 204821 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5330 204819 : fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
5331 204820 : transpose_4x16_avx2(in, outcoeff256);
5332 204820 : break;
5333 199009 : case V_DCT:
5334 199009 : load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5335 597032 : for (int32_t i = 0; i < 2; i++)
5336 398021 : fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5337 199011 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5338 199011 : fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
5339 199011 : transpose_4x16_avx2(in, outcoeff256);
5340 199011 : break;
5341 197798 : case H_DCT:
5342 197798 : load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5343 593396 : for (int32_t i = 0; i < 2; i++)
5344 395597 : fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5345 197799 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5346 197799 : fdct16x4_avx2(outcoeff256, in, bitrow);
5347 197798 : transpose_4x16_avx2(in, outcoeff256);
5348 197798 : break;
5349 126143 : case V_ADST:
5350 126143 : load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5351 378429 : for (int32_t i = 0; i < 2; i++)
5352 252286 : fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5353 126143 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5354 126143 : fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
5355 126143 : transpose_4x16_avx2(in, outcoeff256);
5356 126143 : break;
5357 125542 : case H_ADST:
5358 125542 : load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5359 376626 : for (int32_t i = 0; i < 2; i++)
5360 251084 : fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5361 125542 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5362 125542 : fadst16x4_avx2(outcoeff256, in, bitrow);
5363 125542 : transpose_4x16_avx2(in, outcoeff256);
5364 125542 : break;
5365 125586 : case V_FLIPADST:
5366 125586 : load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
5367 376758 : for (int32_t i = 0; i < 2; i++)
5368 251172 : fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5369 125586 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5370 125586 : fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
5371 125586 : transpose_4x16_avx2(in, outcoeff256);
5372 125586 : break;
5373 125330 : case H_FLIPADST:
5374 125330 : load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
5375 375992 : for (int32_t i = 0; i < 2; i++)
5376 250662 : fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5377 125330 : col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5378 125331 : fadst16x4_avx2(outcoeff256, in, bitrow);
5379 125331 : transpose_4x16_avx2(in, outcoeff256);
5380 125331 : break;
5381 0 : default: assert(0);
5382 : }
5383 : (void)bd;
5384 5778160 : }
|