1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 : #include <assert.h>
12 : #include <immintrin.h>
13 : #include "EbDefinitions.h"
14 : #include "aom_dsp_rtcd.h"
15 : #include "EbTransforms.h"
16 : #include "av1_inv_txfm_ssse3.h"
17 : #include "txfm_common_avx2.h"
18 :
19 0 : static INLINE void highbd_clamp_epi32(__m256i *x, int32_t bd) {
20 0 : const __m256i zero = _mm256_setzero_si256();
21 0 : const __m256i max = _mm256_set1_epi32((1 << bd) - 1);
22 :
23 0 : *x = _mm256_min_epi32(*x, max);
24 0 : *x = _mm256_max_epi32(*x, zero);
25 0 : }
26 :
27 0 : static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int32_t bd) {
28 0 : const __m256i zero = _mm256_setzero_si256();
29 0 : const __m256i one = _mm256_set1_epi16(1);
30 0 : const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
31 : __m256i clamped, mask;
32 :
33 0 : mask = _mm256_cmpgt_epi16(u, max);
34 0 : clamped = _mm256_andnot_si256(mask, u);
35 0 : mask = _mm256_and_si256(mask, max);
36 0 : clamped = _mm256_or_si256(mask, clamped);
37 0 : mask = _mm256_cmpgt_epi16(clamped, zero);
38 0 : clamped = _mm256_and_si256(clamped, mask);
39 :
40 0 : return clamped;
41 : }
42 :
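// Scalar sketch of what half_btf_avx2() computes in each 32-bit lane, with
// *rounding set to 1 << (bit - 1) as the callers do (products stay in 32 bits,
// matching _mm256_mullo_epi32):
//   out = (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit;
// i.e. the rounded butterfly used throughout these inverse transforms.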
43 0 : static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
44 : const __m256i *w1, const __m256i *n1,
45 : const __m256i *rounding, int32_t bit) {
46 : __m256i x, y;
47 :
48 0 : x = _mm256_mullo_epi32(*w0, *n0);
49 0 : y = _mm256_mullo_epi32(*w1, *n1);
50 0 : x = _mm256_add_epi32(x, y);
51 0 : x = _mm256_add_epi32(x, *rounding);
52 0 : x = _mm256_srai_epi32(x, bit);
53 0 : return x;
54 : }
55 :
56 0 : static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
57 : __m256i *out1, const __m256i *clamp_lo,
58 : const __m256i *clamp_hi) {
59 0 : __m256i a0 = _mm256_add_epi32(in0, in1);
60 0 : __m256i a1 = _mm256_sub_epi32(in0, in1);
61 :
62 0 : a0 = _mm256_max_epi32(a0, *clamp_lo);
63 0 : a0 = _mm256_min_epi32(a0, *clamp_hi);
64 0 : a1 = _mm256_max_epi32(a1, *clamp_lo);
65 0 : a1 = _mm256_min_epi32(a1, *clamp_hi);
66 :
67 0 : *out0 = a0;
68 0 : *out1 = a1;
69 0 : }
70 :
71 0 : static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1,
72 : __m256i *out0, __m256i *out1) {
73 0 : __m256i a0 = _mm256_add_epi32(in0, in1);
74 0 : __m256i a1 = _mm256_sub_epi32(in0, in1);
75 :
76 0 : *out0 = a0;
77 0 : *out1 = a1;
78 0 : }
79 :
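// addsub_shift_avx2() and neg_shift_avx2() below fold the rounding offset
// (1 << shift) >> 1 into the first operand, so both outputs are rounded
// before the arithmetic right shift and then clamped to [clamp_lo, clamp_hi].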
80 0 : static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
81 : __m256i *out0, __m256i *out1,
82 : const __m256i *clamp_lo, const __m256i *clamp_hi,
83 : int32_t shift) {
84 0 : __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
85 0 : __m256i in0_w_offset = _mm256_add_epi32(in0, offset);
86 0 : __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
87 0 : __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
88 :
89 0 : a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
90 0 : a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
91 :
92 0 : a0 = _mm256_max_epi32(a0, *clamp_lo);
93 0 : a0 = _mm256_min_epi32(a0, *clamp_hi);
94 0 : a1 = _mm256_max_epi32(a1, *clamp_lo);
95 0 : a1 = _mm256_min_epi32(a1, *clamp_hi);
96 :
97 0 : *out0 = a0;
98 0 : *out1 = a1;
99 0 : }
100 :
101 0 : static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
102 : __m256i *out1, const __m256i *clamp_lo,
103 : const __m256i *clamp_hi, int32_t shift) {
104 0 : __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
105 0 : __m256i a0 = _mm256_add_epi32(offset, in0);
106 0 : __m256i a1 = _mm256_sub_epi32(offset, in1);
107 :
108 0 : a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
109 0 : a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
110 :
111 0 : a0 = _mm256_max_epi32(a0, *clamp_lo);
112 0 : a0 = _mm256_min_epi32(a0, *clamp_hi);
113 0 : a1 = _mm256_max_epi32(a1, *clamp_lo);
114 0 : a1 = _mm256_min_epi32(a1, *clamp_hi);
115 :
116 0 : *out0 = a0;
117 0 : *out1 = a1;
118 0 : }
119 :
120 0 : static INLINE void idct32_stage4_avx2(
121 : __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
122 : const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
123 : const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
124 : const __m256i *rounding, int32_t bit) {
125 : __m256i temp1, temp2;
126 0 : temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
127 0 : bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
128 0 : bf1[17] = temp1;
129 :
130 0 : temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
131 0 : bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
132 0 : bf1[18] = temp2;
133 :
134 0 : temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
135 0 : bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
136 0 : bf1[21] = temp1;
137 :
138 0 : temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
139 0 : bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
140 0 : bf1[22] = temp2;
141 0 : }
142 :
143 0 : static INLINE void idct32_stage5_avx2(
144 : __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
145 : const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
146 : const __m256i *clamp_hi, const __m256i *rounding, int32_t bit) {
147 : __m256i temp1, temp2;
148 0 : temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
149 0 : bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
150 0 : bf1[9] = temp1;
151 :
152 0 : temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
153 0 : bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
154 0 : bf1[10] = temp2;
155 :
156 0 : addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
157 0 : addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
158 0 : addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
159 0 : addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
160 0 : addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
161 0 : addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
162 0 : addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
163 0 : addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
164 0 : }
165 :
166 0 : static INLINE void idct32_stage6_avx2(
167 : __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
168 : const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
169 : const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
170 : const __m256i *rounding, int32_t bit) {
171 : __m256i temp1, temp2;
172 0 : temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
173 0 : bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
174 0 : bf1[5] = temp1;
175 :
176 0 : addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
177 0 : addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
178 0 : addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
179 0 : addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
180 :
181 0 : temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
182 0 : bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
183 0 : bf1[18] = temp1;
184 0 : temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
185 0 : bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
186 0 : bf1[19] = temp2;
187 0 : temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
188 0 : bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
189 0 : bf1[20] = temp1;
190 0 : temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
191 0 : bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
192 0 : bf1[21] = temp2;
193 0 : }
194 :
195 0 : static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
196 : const __m256i *cospi32,
197 : const __m256i *clamp_lo,
198 : const __m256i *clamp_hi,
199 : const __m256i *rounding, int32_t bit) {
200 : __m256i temp1, temp2;
201 0 : addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
202 0 : addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
203 0 : addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
204 0 : addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
205 :
206 0 : temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
207 0 : bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
208 0 : bf1[10] = temp1;
209 0 : temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
210 0 : bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
211 0 : bf1[11] = temp2;
212 :
213 0 : addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
214 0 : addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
215 0 : addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
216 0 : addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
217 0 : addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
218 0 : addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
219 0 : addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
220 0 : addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
221 0 : }
222 :
223 0 : static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
224 : const __m256i *cospi32,
225 : const __m256i *clamp_lo,
226 : const __m256i *clamp_hi,
227 : const __m256i *rounding, int32_t bit) {
228 : __m256i temp1, temp2;
229 0 : addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
230 0 : addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
231 0 : addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
232 0 : addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
233 0 : addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
234 0 : addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
235 0 : addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
236 0 : addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
237 :
238 0 : temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
239 0 : bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
240 0 : bf1[20] = temp1;
241 0 : temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
242 0 : bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
243 0 : bf1[21] = temp2;
244 0 : temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
245 0 : bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
246 0 : bf1[22] = temp1;
247 0 : temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
248 0 : bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
249 0 : bf1[23] = temp2;
250 0 : }
251 :
252 0 : static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
253 : const int32_t do_cols, const int32_t bd,
254 : const int32_t out_shift,
255 : const int32_t log_range) {
256 0 : if (do_cols) {
257 0 : addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31);
258 0 : addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30);
259 0 : addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29);
260 0 : addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28);
261 0 : addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27);
262 0 : addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26);
263 0 : addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25);
264 0 : addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24);
265 0 : addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23);
266 0 : addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22);
267 0 : addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21);
268 0 : addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20);
269 0 : addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19);
270 0 : addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18);
271 0 : addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17);
272 0 : addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16);
273 : }
274 : else {
275 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
276 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
277 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
278 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
279 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
280 :
281 0 : addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
282 : &clamp_hi_out, out_shift);
283 0 : addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
284 : &clamp_hi_out, out_shift);
285 0 : addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
286 : &clamp_hi_out, out_shift);
287 0 : addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
288 : &clamp_hi_out, out_shift);
289 0 : addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
290 : &clamp_hi_out, out_shift);
291 0 : addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
292 : &clamp_hi_out, out_shift);
293 0 : addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
294 : &clamp_hi_out, out_shift);
295 0 : addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
296 : &clamp_hi_out, out_shift);
297 0 : addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
298 : &clamp_hi_out, out_shift);
299 0 : addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
300 : &clamp_hi_out, out_shift);
301 0 : addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
302 : &clamp_hi_out, out_shift);
303 0 : addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
304 : &clamp_hi_out, out_shift);
305 0 : addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
306 : &clamp_hi_out, out_shift);
307 0 : addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
308 : &clamp_hi_out, out_shift);
309 0 : addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
310 : &clamp_hi_out, out_shift);
311 0 : addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
312 : &clamp_hi_out, out_shift);
313 : }
314 0 : }
315 :
316 0 : static INLINE void idct64_stage8_avx2(
317 : __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
318 : const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
319 : const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
320 : const __m256i *rnding, int32_t bit) {
321 : int32_t i;
322 : __m256i temp1, temp2, temp3, temp4;
323 0 : temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
324 0 : u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
325 0 : u[10] = temp1;
326 0 : temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
327 0 : u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
328 0 : u[11] = temp2;
329 :
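    // The XOR indexing pairs u[16..19] with u[23..20] and u[31..28] with
    // u[24..27] for this stage's add/sub butterflies.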
330 0 : for (i = 16; i < 20; ++i) {
331 0 : addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
332 0 : addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
333 : }
334 :
335 0 : temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
336 0 : temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
337 0 : temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
338 0 : temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
339 0 : u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
340 0 : u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
341 0 : u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
342 0 : u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
343 0 : u[36] = temp1;
344 0 : u[37] = temp2;
345 0 : u[38] = temp3;
346 0 : u[39] = temp4;
347 :
348 0 : temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
349 0 : temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
350 0 : temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
351 0 : temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
352 0 : u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
353 0 : u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
354 0 : u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
355 0 : u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
356 0 : u[40] = temp1;
357 0 : u[41] = temp2;
358 0 : u[42] = temp3;
359 0 : u[43] = temp4;
360 0 : }
361 :
362 0 : static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
363 : const __m256i *cospi32,
364 : const __m256i *clamp_lo,
365 : const __m256i *clamp_hi,
366 : const __m256i *rnding, int32_t bit) {
367 : int32_t i;
368 : __m256i temp1, temp2, temp3, temp4;
369 0 : for (i = 0; i < 8; ++i)
370 0 : addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
371 0 : temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
372 0 : temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
373 0 : temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
374 0 : temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
375 0 : u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
376 0 : u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
377 0 : u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
378 0 : u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
379 0 : u[20] = temp1;
380 0 : u[21] = temp2;
381 0 : u[22] = temp3;
382 0 : u[23] = temp4;
383 0 : for (i = 32; i < 40; i++)
384 0 : addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
385 0 : for (i = 48; i < 56; i++)
386 0 : addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
387 0 : }
388 :
389 0 : static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
390 : const __m256i *cospi32,
391 : const __m256i *clamp_lo,
392 : const __m256i *clamp_hi,
393 : const __m256i *rnding, int32_t bit) {
394 : __m256i temp1, temp2, temp3, temp4;
395 0 : for (int32_t i = 0; i < 16; i++)
396 0 : addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
397 0 : temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
398 0 : temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
399 0 : temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
400 0 : temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
401 0 : u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
402 0 : u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
403 0 : u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
404 0 : u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
405 0 : u[40] = temp1;
406 0 : u[41] = temp2;
407 0 : u[42] = temp3;
408 0 : u[43] = temp4;
409 :
410 0 : temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
411 0 : temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
412 0 : temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
413 0 : temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
414 0 : u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
415 0 : u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
416 0 : u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
417 0 : u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
418 0 : u[44] = temp1;
419 0 : u[45] = temp2;
420 0 : u[46] = temp3;
421 0 : u[47] = temp4;
422 0 : }
423 :
424 0 : static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int32_t do_cols,
425 : int32_t bd, int32_t out_shift,
426 : const int32_t log_range) {
427 0 : if (do_cols) {
428 0 : for (int32_t i = 0; i < 32; i++)
429 0 : addsub_no_clamp_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
430 : }
431 : else {
432 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
433 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
434 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
435 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
436 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
437 :
438 0 : for (int32_t i = 0; i < 32; i++) {
439 0 : addsub_shift_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
440 : &clamp_lo_out, &clamp_hi_out, out_shift);
441 : }
442 : }
443 0 : }
444 :
445 0 : static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
446 : __m256i u0, u1, u2, u3, u4, u5, u6, u7;
447 : __m256i x0, x1;
448 :
449 0 : u0 = _mm256_unpacklo_epi32(in[7], in[6]);
450 0 : u1 = _mm256_unpackhi_epi32(in[7], in[6]);
451 :
452 0 : u2 = _mm256_unpacklo_epi32(in[5], in[4]);
453 0 : u3 = _mm256_unpackhi_epi32(in[5], in[4]);
454 :
455 0 : u4 = _mm256_unpacklo_epi32(in[3], in[2]);
456 0 : u5 = _mm256_unpackhi_epi32(in[3], in[2]);
457 :
458 0 : u6 = _mm256_unpacklo_epi32(in[1], in[0]);
459 0 : u7 = _mm256_unpackhi_epi32(in[1], in[0]);
460 :
461 0 : x0 = _mm256_unpacklo_epi64(u0, u2);
462 0 : x1 = _mm256_unpacklo_epi64(u4, u6);
463 0 : out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
464 0 : out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
465 :
466 0 : x0 = _mm256_unpackhi_epi64(u0, u2);
467 0 : x1 = _mm256_unpackhi_epi64(u4, u6);
468 0 : out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
469 0 : out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
470 :
471 0 : x0 = _mm256_unpacklo_epi64(u1, u3);
472 0 : x1 = _mm256_unpacklo_epi64(u5, u7);
473 0 : out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
474 0 : out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
475 :
476 0 : x0 = _mm256_unpackhi_epi64(u1, u3);
477 0 : x1 = _mm256_unpackhi_epi64(u5, u7);
478 0 : out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
479 0 : out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
480 0 : }
481 :
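// Reconstruction helpers: per pixel they compute
//   recon = clamp(pred + residual, 0, (1 << bd) - 1)
// and repack the result to 16-bit samples.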
482 0 : static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
483 : const int32_t bd) {
484 0 : __m256i x0 = pred;
485 0 : x0 = _mm256_add_epi32(res, x0);
486 0 : x0 = _mm256_packus_epi32(x0, x0);
487 0 : x0 = _mm256_permute4x64_epi64(x0, 0xd8);
488 0 : x0 = highbd_clamp_epi16_avx2(x0, bd);
489 0 : return x0;
490 : }
491 :
492 0 : static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in,
493 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
494 : int32_t flipud, int32_t height, const int32_t bd) {
495 0 : int32_t j = flipud ? (height - 1) : 0;
496 : __m128i temp;
497 0 : const int32_t step = flipud ? -1 : 1;
498 0 : for (int32_t i = 0; i < height; ++i, j += step) {
499 0 : temp = _mm_loadu_si128((__m128i const *)(output_r + i * stride_r));
500 0 : __m256i v = _mm256_cvtepi16_epi32(temp);
501 0 : __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
502 0 : __m128i u1 = _mm256_castsi256_si128(u);
503 0 : _mm_storeu_si128((__m128i *)(output_w + i * stride_w), u1);
504 : }
505 0 : }
506 :
507 0 : static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
508 : __m256i res0, __m256i res1,
509 : const int32_t bd) {
510 0 : __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
511 0 : __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
512 :
513 0 : x0 = _mm256_add_epi32(res0, x0);
514 0 : x1 = _mm256_add_epi32(res1, x1);
515 0 : x0 = _mm256_packus_epi32(x0, x1);
516 0 : x0 = _mm256_permute4x64_epi64(x0, 0xd8);
517 0 : x0 = highbd_clamp_epi16_avx2(x0, bd);
518 0 : return x0;
519 : }
520 :
521 0 : static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in,
522 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
523 : int32_t flipud, int32_t height, const int32_t bd) {
524 0 : int32_t j = flipud ? (height - 1) : 0;
525 0 : const int32_t step = flipud ? -1 : 1;
526 0 : for (int32_t i = 0; i < height; ++i, j += step) {
527 0 : __m256i v = _mm256_loadu_si256((__m256i const *)(output_r + i * stride_r));
528 0 : __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
529 :
530 0 : _mm256_storeu_si256((__m256i *)(output_w + i * stride_w), u);
531 : }
532 0 : }
533 :
534 0 : static INLINE void load_buffer_4x4(const int32_t *coeff, __m256i *in) {
535 0 : in[0] = _mm256_loadu_si256((const __m256i *)coeff);
536 0 : in[1] = _mm256_loadu_si256((const __m256i *)(coeff + 8));
537 0 : }
538 :
539 0 : static INLINE void write_buffer_4x4(__m256i *in,
540 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
541 : int32_t fliplr, int32_t flipud, int32_t bd) {
542 : __m256i u0, x0, x1, v0, v1;
543 0 : const __m256i zero = _mm256_setzero_si256();
544 :
545 0 : if (fliplr) {
546 0 : in[0] = _mm256_shuffle_epi32(in[0], 0x1B);
547 0 : in[1] = _mm256_shuffle_epi32(in[1], 0x1B);
548 : }
549 :
550 0 : if (flipud) {
551 0 : u0 = _mm256_set_epi64x(*(uint64_t*)(output_r + 0 * stride_r),
552 0 : *(uint64_t*)(output_r + 2 * stride_r),
553 0 : *(uint64_t*)(output_r + 1 * stride_r),
554 0 : *(uint64_t*)(output_r + 3 * stride_r));
555 : }
556 : else {
557 : // Load 64bits in order ACBD
558 0 : u0 = _mm256_set_epi64x(*(uint64_t*)(output_r + 3 * stride_r),
559 0 : *(uint64_t*)(output_r + 1 * stride_r),
560 0 : *(uint64_t*)(output_r + 2 * stride_r),
561 0 : *(uint64_t*)(output_r + 0 * stride_r));
562 : }
563 :
564 : // Unpack and Swap 128bits from ACBD to ABCD
565 0 : x0 = _mm256_unpacklo_epi16(u0, zero);
566 0 : x1 = _mm256_unpackhi_epi16(u0, zero);
567 :
568 0 : v0 = _mm256_add_epi32(in[0], x0);
569 0 : v1 = _mm256_add_epi32(in[1], x1);
570 :
571 0 : highbd_clamp_epi32(&v0, bd);
572 0 : highbd_clamp_epi32(&v1, bd);
573 :
574 : // Pack and Swap 128bits from ABCD to ACBD
575 0 : v0 = _mm256_packus_epi32(v0, v1);
576 :
577 0 : if (flipud) {
578 0 : _mm_storel_epi64((__m128i *)(output_w + 3 * stride_w),
579 : _mm256_castsi256_si128(v0));
580 0 : _mm_storel_epi64((__m128i *)(output_w + 2 * stride_w),
581 0 : _mm256_extractf128_si256(v0, 0x1));
582 :     // Move up memory 64 bits
583 0 : v0 = _mm256_permute4x64_epi64(v0, 1 + (3 << 4));
584 0 : _mm_storel_epi64((__m128i *)(output_w + 1 * stride_w),
585 : _mm256_castsi256_si128(v0));
586 0 : _mm_storel_epi64((__m128i *)(output_w + 0 * stride_w),
587 0 : _mm256_extractf128_si256(v0, 0x1));
588 : }
589 : else {
590 : // Store in order from ACBD to ABCD
591 0 : _mm_storel_epi64((__m128i *)(output_w + 0 * stride_w),
592 : _mm256_castsi256_si128(v0));
593 0 : _mm_storel_epi64((__m128i *)(output_w + 1 * stride_w),
594 0 : _mm256_extractf128_si256(v0, 0x1));
595 :     // Move up memory 64 bits
596 0 : v0 = _mm256_permute4x64_epi64(v0, 1 + (3 << 4));
597 0 : _mm_storel_epi64((__m128i *)(output_w + 2 * stride_w),
598 : _mm256_castsi256_si128(v0));
599 0 : _mm_storel_epi64((__m128i *)(output_w + 3 * stride_w),
600 0 : _mm256_extractf128_si256(v0, 0x1));
601 : }
602 0 : }
603 :
604 0 : static INLINE void round_shift_4x4(__m256i *in, int32_t shift) {
605 0 : __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
606 :
607 0 : in[0] = _mm256_add_epi32(in[0], rnding);
608 0 : in[0] = _mm256_srai_epi32(in[0], shift);
609 0 : in[1] = _mm256_add_epi32(in[1], rnding);
610 0 : in[1] = _mm256_srai_epi32(in[1], shift);
611 0 : }
612 :
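// Per 32-bit lane, iidentity4_and_round_shift_avx2() evaluates in one pass
//   out = (x * NewSqrt2 + (1 << (NewSqrt2Bits - 1))
//          + (shift ? 1 << (shift + NewSqrt2Bits - 1) : 0)) >> (NewSqrt2Bits + shift);
// i.e. the identity4 scaling round_shift(x * NewSqrt2, NewSqrt2Bits) merged
// with the subsequent round_shift(., shift).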
613 0 : static INLINE void iidentity4_and_round_shift_avx2(__m256i *input, int32_t shift)
614 : {
615 :     // The input takes 18 bits, so it can be multiplied by NewSqrt2 in 32-bit space.
616 :     // round_shift(NewSqrt2Bits) and the following round_shift(shift) are done in one pass.
617 0 : const __m256i scalar = _mm256_set1_epi32(NewSqrt2);
618 0 : const __m256i rnding = _mm256_set1_epi32((1 << (NewSqrt2Bits - 1)) +
619 0 : (!!(shift) << (shift + NewSqrt2Bits - 1)));
620 :
621 0 : for (int32_t i = 0; i < 2; i++) {
622 0 : input[i] = _mm256_mullo_epi32(input[i], scalar);
623 0 : input[i] = _mm256_add_epi32(input[i], rnding);
624 0 : input[i] = _mm256_srai_epi32(input[i], NewSqrt2Bits + shift);
625 : }
626 0 : }
627 :
628 0 : static INLINE void idct4_row_avx2(__m256i *in, int8_t cos_bit) {
629 0 : const int32_t *cospi = cospi_arr(cos_bit);
630 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
631 0 : const __m256i cospi32x16 = _mm256_blend_epi32(cospi32,
632 : _mm256_set1_epi32(cospi[16]), 0xAA);
633 0 : const __m256i cospi32x48 = _mm256_blend_epi32(cospi32,
634 : _mm256_set1_epi32(cospi[48]), 0xAA);
635 0 : const __m256i rnding = _mm256_set1_epi32((1 << (cos_bit - 1)));
636 0 : const __m256i minplus = _mm256_blend_epi32(_mm256_set1_epi32(-1),
637 : _mm256_set1_epi32(1), 0xAA);
638 : __m256i v0, v1, x, y;
639 : __m256i step[2];
640 :
641 0 : v0 = _mm256_unpacklo_epi64(in[0], in[1]);
642 0 : v1 = _mm256_unpackhi_epi64(in[0], in[1]);
643 :
644 0 : x = _mm256_mullo_epi32(cospi32x16, v0);
645 0 : y = _mm256_mullo_epi32(cospi32x48, v1);
646 0 : step[0] = _mm256_add_epi32(x, y);
647 :
648 0 : x = _mm256_mullo_epi32(cospi32x48, v0);
649 0 : y = _mm256_mullo_epi32(cospi32x16, v1);
650 0 : step[1] = _mm256_sub_epi32(x, y);
651 :
652 0 : step[0] = _mm256_add_epi32(step[0], rnding);
653 0 : step[0] = _mm256_srai_epi32(step[0], cos_bit);
654 0 : step[1] = _mm256_add_epi32(step[1], rnding);
655 0 : step[1] = _mm256_srai_epi32(step[1], cos_bit);
656 :
657 0 : v0 = _mm256_shuffle_epi32(step[0], 0xB1);
658 0 : v1 = _mm256_shuffle_epi32(step[1], 0xB1);
659 :
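    // minplus = (-1, +1, -1, +1, ...): the 0xB1 shuffle above swapped each pair
    // of lanes, so step[] + minplus * swapped yields (a - b, a + b) per pair,
    // carrying out the final idct4 add/sub stage in a single register.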
660 0 : v0 = _mm256_mullo_epi32(minplus, v0);
661 0 : v1 = _mm256_mullo_epi32(minplus, v1);
662 :
663 0 : v0 = _mm256_add_epi32(v0, step[0]);
664 0 : v1 = _mm256_add_epi32(v1, step[1]);
665 :
666 0 : v0 = _mm256_shuffle_epi32(v0, 0x2D);
667 0 : v1 = _mm256_shuffle_epi32(v1, 0x87);
668 :
669 0 : in[0] = _mm256_blend_epi32(v0, v1, 0x66);
670 :
671 0 : v0 = _mm256_blend_epi32(v0, v1, 0x99);
672 0 : in[1] = _mm256_shuffle_epi32(v0, 0xB1);
673 0 : }
674 :
675 0 : static INLINE void idct4_col_avx2(__m256i *in, int8_t cos_bit) {
676 0 : const int32_t *cospi = cospi_arr(cos_bit);
677 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
678 0 : const __m256i cospi32x16 =
679 0 : _mm256_blend_epi32(_mm256_set1_epi32(cospi[16]), cospi32, 0x0F);
680 0 : const __m256i cospi32x48 =
681 0 : _mm256_blend_epi32(_mm256_set1_epi32(cospi[48]), cospi32, 0x0F);
682 0 : const __m256i rnding = _mm256_set1_epi32((1 << (cos_bit - 1)));
683 : __m256i x, y;
684 : __m256i step[2];
685 :
686 0 : x = _mm256_mullo_epi32(cospi32x16, in[0]);
687 0 : y = _mm256_mullo_epi32(cospi32x48, in[1]);
688 0 : step[0] = _mm256_add_epi32(x, y);
689 :
690 0 : x = _mm256_mullo_epi32(cospi32x48, in[0]);
691 0 : y = _mm256_mullo_epi32(cospi32x16, in[1]);
692 0 : step[1] = _mm256_sub_epi32(x, y);
693 :
694 0 : step[0] = _mm256_add_epi32(step[0], rnding);
695 0 : step[0] = _mm256_srai_epi32(step[0], cos_bit);
696 0 : step[1] = _mm256_add_epi32(step[1], rnding);
697 0 : step[1] = _mm256_srai_epi32(step[1], cos_bit);
698 :
699 0 : x = _mm256_permute2x128_si256(step[0], step[1], 0x20);
700 0 : y = _mm256_permute2x128_si256(step[0], step[1], 0x31);
701 0 : in[0] = _mm256_add_epi32(x, y);
702 :
703 0 : x = _mm256_permute2x128_si256(step[0], step[1], 0x02);
704 0 : y = _mm256_permute2x128_si256(step[0], step[1], 0x13);
705 0 : in[1] = _mm256_sub_epi32(x, y);
706 0 : }
707 :
708 0 : static INLINE void iadst4_row_avx2(__m256i *in, int8_t cos_bit) {
709 0 : const int32_t bit = cos_bit;
710 0 : const int32_t *sinpi = sinpi_arr(bit);
711 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
712 0 : const __m128i sinpi1 = _mm_set1_epi32((int32_t)sinpi[1]);
713 0 : const __m128i sinpi2 = _mm_set1_epi32((int32_t)sinpi[2]);
714 0 : const __m128i sinpi3 = _mm_set1_epi32((int32_t)sinpi[3]);
715 0 : const __m128i sinpi4 = _mm_set1_epi32((int32_t)sinpi[4]);
716 : __m128i t;
717 : __m128i s0, s1, s2, s3, s4, s5, s6, s7;
718 : __m128i x0, x1, x2, x3;
719 : __m128i u0, u1, u2, u3;
720 : __m256i y0;
721 :
722 0 : u0 = _mm256_extractf128_si256(in[0], 0x1);
723 0 : u1 = _mm256_extractf128_si256(in[1], 0x1);
724 :
725 0 : s0 = _mm_unpacklo_epi32(_mm256_castsi256_si128(in[0]), u0);
726 0 : s1 = _mm_unpackhi_epi32(_mm256_castsi256_si128(in[0]), u0);
727 0 : s2 = _mm_unpacklo_epi32(_mm256_castsi256_si128(in[1]), u1);
728 0 : s3 = _mm_unpackhi_epi32(_mm256_castsi256_si128(in[1]), u1);
729 :
730 0 : x0 = _mm_unpacklo_epi64(s0, s2);
731 0 : x1 = _mm_unpackhi_epi64(s0, s2);
732 0 : x2 = _mm_unpacklo_epi64(s1, s3);
733 0 : x3 = _mm_unpackhi_epi64(s1, s3);
734 :
735 0 : s0 = _mm_mullo_epi32(x0, sinpi1);
736 0 : s1 = _mm_mullo_epi32(x0, sinpi2);
737 0 : s2 = _mm_mullo_epi32(x1, sinpi3);
738 0 : s3 = _mm_mullo_epi32(x2, sinpi4);
739 0 : s4 = _mm_mullo_epi32(x2, sinpi1);
740 0 : s5 = _mm_mullo_epi32(x3, sinpi2);
741 0 : s6 = _mm_mullo_epi32(x3, sinpi4);
742 0 : t = _mm_sub_epi32(x0, x2);
743 0 : s7 = _mm_add_epi32(t, x3);
744 :
745 0 : t = _mm_add_epi32(s0, s3);
746 0 : s0 = _mm_add_epi32(t, s5);
747 0 : t = _mm_sub_epi32(s1, s4);
748 0 : s1 = _mm_sub_epi32(t, s6);
749 0 : u2 = _mm_mullo_epi32(s7, sinpi3);
750 :
751 0 : u0 = _mm_add_epi32(s0, s2);
752 0 : u1 = _mm_add_epi32(s1, s2);
753 0 : t = _mm_add_epi32(s0, s1);
754 0 : u3 = _mm_sub_epi32(t, s2);
755 :
756 0 : s0 = _mm_unpacklo_epi32(u0, u1);
757 0 : s1 = _mm_unpackhi_epi32(u0, u1);
758 0 : s2 = _mm_unpacklo_epi32(u2, u3);
759 0 : s3 = _mm_unpackhi_epi32(u2, u3);
760 :
761 0 : u0 = _mm_unpacklo_epi64(s0, s2);
762 0 : u1 = _mm_unpackhi_epi64(s0, s2);
763 0 : u2 = _mm_unpacklo_epi64(s1, s3);
764 0 : u3 = _mm_unpackhi_epi64(s1, s3);
765 :
766 0 : y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(u0), (u1), 0x1);
767 0 : y0 = _mm256_add_epi32(y0, rnding);
768 0 : in[0] = _mm256_srai_epi32(y0, bit);
769 :
770 0 : y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(u2), (u3), 0x1);
771 0 : y0 = _mm256_add_epi32(y0, rnding);
772 0 : in[1] = _mm256_srai_epi32(y0, bit);
773 0 : }
774 :
775 0 : static INLINE void iadst4_col_avx2(__m256i *in, int8_t cos_bit) {
776 0 : const int32_t bit = cos_bit;
777 0 : const int32_t *sinpi = sinpi_arr(bit);
778 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
779 0 : const __m128i sinpi1 = _mm_set1_epi32((int32_t)sinpi[1]);
780 0 : const __m128i sinpi2 = _mm_set1_epi32((int32_t)sinpi[2]);
781 0 : const __m128i sinpi3 = _mm_set1_epi32((int32_t)sinpi[3]);
782 0 : const __m128i sinpi4 = _mm_set1_epi32((int32_t)sinpi[4]);
783 : __m128i t;
784 : __m128i s0, s1, s2, s3, s4, s5, s6, s7;
785 : __m128i x0, x1;
786 : __m128i u0, u1, u3;
787 : __m256i y0;
788 :
789 0 : x0 = _mm256_extractf128_si256(in[0], 0x1);
790 0 : x1 = _mm256_extractf128_si256(in[1], 0x1);
791 :
792 0 : s0 = _mm_mullo_epi32(_mm256_castsi256_si128(in[0]), sinpi1);
793 0 : s1 = _mm_mullo_epi32(_mm256_castsi256_si128(in[0]), sinpi2);
794 0 : s2 = _mm_mullo_epi32(x0, sinpi3);
795 0 : s3 = _mm_mullo_epi32(_mm256_castsi256_si128(in[1]), sinpi4);
796 0 : s4 = _mm_mullo_epi32(_mm256_castsi256_si128(in[1]), sinpi1);
797 0 : s5 = _mm_mullo_epi32(x1, sinpi2);
798 0 : s6 = _mm_mullo_epi32(x1, sinpi4);
799 0 : t = _mm_sub_epi32(_mm256_castsi256_si128(in[0]),
800 0 : _mm256_castsi256_si128(in[1]));
801 0 : s7 = _mm_add_epi32(t, x1);
802 :
803 0 : t = _mm_add_epi32(s0, s3);
804 0 : s0 = _mm_add_epi32(t, s5);
805 0 : t = _mm_sub_epi32(s1, s4);
806 0 : s1 = _mm_sub_epi32(t, s6);
807 0 : s3 = _mm_mullo_epi32(s7, sinpi3);
808 :
809 0 : u0 = _mm_add_epi32(s0, s2);
810 0 : u1 = _mm_add_epi32(s1, s2);
811 :
812 0 : t = _mm_add_epi32(s0, s1);
813 0 : u3 = _mm_sub_epi32(t, s2);
814 :
815 0 : y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(u0), (u1), 0x1);
816 0 : y0 = _mm256_add_epi32(y0, rnding);
817 0 : in[0] = _mm256_srai_epi32(y0, bit);
818 :
819 0 : y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(s3), (u3), 0x1);
820 0 : y0 = _mm256_add_epi32(y0, rnding);
821 0 : in[1] = _mm256_srai_epi32(y0, bit);
822 0 : }
823 :
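// This AVX2 path specializes only the transform types with an identity
// component (IDTX, V_DCT/H_DCT, V_ADST/H_ADST, V_FLIPADST/H_FLIPADST);
// all other tx_type values fall back to the SSE4.1 implementation via the
// default case below.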
824 0 : void eb_av1_inv_txfm2d_add_4x4_avx2(const int32_t *input,
825 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
826 : TxType tx_type, int32_t bd) {
827 : __m256i in[2];
828 0 : const int8_t *shift = eb_inv_txfm_shift_ls[TX_4X4];
829 0 : const int32_t txw_idx = get_txw_idx(TX_4X4);
830 0 : const int32_t txh_idx = get_txh_idx(TX_4X4);
831 :
832 0 : switch (tx_type) {
833 0 : case IDTX:
834 0 : load_buffer_4x4(input, in);
835 0 : iidentity4_and_round_shift_avx2(in, -shift[0]);
836 0 : iidentity4_and_round_shift_avx2(in, -shift[1]);
837 0 : write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
838 0 : break;
839 0 : case V_DCT:
840 0 : load_buffer_4x4(input, in);
841 0 : iidentity4_and_round_shift_avx2(in, -shift[0]);
842 0 : idct4_col_avx2(in, inv_cos_bit_col[txw_idx][txh_idx]);
843 0 : round_shift_4x4(in, -shift[1]);
844 0 : write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
845 0 : break;
846 0 : case H_DCT:
847 0 : load_buffer_4x4(input, in);
848 0 : idct4_row_avx2(in, inv_cos_bit_row[txw_idx][txh_idx]);
849 0 : iidentity4_and_round_shift_avx2(in, -shift[1]);
850 0 : write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
851 0 : break;
852 0 : case V_ADST:
853 0 : load_buffer_4x4(input, in);
854 0 : iidentity4_and_round_shift_avx2(in, -shift[0]);
855 0 : iadst4_col_avx2(in, inv_cos_bit_col[txw_idx][txh_idx]);
856 0 : round_shift_4x4(in, -shift[1]);
857 0 : write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
858 0 : break;
859 0 : case H_ADST:
860 0 : load_buffer_4x4(input, in);
861 0 : iadst4_row_avx2(in, inv_cos_bit_row[txw_idx][txh_idx]);
862 0 : iidentity4_and_round_shift_avx2(in, -shift[1]);
863 0 : write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
864 0 : break;
865 0 : case V_FLIPADST:
866 0 : load_buffer_4x4(input, in);
867 0 : iidentity4_and_round_shift_avx2(in, -shift[0]);
868 0 : iadst4_col_avx2(in, inv_cos_bit_col[txw_idx][txh_idx]);
869 0 : round_shift_4x4(in, -shift[1]);
870 0 : write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 1, bd);
871 0 : break;
872 0 : case H_FLIPADST:
873 0 : load_buffer_4x4(input, in);
874 0 : iadst4_row_avx2(in, inv_cos_bit_row[txw_idx][txh_idx]);
875 0 : iidentity4_and_round_shift_avx2(in, -shift[1]);
876 0 : write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 1, 0, bd);
877 0 : break;
878 0 : default:
879 0 : eb_av1_inv_txfm2d_add_4x4_sse4_1(input,
880 : output_r, stride_r, output_w, stride_w, tx_type, bd);
881 0 : break;
882 : }
883 0 : }
884 :
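// TRANSPOSE_4X4_AVX2 transposes a 4x4 block of 32-bit elements within each
// 128-bit lane; the _mm256_permute2x128_si256() calls in its callers then
// stitch the lanes together into the full 8x8 / 16x16 transposes.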
885 : #define TRANSPOSE_4X4_AVX2(x0, x1, x2, x3, y0, y1, y2, y3) \
886 : do { \
887 : __m256i u0, u1, u2, u3; \
888 : u0 = _mm256_unpacklo_epi32(x0, x1); \
889 : u1 = _mm256_unpackhi_epi32(x0, x1); \
890 : u2 = _mm256_unpacklo_epi32(x2, x3); \
891 : u3 = _mm256_unpackhi_epi32(x2, x3); \
892 : y0 = _mm256_unpacklo_epi64(u0, u2); \
893 : y1 = _mm256_unpackhi_epi64(u0, u2); \
894 : y2 = _mm256_unpacklo_epi64(u1, u3); \
895 : y3 = _mm256_unpackhi_epi64(u1, u3); \
896 : } while (0)
897 :
898 0 : static INLINE void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
899 : __m256i out1[8];
900 0 : TRANSPOSE_4X4_AVX2(
901 : in[0], in[1], in[2], in[3], out1[0], out1[1], out1[4], out1[5]);
902 0 : TRANSPOSE_4X4_AVX2(
903 : in[4], in[5], in[6], in[7], out1[2], out1[3], out1[6], out1[7]);
904 0 : out[0] = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
905 0 : out[1] = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
906 0 : out[2] = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
907 0 : out[3] = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
908 0 : out[4] = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
909 0 : out[5] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
910 0 : out[6] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
911 0 : out[7] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
912 0 : }
913 :
914 0 : static INLINE void transpose_16x16_avx2(const __m256i *in, __m256i *out) {
915 : __m256i temp[32];
916 0 : TRANSPOSE_4X4_AVX2(
917 : in[0], in[2], in[4], in[6], temp[0], temp[2], temp[4], temp[6]);
918 0 : TRANSPOSE_4X4_AVX2(
919 : in[8], in[10], in[12], in[14], temp[17], temp[19], temp[21], temp[23]);
920 0 : TRANSPOSE_4X4_AVX2(
921 : in[1], in[3], in[5], in[7], temp[16], temp[18], temp[20], temp[22]);
922 0 : TRANSPOSE_4X4_AVX2(
923 : in[9], in[11], in[13], in[15], temp[25], temp[27], temp[29], temp[31]);
924 0 : TRANSPOSE_4X4_AVX2(
925 : in[16], in[18], in[20], in[22], temp[1], temp[3], temp[5], temp[7]);
926 0 : TRANSPOSE_4X4_AVX2(
927 : in[24], in[26], in[28], in[30], temp[9], temp[11], temp[13], temp[15]);
928 0 : TRANSPOSE_4X4_AVX2(
929 : in[17], in[19], in[21], in[23], temp[8], temp[10], temp[12], temp[14]);
930 0 : TRANSPOSE_4X4_AVX2(
931 : in[25], in[27], in[29], in[31], temp[24], temp[26], temp[28], temp[30]);
932 :
933 0 : out[0] = _mm256_permute2x128_si256(temp[0], temp[17], 0x20);
934 0 : out[1] = _mm256_permute2x128_si256(temp[1], temp[9], 0x20);
935 0 : out[2] = _mm256_permute2x128_si256(temp[2], temp[19], 0x20);
936 0 : out[3] = _mm256_permute2x128_si256(temp[3], temp[11], 0x20);
937 0 : out[4] = _mm256_permute2x128_si256(temp[4], temp[21], 0x20);
938 0 : out[5] = _mm256_permute2x128_si256(temp[5], temp[13], 0x20);
939 0 : out[6] = _mm256_permute2x128_si256(temp[6], temp[23], 0x20);
940 0 : out[7] = _mm256_permute2x128_si256(temp[7], temp[15], 0x20);
941 0 : out[8] = _mm256_permute2x128_si256(temp[0], temp[17], 0x31);
942 0 : out[9] = _mm256_permute2x128_si256(temp[1], temp[9], 0x31);
943 0 : out[10] = _mm256_permute2x128_si256(temp[2], temp[19], 0x31);
944 0 : out[11] = _mm256_permute2x128_si256(temp[3], temp[11], 0x31);
945 0 : out[12] = _mm256_permute2x128_si256(temp[4], temp[21], 0x31);
946 0 : out[13] = _mm256_permute2x128_si256(temp[5], temp[13], 0x31);
947 0 : out[14] = _mm256_permute2x128_si256(temp[6], temp[23], 0x31);
948 0 : out[15] = _mm256_permute2x128_si256(temp[7], temp[15], 0x31);
949 0 : out[16] = _mm256_permute2x128_si256(temp[16], temp[25], 0x20);
950 0 : out[17] = _mm256_permute2x128_si256(temp[8], temp[24], 0x20);
951 0 : out[18] = _mm256_permute2x128_si256(temp[18], temp[27], 0x20);
952 0 : out[19] = _mm256_permute2x128_si256(temp[10], temp[26], 0x20);
953 0 : out[20] = _mm256_permute2x128_si256(temp[20], temp[29], 0x20);
954 0 : out[21] = _mm256_permute2x128_si256(temp[12], temp[28], 0x20);
955 0 : out[22] = _mm256_permute2x128_si256(temp[22], temp[31], 0x20);
956 0 : out[23] = _mm256_permute2x128_si256(temp[14], temp[30], 0x20);
957 0 : out[24] = _mm256_permute2x128_si256(temp[16], temp[25], 0x31);
958 0 : out[25] = _mm256_permute2x128_si256(temp[8], temp[24], 0x31);
959 0 : out[26] = _mm256_permute2x128_si256(temp[18], temp[27], 0x31);
960 0 : out[27] = _mm256_permute2x128_si256(temp[10], temp[26], 0x31);
961 0 : out[28] = _mm256_permute2x128_si256(temp[20], temp[29], 0x31);
962 0 : out[29] = _mm256_permute2x128_si256(temp[12], temp[28], 0x31);
963 0 : out[30] = _mm256_permute2x128_si256(temp[22], temp[31], 0x31);
964 0 : out[31] = _mm256_permute2x128_si256(temp[14], temp[30], 0x31);
965 0 : }
966 :
967 0 : static void load_buffer_8x8(const int32_t *coeff, __m256i *in) {
968 : int32_t i;
969 0 : for (i = 0; i < 8; ++i) {
970 0 : in[i] = _mm256_loadu_si256((const __m256i *)coeff);
971 0 : coeff += 8;
972 : }
973 0 : }
974 :
975 0 : static INLINE void write_buffer_8x8(__m256i *in,
976 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
977 : int32_t fliplr, int32_t flipud, int32_t bd) {
978 : __m256i u0, x0, x1, v0, v1;
979 0 : const __m256i zero = _mm256_setzero_si256();
980 0 : int32_t i = 0;
981 0 : int32_t step = 1;
982 :
983 0 : if (flipud) {
984 0 : i = 7;
985 0 : step = -1;
986 : }
987 :
988 0 : while (i < 8 && i > -1) {
989 0 : u0 = _mm256_inserti128_si256(_mm256_castsi128_si256(
990 : _mm_loadu_si128((__m128i*)(output_r))),
991 : _mm_loadu_si128((__m128i*)(output_r + stride_r)), 1);
992 :
993 : // Swap 64bits from ABCD to ACBD
994 0 : u0 = _mm256_permute4x64_epi64(u0, 0xD8);
995 :
996 : // Unpack and Swap 128bits from ACBD to ABCD
997 0 : x0 = _mm256_unpacklo_epi16(u0, zero);
998 0 : x1 = _mm256_unpackhi_epi16(u0, zero);
999 :
1000 0 : if (fliplr) {
1001 0 : v0 = _mm256_permute4x64_epi64(in[i], 0x1B);
1002 0 : v0 = _mm256_shuffle_epi32(v0, 0xB1);
1003 0 : v0 = _mm256_add_epi32(v0, x0);
1004 0 : i += step;
1005 0 : v1 = _mm256_permute4x64_epi64(in[i], 0x1B);
1006 0 : v1 = _mm256_shuffle_epi32(v1, 0xB1);
1007 0 : v1 = _mm256_add_epi32(v1, x1);
1008 0 : i += step;
1009 : }
1010 : else {
1011 0 : v0 = _mm256_add_epi32(in[i], x0);
1012 0 : i += step;
1013 0 : v1 = _mm256_add_epi32(in[i], x1);
1014 0 : i += step;
1015 : }
1016 :
1017 0 : highbd_clamp_epi32(&v0, bd);
1018 0 : highbd_clamp_epi32(&v1, bd);
1019 :
1020 : // Pack and Swap 128bits from ABCD to ACBD
1021 0 : v0 = _mm256_packus_epi32(v0, v1);
1022 : // Swap 64bits from ACBD to ABCD
1023 0 : v0 = _mm256_permute4x64_epi64(v0, 0xD8);
1024 :
1025 0 : _mm_storeu_si128((__m128i *)output_w, _mm256_castsi256_si128(v0));
1026 0 : _mm_storeu_si128((__m128i *)(output_w + stride_w),
1027 0 : _mm256_extractf128_si256(v0, 0x1));
1028 :
1029 0 : output_r += 2 * stride_r;
1030 0 : output_w += 2 * stride_w;
1031 : }
1032 0 : }
1033 :
1034 0 : static INLINE void round_shift_8x8(__m256i *in, int32_t shift) {
1035 0 : __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
1036 0 : int32_t i = 0;
1037 :
1038 0 : while (i < 8) {
1039 0 : in[i] = _mm256_add_epi32(in[i], rnding);
1040 0 : in[i] = _mm256_srai_epi32(in[i], shift);
1041 0 : i++;
1042 : }
1043 0 : }
1044 :
1045 0 : static INLINE void round_shift_8x8_double(__m256i *in, int32_t first, int32_t second) {
1046 0 : __m256i rnding = _mm256_set1_epi32(
1047 0 : (1 << (first - 1)) + (1 << (first + second - 1)));
1048 0 : int32_t i = 0;
1049 :
1050 0 : while (i < 8) {
1051 0 : in[i] = _mm256_add_epi32(in[i], rnding);
1052 0 : in[i] = _mm256_srai_epi32(in[i], first + second);
1053 0 : i++;
1054 : }
1055 0 : }
1056 :
1057 0 : static INLINE void idct8_col_avx2(__m256i *in, __m256i *out, int32_t bit) {
1058 0 : const int32_t *cospi = cospi_arr(bit);
1059 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1060 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
1061 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1062 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1063 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1064 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1065 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1066 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
1067 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1068 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1069 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1070 0 : const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
1071 : __m256i tmp[8], tmp2[8];
1072 :
1073 : //stage 1
1074 :
1075 : //stage 2
1076 0 : tmp[4] = half_btf_avx2(&cospi56, &in[1], &cospim8, &in[7], &rounding, bit);
1077 0 : tmp[5] = half_btf_avx2(&cospi24, &in[5], &cospim40, &in[3], &rounding, bit);
1078 0 : tmp[6] = half_btf_avx2(&cospi40, &in[5], &cospi24, &in[3], &rounding, bit);
1079 0 : tmp[7] = half_btf_avx2(&cospi8, &in[1], &cospi56, &in[7], &rounding, bit);
1080 :
1081 : //stage 3
1082 0 : tmp2[0] = half_btf_avx2(&cospi32, &in[0], &cospi32, &in[4], &rounding, bit);
1083 0 : tmp2[1] = half_btf_avx2(&cospi32, &in[0], &cospim32, &in[4], &rounding, bit);
1084 0 : tmp2[2] = half_btf_avx2(&cospi48, &in[2], &cospim16, &in[6], &rounding, bit);
1085 0 : tmp2[3] = half_btf_avx2(&cospi16, &in[2], &cospi48, &in[6], &rounding, bit);
1086 0 : tmp2[4] = _mm256_add_epi32(tmp[4], tmp[5]);
1087 0 : tmp2[5] = _mm256_sub_epi32(tmp[4], tmp[5]);
1088 0 : tmp2[6] = _mm256_sub_epi32(tmp[7], tmp[6]);
1089 0 : tmp2[7] = _mm256_add_epi32(tmp[6], tmp[7]);
1090 :
1091 : //stage 4
1092 0 : tmp[0] = _mm256_add_epi32(tmp2[0], tmp2[3]);
1093 0 : tmp[1] = _mm256_add_epi32(tmp2[1], tmp2[2]);
1094 0 : tmp[2] = _mm256_sub_epi32(tmp2[1], tmp2[2]);
1095 0 : tmp[3] = _mm256_sub_epi32(tmp2[0], tmp2[3]);
1096 0 : tmp[5] = half_btf_avx2(&cospim32, &tmp2[5], &cospi32, &tmp2[6],
1097 : &rounding, bit);
1098 0 : tmp[6] = half_btf_avx2(&cospi32, &tmp2[5], &cospi32, &tmp2[6],
1099 : &rounding, bit);
1100 :
1101 : //stage 5
1102 0 : out[0] = _mm256_add_epi32(tmp[0], tmp2[7]);
1103 0 : out[1] = _mm256_add_epi32(tmp[1], tmp[6]);
1104 0 : out[2] = _mm256_add_epi32(tmp[2], tmp[5]);
1105 0 : out[3] = _mm256_add_epi32(tmp[3], tmp2[4]);
1106 0 : out[4] = _mm256_sub_epi32(tmp[3], tmp2[4]);
1107 0 : out[5] = _mm256_sub_epi32(tmp[2], tmp[5]);
1108 0 : out[6] = _mm256_sub_epi32(tmp[1], tmp[6]);
1109 0 : out[7] = _mm256_sub_epi32(tmp[0], tmp2[7]);
1110 0 : }
1111 :
1112 0 : static INLINE void iadst8_col_avx2(__m256i *in, __m256i *out, int8_t cos_bit) {
1113 0 : const int32_t *cospi = cospi_arr(cos_bit);
1114 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1115 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1116 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1117 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1118 0 : const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
1119 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1120 0 : const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
1121 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1122 0 : const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
1123 0 : const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
1124 0 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1125 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1126 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1127 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1128 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1129 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1130 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1131 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
1132 0 : const __m256i negative = _mm256_set1_epi32(-1);
1133 0 : const __m256i rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
1134 : __m256i tmp[8], tmp2[4];
1135 :
1136 : //stage 1
1137 : //stage 2
1138 0 : tmp[0] =
1139 0 : half_btf_avx2(&cospi4, &in[7], &cospi60, &in[0], &rounding, cos_bit);
1140 0 : tmp[1] =
1141 0 : half_btf_avx2(&cospi60, &in[7], &cospim4, &in[0], &rounding, cos_bit);
1142 0 : tmp[2] =
1143 0 : half_btf_avx2(&cospi20, &in[5], &cospi44, &in[2], &rounding, cos_bit);
1144 0 : tmp[3] =
1145 0 : half_btf_avx2(&cospi44, &in[5], &cospim20, &in[2], &rounding, cos_bit);
1146 0 : tmp[4] =
1147 0 : half_btf_avx2(&cospi36, &in[3], &cospi28, &in[4], &rounding, cos_bit);
1148 0 : tmp[5] =
1149 0 : half_btf_avx2(&cospi28, &in[3], &cospim36, &in[4], &rounding, cos_bit);
1150 0 : tmp[6] =
1151 0 : half_btf_avx2(&cospi52, &in[1], &cospi12, &in[6], &rounding, cos_bit);
1152 0 : tmp[7] =
1153 0 : half_btf_avx2(&cospi12, &in[1], &cospim52, &in[6], &rounding, cos_bit);
1154 :
1155 : //stage 3
1156 0 : out[7] = _mm256_add_epi32(tmp[0], tmp[4]);
1157 0 : out[1] = _mm256_add_epi32(tmp[1], tmp[5]);
1158 0 : out[2] = _mm256_add_epi32(tmp[2], tmp[6]);
1159 0 : out[3] = _mm256_add_epi32(tmp[3], tmp[7]);
1160 0 : tmp2[0] = _mm256_sub_epi32(tmp[0], tmp[4]);
1161 0 : tmp2[1] = _mm256_sub_epi32(tmp[1], tmp[5]);
1162 0 : tmp2[2] = _mm256_sub_epi32(tmp[2], tmp[6]);
1163 0 : tmp2[3] = _mm256_sub_epi32(tmp[3], tmp[7]);
1164 :
1165 : //stage 4
1166 0 : tmp[4] = half_btf_avx2(
1167 : &cospi16, &tmp2[0], &cospi48, &tmp2[1], &rounding, cos_bit);
1168 0 : tmp[5] = half_btf_avx2(
1169 : &cospi48, &tmp2[0], &cospim16, &tmp2[1], &rounding, cos_bit);
1170 0 : tmp[6] = half_btf_avx2(
1171 : &cospim48, &tmp2[2], &cospi16, &tmp2[3], &rounding, cos_bit);
1172 0 : tmp[7] = half_btf_avx2(
1173 : &cospi16, &tmp2[2], &cospi48, &tmp2[3], &rounding, cos_bit);
1174 :
1175 : //stage 5
1176 0 : out[0] = _mm256_add_epi32(out[7], out[2]);
1177 0 : tmp[1] = _mm256_add_epi32(out[1], out[3]);
1178 0 : tmp2[0] = _mm256_sub_epi32(out[7], out[2]);
1179 0 : tmp2[1] = _mm256_sub_epi32(out[1], out[3]);
1180 0 : out[1] = _mm256_add_epi32(tmp[4], tmp[6]);
1181 0 : out[6] = _mm256_add_epi32(tmp[5], tmp[7]);
1182 0 : tmp2[2] = _mm256_sub_epi32(tmp[4], tmp[6]);
1183 0 : tmp2[3] = _mm256_sub_epi32(tmp[5], tmp[7]);
1184 :
1185 : //stage 6
1186 0 : tmp[2] = half_btf_avx2(
1187 : &cospi32, &tmp2[0], &cospi32, &tmp2[1], &rounding, cos_bit);
1188 0 : out[4] = half_btf_avx2(
1189 : &cospi32, &tmp2[0], &cospim32, &tmp2[1], &rounding, cos_bit);
1190 0 : out[2] = half_btf_avx2(
1191 : &cospi32, &tmp2[2], &cospi32, &tmp2[3], &rounding, cos_bit);
1192 0 : tmp[7] = half_btf_avx2(
1193 : &cospi32, &tmp2[2], &cospim32, &tmp2[3], &rounding, cos_bit);
1194 :
1195 : //stage 7
1196 0 : out[1] = _mm256_sign_epi32(out[1], negative);
1197 0 : out[3] = _mm256_sign_epi32(tmp[2], negative);
1198 0 : out[5] = _mm256_sign_epi32(tmp[7], negative);
1199 0 : out[7] = _mm256_sign_epi32(tmp[1], negative);
1200 0 : }
1201 :
1202 0 : void eb_av1_inv_txfm2d_add_8x8_avx2(const int32_t *input,
1203 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
1204 : TxType tx_type, int32_t bd) {
1205 : __m256i in[8], out[8];
1206 0 : const int8_t *shift = eb_inv_txfm_shift_ls[TX_8X8];
1207 0 : const int32_t txw_idx = get_txw_idx(TX_8X8);
1208 0 : const int32_t txh_idx = get_txh_idx(TX_8X8);
1209 :
1210 0 : switch (tx_type) {
1211 0 : case IDTX:
1212 0 : load_buffer_8x8(input, in);
1213 :         // Operations can be joined together without losing precision:
1214 :         // eb_av1_iidentity8_c() shifts left by 1 bit
1215 :         // round_shift_8x8(, -shift[0]) shifts right by 1 bit
1216 :         // eb_av1_iidentity8_c() shifts left by 1 bit
1217 :         // round_shift_8x8(, -shift[1]) shifts right by 4 bits, with the identity shifts folded in
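        // Each identity pass scales by 2, so their combined 2-bit left shift
        // is absorbed into the single right shift below (hence the "- 2").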
1218 0 : round_shift_8x8(in, -shift[0] - shift[1] - 2);
1219 0 : write_buffer_8x8(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
1220 0 : break;
1221 0 : case V_DCT:
1222 0 : load_buffer_8x8(input, in);
1223 :         // eb_av1_iidentity8_c() shifts left by 1 bit
1224 :         // round_shift_8x8(, -shift[0]) shifts right by 1 bit; the two cancel, so the row stage is skipped
1225 0 : idct8_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx]);
1226 0 : round_shift_8x8(out, -shift[1]);
1227 0 : write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 0, 0, bd);
1228 0 : break;
1229 0 : case H_DCT:
1230 0 : load_buffer_8x8(input, in);
1231 0 : transpose_8x8_avx2(in, out);
1232 0 : idct8_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1233 0 : transpose_8x8_avx2(in, out);
1234 :         // eb_av1_iidentity8_c() shifts left by 1 bit
1235 :         // round_shift_8x8(, -shift[1]) shifts right by 4 bits; the identity's left shift is folded into round_shift_8x8_double() below
1236 0 : round_shift_8x8_double(out, -shift[0], -shift[1] - 1);
1237 0 : write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 0, 0, bd);
1238 0 : break;
1239 0 : case V_ADST:
1240 0 : load_buffer_8x8(input, in);
1241 :         // eb_av1_iidentity8_c() shifts left by 1 bit
1242 :         // round_shift_8x8(, -shift[0]) shifts right by 1 bit; the two cancel, so the row stage is skipped
1243 0 : iadst8_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx]);
1244 0 : round_shift_8x8(out, -shift[1]);
1245 0 : write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 0, 0, bd);
1246 0 : break;
1247 0 : case H_ADST:
1248 0 : load_buffer_8x8(input, in);
1249 0 : transpose_8x8_avx2(in, out);
1250 0 : iadst8_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1251 0 : transpose_8x8_avx2(in, out);
1252 :         // eb_av1_iidentity8_c() shifts left by 1 bit
1253 :         // round_shift_8x8(, -shift[1]) shifts right by 4 bits; the identity's left shift is folded into round_shift_8x8_double() below
1254 0 : round_shift_8x8_double(out, -shift[0], -shift[1] - 1);
1255 0 : write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 0, 0, bd);
1256 0 : break;
1257 0 : case V_FLIPADST:
1258 0 : load_buffer_8x8(input, in);
1259 :         // eb_av1_iidentity8_c() shifts left by 1 bit
1260 :         // round_shift_8x8(, -shift[0]) shifts right by 1 bit; the two cancel, so the row stage is skipped
1261 0 : iadst8_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx]);
1262 0 : round_shift_8x8(out, -shift[1]);
1263 0 : write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 0, 1, bd);
1264 0 : break;
1265 0 : case H_FLIPADST:
1266 0 : load_buffer_8x8(input, in);
1267 0 : transpose_8x8_avx2(in, out);
1268 0 : iadst8_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1269 0 : transpose_8x8_avx2(in, out);
1270 : // eb_av1_iidentity8_c() shifts left by 1 bit; round_shift_8x8(, -shift[1])
1271 : // shifts right by 4 bits with rounding, hence the -shift[1] - 1 below
1272 0 : round_shift_8x8_double(out, -shift[0], -shift[1] - 1);
1273 0 : write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 1, 0, bd);
1274 0 : break;
1275 0 : default:
1276 0 : eb_av1_inv_txfm2d_add_8x8_sse4_1(input,
1277 : output_r, stride_r, output_w, stride_w, tx_type, bd);
1278 0 : break;
1279 : }
1280 0 : }
1281 :
1282 0 : static void load_buffer_16x16(const int32_t *coeff, __m256i *in) {
1283 : int32_t i;
1284 0 : for (i = 0; i < 32; ++i) {
1285 0 : in[i] = _mm256_loadu_si256((const __m256i *)coeff);
1286 0 : coeff += 8;
1287 : }
1288 0 : }
1289 :
1290 0 : static INLINE void write_buffer_16x16(__m256i *in,
1291 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
1292 : int32_t fliplr, int32_t flipud, int32_t bd) {
1293 : __m256i u0, x0, x1, v0, v1;
1294 0 : const __m256i zero = _mm256_setzero_si256();
1295 0 : int32_t i = 0;
1296 :
1297 0 : if (flipud) {
1298 0 : output_r += stride_r * 15;
1299 0 : stride_r = -stride_r;
1300 0 : output_w += stride_w * 15;
1301 0 : stride_w = -stride_w;
1302 : }
1303 :
1304 0 : while (i < 32) {
1305 0 : u0 = _mm256_loadu_si256((const __m256i *)output_r);
1306 :
1307 0 : x0 = _mm256_unpacklo_epi16(u0, zero);
1308 0 : x1 = _mm256_unpackhi_epi16(u0, zero);
1309 :
1310 0 : if (fliplr) {
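     : // Mirror the 16-wide row: select the opposing 128-bit halves of
     : // in[i] / in[i + 1] and reverse the four dwords within each half
     : // (shuffle 0x1B) so the layout still matches x0/x1 above.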
1311 0 : v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x13);
1312 0 : v0 = _mm256_shuffle_epi32(v0, 0x1B);
1313 0 : v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x02);
1314 0 : v1 = _mm256_shuffle_epi32(v1, 0x1B);
1315 : }
1316 : else {
1317 0 : v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20);
1318 0 : v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31);
1319 : }
1320 :
1321 0 : v0 = _mm256_add_epi32(v0, x0);
1322 0 : v1 = _mm256_add_epi32(v1, x1);
1323 0 : highbd_clamp_epi32(&v0, bd);
1324 0 : highbd_clamp_epi32(&v1, bd);
1325 :
1326 0 : v0 = _mm256_packus_epi32(v0, v1);
1327 :
1328 0 : _mm256_storeu_si256((__m256i *)output_w, v0);
1329 :
1330 0 : output_r += stride_r;
1331 0 : output_w += stride_w;
1332 0 : i += 2;
1333 : }
1334 0 : }
1335 :
1336 0 : static INLINE void round_shift_16x16(__m256i *in, int32_t shift) {
1337 0 : __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
1338 0 : int32_t i = 0;
1339 :
1340 0 : while (i < 32) {
1341 0 : in[i] = _mm256_add_epi32(in[i], rnding);
1342 0 : in[i] = _mm256_srai_epi32(in[i], shift);
1343 0 : i++;
1344 : }
1345 0 : }
1346 :
1347 0 : static INLINE void iidentity16_and_round_shift_avx2(__m256i *input, int32_t shift)
1348 : {
1349 : // The input fits in 18 bits, so it can be multiplied by NewSqrt2 in 32-bit space.
1350 : // Multiply by NewSqrt2 (half of 2*NewSqrt2) and round_shift() by one bit
1351 : // less (NewSqrt2Bits - 1); that round_shift(NewSqrt2Bits - 1) and the
1352 : // following round_shift(shift) are then done in one pass.
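     : // In effect each coefficient becomes
     : //   (x * NewSqrt2 + rnding) >> (NewSqrt2Bits - 1 + shift),
     : // where rnding carries the rounding offsets of both stages
     : // (the shift-dependent term is dropped when shift == 0).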
1353 0 : const __m256i scalar = _mm256_set1_epi32(NewSqrt2);
1354 0 : const __m256i rnding = _mm256_set1_epi32((1 << (NewSqrt2Bits - 2)) +
1355 0 : (!!(shift) << (shift + NewSqrt2Bits - 2)));
1356 :
1357 0 : for (int32_t i = 0; i < 32; i++) {
1358 0 : input[i] = _mm256_mullo_epi32(input[i], scalar);
1359 0 : input[i] = _mm256_add_epi32(input[i], rnding);
1360 0 : input[i] = _mm256_srai_epi32(input[i], NewSqrt2Bits - 1 + shift);
1361 : }
1362 0 : }
1363 :
1364 0 : static INLINE void idct16_col_avx2(__m256i *in, __m256i *out, int32_t bit,
1365 : const int8_t *shift) {
1366 : (void) shift;
1367 0 : const int32_t *cospi = cospi_arr(bit);
1368 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1369 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1370 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1371 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1372 0 : const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
1373 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1374 0 : const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
1375 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1376 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1377 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1378 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1379 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1380 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1381 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1382 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1383 0 : const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
1384 0 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1385 0 : const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
1386 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1387 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
1388 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1389 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
1390 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1391 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1392 0 : const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
1393 : __m256i tmp[16], tmp2[16];
1394 0 : int32_t col = 0;
1395 :
1396 0 : for (col = 0; col < 2; ++col) {
1397 : //stage 1
1398 :
1399 : //stage 2
1400 0 : tmp[8] = half_btf_avx2(&cospi60, &in[1 * 2 + col],
1401 0 : &cospim4, &in[15 * 2 + col], &rounding, bit);
1402 0 : tmp[9] = half_btf_avx2(&cospi28, &in[9 * 2 + col],
1403 0 : &cospim36, &in[7 * 2 + col], &rounding, bit);
1404 0 : tmp[10] = half_btf_avx2(&cospi44, &in[5 * 2 + col],
1405 0 : &cospim20, &in[11 * 2 + col], &rounding, bit);
1406 0 : tmp[11] = half_btf_avx2(&cospi12, &in[13 * 2 + col],
1407 0 : &cospim52, &in[3 * 2 + col], &rounding, bit);
1408 0 : tmp[12] = half_btf_avx2(&cospi52, &in[13 * 2 + col],
1409 0 : &cospi12, &in[3 * 2 + col], &rounding, bit);
1410 0 : tmp[13] = half_btf_avx2(&cospi20, &in[5 * 2 + col],
1411 0 : &cospi44, &in[11 * 2 + col], &rounding, bit);
1412 0 : tmp[14] = half_btf_avx2(&cospi36, &in[9 * 2 + col],
1413 0 : &cospi28, &in[7 * 2 + col], &rounding, bit);
1414 0 : tmp[15] = half_btf_avx2(&cospi4, &in[1 * 2 + col],
1415 0 : &cospi60, &in[15 * 2 + col], &rounding, bit);
1416 :
1417 : //stage 3
1418 0 : tmp2[0] = half_btf_avx2(&cospi56, &in[2 * 2 + col],
1419 0 : &cospim8, &in[14 * 2 + col], &rounding, bit);
1420 0 : tmp2[1] = half_btf_avx2(&cospi24, &in[10 * 2 + col],
1421 0 : &cospim40, &in[6 * 2 + col], &rounding, bit);
1422 0 : tmp2[2] = half_btf_avx2(&cospi40, &in[10 * 2 + col],
1423 0 : &cospi24, &in[6 * 2 + col], &rounding, bit);
1424 0 : tmp2[3] = half_btf_avx2(&cospi8, &in[2 * 2 + col],
1425 0 : &cospi56, &in[14 * 2 + col], &rounding, bit);
1426 0 : tmp2[4] = _mm256_add_epi32(tmp[8], tmp[9]);
1427 0 : tmp2[5] = _mm256_sub_epi32(tmp[8], tmp[9]);
1428 0 : tmp2[6] = _mm256_sub_epi32(tmp[11], tmp[10]);
1429 0 : tmp2[7] = _mm256_add_epi32(tmp[10], tmp[11]);
1430 0 : tmp2[8] = _mm256_add_epi32(tmp[12], tmp[13]);
1431 0 : tmp2[9] = _mm256_sub_epi32(tmp[12], tmp[13]);
1432 0 : tmp2[10] = _mm256_sub_epi32(tmp[15], tmp[14]);
1433 0 : tmp2[11] = _mm256_add_epi32(tmp[14], tmp[15]);
1434 :
1435 : //stage 4
1436 0 : tmp[0] = half_btf_avx2(&cospi32, &in[0 * 2 + col],
1437 0 : &cospi32, &in[8 * 2 + col], &rounding, bit);
1438 0 : tmp[1] = half_btf_avx2(&cospi32, &in[0 * 2 + col],
1439 0 : &cospim32, &in[8 * 2 + col], &rounding, bit);
1440 0 : tmp[2] = half_btf_avx2(&cospi48, &in[4 * 2 + col],
1441 0 : &cospim16, &in[12 * 2 + col], &rounding, bit);
1442 0 : tmp[3] = half_btf_avx2(&cospi16, &in[4 * 2 + col],
1443 0 : &cospi48, &in[12 * 2 + col], &rounding, bit);
1444 0 : tmp[4] = _mm256_add_epi32(tmp2[0], tmp2[1]);
1445 0 : tmp[5] = _mm256_sub_epi32(tmp2[0], tmp2[1]);
1446 0 : tmp[6] = _mm256_sub_epi32(tmp2[3], tmp2[2]);
1447 0 : tmp[7] = _mm256_add_epi32(tmp2[2], tmp2[3]);
1448 0 : tmp[9] = half_btf_avx2(&cospim16, &tmp2[5],
1449 : &cospi48, &tmp2[10], &rounding, bit);
1450 0 : tmp[10] = half_btf_avx2(&cospim48, &tmp2[6],
1451 : &cospim16, &tmp2[9], &rounding, bit);
1452 0 : tmp[13] = half_btf_avx2(&cospim16, &tmp2[6],
1453 : &cospi48, &tmp2[9], &rounding, bit);
1454 0 : tmp[14] = half_btf_avx2(&cospi48, &tmp2[5],
1455 : &cospi16, &tmp2[10], &rounding, bit);
1456 :
1457 : //stage 5
1458 0 : tmp2[12] = _mm256_sub_epi32(tmp2[11], tmp2[8]);
1459 0 : tmp2[15] = _mm256_add_epi32(tmp2[8], tmp2[11]);
1460 0 : tmp2[8] = _mm256_add_epi32(tmp2[4], tmp2[7]);
1461 0 : tmp2[11] = _mm256_sub_epi32(tmp2[4], tmp2[7]);
1462 0 : tmp2[0] = _mm256_add_epi32(tmp[0], tmp[3]);
1463 0 : tmp2[1] = _mm256_add_epi32(tmp[1], tmp[2]);
1464 0 : tmp2[2] = _mm256_sub_epi32(tmp[1], tmp[2]);
1465 0 : tmp2[3] = _mm256_sub_epi32(tmp[0], tmp[3]);
1466 0 : tmp2[5] = half_btf_avx2(&cospim32, &tmp[5],
1467 : &cospi32, &tmp[6], &rounding, bit);
1468 0 : tmp2[6] = half_btf_avx2(&cospi32, &tmp[5],
1469 : &cospi32, &tmp[6], &rounding, bit);
1470 0 : tmp2[9] = _mm256_add_epi32(tmp[9], tmp[10]);
1471 0 : tmp2[10] = _mm256_sub_epi32(tmp[9], tmp[10]);
1472 0 : tmp2[13] = _mm256_sub_epi32(tmp[14], tmp[13]);
1473 0 : tmp2[14] = _mm256_add_epi32(tmp[13], tmp[14]);
1474 :
1475 : //stage 6
1476 0 : tmp[0] = _mm256_add_epi32(tmp2[0], tmp[7]);
1477 0 : tmp[1] = _mm256_add_epi32(tmp2[1], tmp2[6]);
1478 0 : tmp[2] = _mm256_add_epi32(tmp2[2], tmp2[5]);
1479 0 : tmp[3] = _mm256_add_epi32(tmp2[3], tmp[4]);
1480 0 : tmp[4] = _mm256_sub_epi32(tmp2[3], tmp[4]);
1481 0 : tmp[5] = _mm256_sub_epi32(tmp2[2], tmp2[5]);
1482 0 : tmp[6] = _mm256_sub_epi32(tmp2[1], tmp2[6]);
1483 0 : tmp[7] = _mm256_sub_epi32(tmp2[0], tmp[7]);
1484 0 : tmp[10] = half_btf_avx2(&cospim32, &tmp2[10],
1485 : &cospi32, &tmp2[13], &rounding, bit);
1486 0 : tmp[11] = half_btf_avx2(&cospim32, &tmp2[11],
1487 : &cospi32, &tmp2[12], &rounding, bit);
1488 0 : tmp[12] = half_btf_avx2(&cospi32, &tmp2[11],
1489 : &cospi32, &tmp2[12], &rounding, bit);
1490 0 : tmp[13] = half_btf_avx2(&cospi32, &tmp2[10],
1491 : &cospi32, &tmp2[13], &rounding, bit);
1492 :
1493 : //stage 7
1494 0 : out[0 * 2 + col] = _mm256_add_epi32(tmp[0], tmp2[15]);
1495 0 : out[1 * 2 + col] = _mm256_add_epi32(tmp[1], tmp2[14]);
1496 0 : out[2 * 2 + col] = _mm256_add_epi32(tmp[2], tmp[13]);
1497 0 : out[3 * 2 + col] = _mm256_add_epi32(tmp[3], tmp[12]);
1498 0 : out[4 * 2 + col] = _mm256_add_epi32(tmp[4], tmp[11]);
1499 0 : out[5 * 2 + col] = _mm256_add_epi32(tmp[5], tmp[10]);
1500 0 : out[6 * 2 + col] = _mm256_add_epi32(tmp[6], tmp2[9]);
1501 0 : out[7 * 2 + col] = _mm256_add_epi32(tmp[7], tmp2[8]);
1502 0 : out[8 * 2 + col] = _mm256_sub_epi32(tmp[7], tmp2[8]);
1503 0 : out[9 * 2 + col] = _mm256_sub_epi32(tmp[6], tmp2[9]);
1504 0 : out[10 * 2 + col] = _mm256_sub_epi32(tmp[5], tmp[10]);
1505 0 : out[11 * 2 + col] = _mm256_sub_epi32(tmp[4], tmp[11]);
1506 0 : out[12 * 2 + col] = _mm256_sub_epi32(tmp[3], tmp[12]);
1507 0 : out[13 * 2 + col] = _mm256_sub_epi32(tmp[2], tmp[13]);
1508 0 : out[14 * 2 + col] = _mm256_sub_epi32(tmp[1], tmp2[14]);
1509 0 : out[15 * 2 + col] = _mm256_sub_epi32(tmp[0], tmp2[15]);
1510 : }
1511 0 : }
1512 :
1513 0 : static INLINE void iadst16_col_avx2(__m256i *in, __m256i *out,
1514 : int8_t cos_bit) {
1515 0 : const int32_t *cospi = cospi_arr(cos_bit);
1516 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1517 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1518 0 : const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
1519 0 : const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
1520 0 : const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
1521 0 : const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
1522 0 : const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
1523 0 : const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
1524 0 : const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
1525 0 : const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
1526 0 : const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
1527 0 : const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
1528 0 : const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
1529 0 : const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
1530 0 : const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
1531 0 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
1532 :
1533 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1534 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1535 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1536 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1537 :
1538 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1539 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1540 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1541 :
1542 0 : const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
1543 0 : const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
1544 0 : const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
1545 0 : const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
1546 0 : const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
1547 0 : const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
1548 0 : const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
1549 0 : const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
1550 0 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
1551 0 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
1552 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
1553 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1554 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1555 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1556 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
1557 :
1558 0 : const __m256i negative = _mm256_set1_epi32(-1);
1559 0 : const __m256i rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
1560 :
1561 : __m256i tmp[16], tmp2[16], tmp3[16];
1562 :
1563 0 : int32_t col = 0;
1564 :
1565 0 : for (col = 0; col < 2; ++col) {
1566 : //stage 1
1567 :
1568 : //stage 2
1569 0 : tmp[0] = half_btf_avx2(&cospi2, &in[15 * 2 + col],
1570 0 : &cospi62, &in[0 * 2 + col], &rounding, cos_bit);
1571 0 : tmp[1] = half_btf_avx2(&cospi62, &in[15 * 2 + col],
1572 0 : &cospim2, &in[0 * 2 + col], &rounding, cos_bit);
1573 0 : tmp[2] = half_btf_avx2(&cospi10, &in[13 * 2 + col],
1574 0 : &cospi54, &in[2 * 2 + col], &rounding, cos_bit);
1575 0 : tmp[3] = half_btf_avx2(&cospi54, &in[13 * 2 + col],
1576 0 : &cospim10, &in[2 * 2 + col], &rounding, cos_bit);
1577 0 : tmp[4] = half_btf_avx2(&cospi18, &in[11 * 2 + col],
1578 0 : &cospi46, &in[4 * 2 + col], &rounding, cos_bit);
1579 0 : tmp[5] = half_btf_avx2(&cospi46, &in[11 * 2 + col],
1580 0 : &cospim18, &in[4 * 2 + col], &rounding, cos_bit);
1581 0 : tmp[6] = half_btf_avx2(&cospi26, &in[9 * 2 + col],
1582 0 : &cospi38, &in[6 * 2 + col], &rounding, cos_bit);
1583 0 : tmp[7] = half_btf_avx2(&cospi38, &in[9 * 2 + col],
1584 0 : &cospim26, &in[6 * 2 + col], &rounding, cos_bit);
1585 0 : tmp[8] = half_btf_avx2(&cospi34, &in[7 * 2 + col],
1586 0 : &cospi30, &in[8 * 2 + col], &rounding, cos_bit);
1587 0 : tmp[9] = half_btf_avx2(&cospi30, &in[7 * 2 + col],
1588 0 : &cospim34, &in[8 * 2 + col], &rounding, cos_bit);
1589 0 : tmp[10] = half_btf_avx2(&cospi42, &in[5 * 2 + col],
1590 0 : &cospi22, &in[10 * 2 + col], &rounding, cos_bit);
1591 0 : tmp[11] = half_btf_avx2(&cospi22, &in[5 * 2 + col],
1592 0 : &cospim42, &in[10 * 2 + col], &rounding, cos_bit);
1593 0 : tmp[12] = half_btf_avx2(&cospi50, &in[3 * 2 + col],
1594 0 : &cospi14, &in[12 * 2 + col], &rounding, cos_bit);
1595 0 : tmp[13] = half_btf_avx2(&cospi14, &in[3 * 2 + col],
1596 0 : &cospim50, &in[12 * 2 + col], &rounding, cos_bit);
1597 0 : tmp[14] = half_btf_avx2(&cospi58, &in[1 * 2 + col],
1598 0 : &cospi6, &in[14 * 2 + col], &rounding, cos_bit);
1599 0 : tmp[15] = half_btf_avx2(&cospi6, &in[1 * 2 + col],
1600 0 : &cospim58, &in[14 * 2 + col], &rounding, cos_bit);
1601 :
1602 : //stage 3
1603 0 : tmp3[0] = _mm256_add_epi32(tmp[0], tmp[8]);
1604 0 : tmp3[1] = _mm256_add_epi32(tmp[1], tmp[9]);
1605 0 : tmp3[2] = _mm256_add_epi32(tmp[2], tmp[10]);
1606 0 : tmp3[3] = _mm256_add_epi32(tmp[3], tmp[11]);
1607 0 : tmp3[4] = _mm256_add_epi32(tmp[4], tmp[12]);
1608 0 : tmp3[5] = _mm256_add_epi32(tmp[5], tmp[13]);
1609 0 : tmp3[6] = _mm256_add_epi32(tmp[6], tmp[14]);
1610 0 : tmp3[7] = _mm256_add_epi32(tmp[7], tmp[15]);
1611 0 : tmp2[8] = _mm256_sub_epi32(tmp[0], tmp[8]);
1612 0 : tmp2[9] = _mm256_sub_epi32(tmp[1], tmp[9]);
1613 0 : tmp2[10] = _mm256_sub_epi32(tmp[2], tmp[10]);
1614 0 : tmp2[11] = _mm256_sub_epi32(tmp[3], tmp[11]);
1615 0 : tmp2[12] = _mm256_sub_epi32(tmp[4], tmp[12]);
1616 0 : tmp2[13] = _mm256_sub_epi32(tmp[5], tmp[13]);
1617 0 : tmp2[14] = _mm256_sub_epi32(tmp[6], tmp[14]);
1618 0 : tmp2[15] = _mm256_sub_epi32(tmp[7], tmp[15]);
1619 :
1620 : //stage 4
1621 0 : tmp[8] = half_btf_avx2(
1622 : &cospi8, &tmp2[8], &cospi56, &tmp2[9], &rounding, cos_bit);
1623 0 : tmp[9] = half_btf_avx2(
1624 : &cospi56, &tmp2[8], &cospim8, &tmp2[9], &rounding, cos_bit);
1625 0 : tmp[10] = half_btf_avx2(
1626 : &cospi40, &tmp2[10], &cospi24, &tmp2[11], &rounding, cos_bit);
1627 0 : tmp[11] = half_btf_avx2(
1628 : &cospi24, &tmp2[10], &cospim40, &tmp2[11], &rounding, cos_bit);
1629 0 : tmp[12] = half_btf_avx2(
1630 : &cospim56, &tmp2[12], &cospi8, &tmp2[13], &rounding, cos_bit);
1631 0 : tmp[13] = half_btf_avx2(
1632 : &cospi8, &tmp2[12], &cospi56, &tmp2[13], &rounding, cos_bit);
1633 0 : tmp[14] = half_btf_avx2(
1634 : &cospim24, &tmp2[14], &cospi40, &tmp2[15], &rounding, cos_bit);
1635 0 : tmp[15] = half_btf_avx2(
1636 : &cospi40, &tmp2[14], &cospi24, &tmp2[15], &rounding, cos_bit);
1637 :
1638 : //stage 5
1639 0 : tmp3[8] = _mm256_add_epi32(tmp3[0], tmp3[4]);
1640 0 : tmp3[9] = _mm256_add_epi32(tmp3[1], tmp3[5]);
1641 0 : tmp3[10] = _mm256_add_epi32(tmp3[2], tmp3[6]);
1642 0 : tmp3[11] = _mm256_add_epi32(tmp3[3], tmp3[7]);
1643 0 : tmp2[4] = _mm256_sub_epi32(tmp3[0], tmp3[4]);
1644 0 : tmp2[5] = _mm256_sub_epi32(tmp3[1], tmp3[5]);
1645 0 : tmp2[6] = _mm256_sub_epi32(tmp3[2], tmp3[6]);
1646 0 : tmp2[7] = _mm256_sub_epi32(tmp3[3], tmp3[7]);
1647 0 : tmp3[12] = _mm256_add_epi32(tmp[8], tmp[12]);
1648 0 : tmp3[13] = _mm256_add_epi32(tmp[9], tmp[13]);
1649 0 : tmp3[14] = _mm256_add_epi32(tmp[10], tmp[14]);
1650 0 : tmp3[15] = _mm256_add_epi32(tmp[11], tmp[15]);
1651 0 : tmp2[12] = _mm256_sub_epi32(tmp[8], tmp[12]);
1652 0 : tmp2[13] = _mm256_sub_epi32(tmp[9], tmp[13]);
1653 0 : tmp2[14] = _mm256_sub_epi32(tmp[10], tmp[14]);
1654 0 : tmp2[15] = _mm256_sub_epi32(tmp[11], tmp[15]);
1655 :
1656 : //stage 6
1657 0 : tmp[4] = half_btf_avx2(
1658 : &cospi16, &tmp2[4], &cospi48, &tmp2[5], &rounding, cos_bit);
1659 0 : tmp[5] = half_btf_avx2(
1660 : &cospi48, &tmp2[4], &cospim16, &tmp2[5], &rounding, cos_bit);
1661 0 : tmp[6] = half_btf_avx2(
1662 : &cospim48, &tmp2[6], &cospi16, &tmp2[7], &rounding, cos_bit);
1663 0 : tmp[7] = half_btf_avx2(
1664 : &cospi16, &tmp2[6], &cospi48, &tmp2[7], &rounding, cos_bit);
1665 0 : tmp[12] = half_btf_avx2(
1666 : &cospi16, &tmp2[12], &cospi48, &tmp2[13], &rounding, cos_bit);
1667 0 : tmp[13] = half_btf_avx2(
1668 : &cospi48, &tmp2[12], &cospim16, &tmp2[13], &rounding, cos_bit);
1669 0 : tmp[14] = half_btf_avx2(
1670 : &cospim48, &tmp2[14], &cospi16, &tmp2[15], &rounding, cos_bit);
1671 0 : tmp[15] = half_btf_avx2(
1672 : &cospi16, &tmp2[14], &cospi48, &tmp2[15], &rounding, cos_bit);
1673 :
1674 : //stage 7
1675 0 : out[0 * 2 + col] = _mm256_add_epi32(tmp3[8], tmp3[10]);
1676 0 : out[2 * 2 + col] = _mm256_add_epi32(tmp[12], tmp[14]);
1677 0 : out[12 * 2 + col] = _mm256_add_epi32(tmp[5], tmp[7]);
1678 0 : out[14 * 2 + col] = _mm256_add_epi32(tmp3[13], tmp3[15]);
1679 0 : tmp2[1] = _mm256_add_epi32(tmp3[9], tmp3[11]);
1680 0 : tmp2[2] = _mm256_sub_epi32(tmp3[8], tmp3[10]);
1681 0 : tmp2[3] = _mm256_sub_epi32(tmp3[9], tmp3[11]);
1682 0 : tmp2[4] = _mm256_add_epi32(tmp[4], tmp[6]);
1683 0 : tmp2[6] = _mm256_sub_epi32(tmp[4], tmp[6]);
1684 0 : tmp2[7] = _mm256_sub_epi32(tmp[5], tmp[7]);
1685 0 : tmp2[8] = _mm256_add_epi32(tmp3[12], tmp3[14]);
1686 0 : tmp2[10] = _mm256_sub_epi32(tmp3[12], tmp3[14]);
1687 0 : tmp2[11] = _mm256_sub_epi32(tmp3[13], tmp3[15]);
1688 0 : tmp2[13] = _mm256_add_epi32(tmp[13], tmp[15]);
1689 0 : tmp2[14] = _mm256_sub_epi32(tmp[12], tmp[14]);
1690 0 : tmp2[15] = _mm256_sub_epi32(tmp[13], tmp[15]);
1691 :
1692 : //stage 8
1693 0 : out[4 * 2 + col] = half_btf_avx2(
1694 : &cospi32, &tmp2[6], &cospi32, &tmp2[7], &rounding, cos_bit);
1695 0 : out[6 * 2 + col] = half_btf_avx2(
1696 : &cospi32, &tmp2[10], &cospi32, &tmp2[11], &rounding, cos_bit);
1697 0 : out[8 * 2 + col] = half_btf_avx2(
1698 : &cospi32, &tmp2[2], &cospim32, &tmp2[3], &rounding, cos_bit);
1699 0 : out[10 * 2 + col] = half_btf_avx2(
1700 : &cospi32, &tmp2[14], &cospim32, &tmp2[15], &rounding, cos_bit);
1701 0 : tmp[2] = half_btf_avx2(
1702 : &cospi32, &tmp2[2], &cospi32, &tmp2[3], &rounding, cos_bit);
1703 0 : tmp[7] = half_btf_avx2(
1704 : &cospi32, &tmp2[6], &cospim32, &tmp2[7], &rounding, cos_bit);
1705 0 : tmp[11] = half_btf_avx2(
1706 : &cospi32, &tmp2[10], &cospim32, &tmp2[11], &rounding, cos_bit);
1707 0 : tmp[14] = half_btf_avx2(
1708 : &cospi32, &tmp2[14], &cospi32, &tmp2[15], &rounding, cos_bit);
1709 : //range_check_buf(stage, input, bf1, size, stage_range[stage]);
1710 :
1711 : //stage 9
1712 0 : out[1 * 2 + col] = _mm256_sign_epi32(tmp2[8], negative);
1713 0 : out[3 * 2 + col] = _mm256_sign_epi32(tmp2[4], negative);
1714 0 : out[5 * 2 + col] = _mm256_sign_epi32(tmp[14], negative);
1715 0 : out[7 * 2 + col] = _mm256_sign_epi32(tmp[2], negative);
1716 0 : out[9 * 2 + col] = _mm256_sign_epi32(tmp[11], negative);
1717 0 : out[11 * 2 + col] = _mm256_sign_epi32(tmp[7], negative);
1718 0 : out[13 * 2 + col] = _mm256_sign_epi32(tmp2[13], negative);
1719 0 : out[15 * 2 + col] = _mm256_sign_epi32(tmp2[1], negative);
1720 : }
1721 0 : }
1722 :
1723 0 : void eb_av1_inv_txfm2d_add_16x16_avx2(const int32_t *input,
1724 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w,
1725 : int32_t stride_w, TxType tx_type, int32_t bd) {
1726 : __m256i in[32], out[32];
1727 0 : const int8_t *shift = eb_inv_txfm_shift_ls[TX_16X16];
1728 0 : const int32_t txw_idx = get_txw_idx(TX_16X16);
1729 0 : const int32_t txh_idx = get_txh_idx(TX_16X16);
1730 :
1731 0 : switch (tx_type) {
1732 0 : case IDTX:
1733 0 : load_buffer_16x16(input, in);
1734 0 : iidentity16_and_round_shift_avx2(in, -shift[0]);
1735 0 : iidentity16_and_round_shift_avx2(in, -shift[1]);
1736 0 : write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
1737 : 0, 0, bd);
1738 0 : break;
1739 0 : case V_DCT:
1740 0 : load_buffer_16x16(input, in);
1741 0 : iidentity16_and_round_shift_avx2(in, -shift[0]);
1742 0 : idct16_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx], shift);
1743 0 : round_shift_16x16(out, -shift[1]);
1744 0 : write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
1745 : 0, 0, bd);
1746 0 : break;
1747 0 : case H_DCT:
1748 0 : load_buffer_16x16(input, in);
1749 0 : transpose_16x16_avx2(in, out);
1750 0 : idct16_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], shift);
1751 0 : transpose_16x16_avx2(in, out);
1752 0 : round_shift_16x16(out, -shift[0]);
1753 0 : iidentity16_and_round_shift_avx2(out, -shift[1]);
1754 0 : write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
1755 : 0, 0, bd);
1756 0 : break;
1757 0 : case V_ADST:
1758 0 : load_buffer_16x16(input, in);
1759 0 : iidentity16_and_round_shift_avx2(in, -shift[0]);
1760 0 : iadst16_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx]);
1761 0 : round_shift_16x16(out, -shift[1]);
1762 0 : write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
1763 : 0, 0, bd);
1764 0 : break;
1765 0 : case H_ADST:
1766 0 : load_buffer_16x16(input, in);
1767 0 : transpose_16x16_avx2(in, out);
1768 0 : iadst16_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1769 0 : transpose_16x16_avx2(in, out);
1770 0 : round_shift_16x16(out, -shift[0]);
1771 0 : iidentity16_and_round_shift_avx2(out, -shift[1]);
1772 0 : write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
1773 : 0, 0, bd);
1774 0 : break;
1775 0 : case V_FLIPADST:
1776 0 : load_buffer_16x16(input, in);
1777 0 : iidentity16_and_round_shift_avx2(in, -shift[0]);
1778 0 : iadst16_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx]);
1779 0 : round_shift_16x16(out, -shift[1]);
1780 0 : write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
1781 : 0, 1, bd);
1782 0 : break;
1783 0 : case H_FLIPADST:
1784 0 : load_buffer_16x16(input, in);
1785 0 : transpose_16x16_avx2(in, out);
1786 0 : iadst16_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
1787 0 : transpose_16x16_avx2(in, out);
1788 0 : round_shift_16x16(out, -shift[0]);
1789 0 : iidentity16_and_round_shift_avx2(out, -shift[1]);
1790 0 : write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
1791 : 1, 0, bd);
1792 0 : break;
1793 0 : default:
1794 0 : eb_av1_inv_txfm2d_add_16x16_sse4_1(input,
1795 : output_r, stride_r, output_w, stride_w, tx_type, bd);
1796 0 : break;
1797 : }
1798 0 : }
1799 :
1800 : // Note:
1801 : // A total of 32x4 = 128 registers hold the 32x32 block of coefficients.
1802 : // For high bit depth, each coefficient is 4 bytes, and each __m256i
1803 : // register holds 8 coefficients, so each "row" needs 4 registers;
1804 : // there are 32 rows in total.
1805 : // Register layout:
1806 : // v0, v1, v2, v3,
1807 : // v4, v5, v6, v7,
1808 : // ... ...
1809 : // v124, v125, v126, v127
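     : // With this layout, coefficient (row r, col c) of the block sits in
     : // register in[4 * r + (c >> 3)], 32-bit lane (c & 7), as loaded by
     : // load_buffer_32x32() below.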
1810 :
1811 0 : static void transpose_32x32_8x8(const __m256i *in, __m256i *out) {
1812 : __m256i u0, u1, u2, u3, u4, u5, u6, u7;
1813 : __m256i x0, x1;
1814 :
1815 0 : u0 = _mm256_unpacklo_epi32(in[0], in[4]);
1816 0 : u1 = _mm256_unpackhi_epi32(in[0], in[4]);
1817 :
1818 0 : u2 = _mm256_unpacklo_epi32(in[8], in[12]);
1819 0 : u3 = _mm256_unpackhi_epi32(in[8], in[12]);
1820 :
1821 0 : u4 = _mm256_unpacklo_epi32(in[16], in[20]);
1822 0 : u5 = _mm256_unpackhi_epi32(in[16], in[20]);
1823 :
1824 0 : u6 = _mm256_unpacklo_epi32(in[24], in[28]);
1825 0 : u7 = _mm256_unpackhi_epi32(in[24], in[28]);
1826 :
1827 0 : x0 = _mm256_unpacklo_epi64(u0, u2);
1828 0 : x1 = _mm256_unpacklo_epi64(u4, u6);
1829 0 : out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
1830 0 : out[16] = _mm256_permute2f128_si256(x0, x1, 0x31);
1831 :
1832 0 : x0 = _mm256_unpackhi_epi64(u0, u2);
1833 0 : x1 = _mm256_unpackhi_epi64(u4, u6);
1834 0 : out[4] = _mm256_permute2f128_si256(x0, x1, 0x20);
1835 0 : out[20] = _mm256_permute2f128_si256(x0, x1, 0x31);
1836 :
1837 0 : x0 = _mm256_unpacklo_epi64(u1, u3);
1838 0 : x1 = _mm256_unpacklo_epi64(u5, u7);
1839 0 : out[8] = _mm256_permute2f128_si256(x0, x1, 0x20);
1840 0 : out[24] = _mm256_permute2f128_si256(x0, x1, 0x31);
1841 :
1842 0 : x0 = _mm256_unpackhi_epi64(u1, u3);
1843 0 : x1 = _mm256_unpackhi_epi64(u5, u7);
1844 0 : out[12] = _mm256_permute2f128_si256(x0, x1, 0x20);
1845 0 : out[28] = _mm256_permute2f128_si256(x0, x1, 0x31);
1846 0 : }
1847 :
1848 0 : static void transpose_32x32_16x16(const __m256i *in, __m256i *out) {
1849 0 : transpose_32x32_8x8(&in[0], &out[0]);
1850 0 : transpose_32x32_8x8(&in[1], &out[32]);
1851 0 : transpose_32x32_8x8(&in[32], &out[1]);
1852 0 : transpose_32x32_8x8(&in[33], &out[33]);
1853 0 : }
1854 :
1855 0 : static void transpose_32x32(const __m256i *in, __m256i *out) {
1856 0 : transpose_32x32_16x16(&in[0], &out[0]);
1857 0 : transpose_32x32_16x16(&in[2], &out[64]);
1858 0 : transpose_32x32_16x16(&in[64], &out[2]);
1859 0 : transpose_32x32_16x16(&in[66], &out[66]);
1860 0 : }
1861 :
1862 0 : static void load_buffer_32x32_new(const int32_t *coeff, __m256i *in,
1863 : int32_t input_stride, int32_t size) {
1864 : int32_t i;
1865 0 : for (i = 0; i < size; ++i)
1866 0 : in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stride));
1867 0 : }
1868 :
1869 0 : static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
1870 : int32_t i;
1871 0 : for (i = 0; i < 128; ++i) {
1872 0 : in[i] = _mm256_loadu_si256((const __m256i *)coeff);
1873 0 : coeff += 8;
1874 : }
1875 0 : }
1876 :
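     : // Single-term variant of half_btf_avx2(): returns (w0 * n0 + rounding) >> bit.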
1877 0 : static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
1878 : const __m256i *rounding, int32_t bit) {
1879 : __m256i x;
1880 0 : x = _mm256_mullo_epi32(*w0, *n0);
1881 0 : x = _mm256_add_epi32(x, *rounding);
1882 0 : x = _mm256_srai_epi32(x, bit);
1883 0 : return x;
1884 : }
1885 :
1886 0 : static INLINE void round_shift_32x32(__m256i *in, int32_t shift) {
1887 0 : __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
1888 0 : int32_t i = 0;
1889 :
1890 0 : while (i < 128) {
1891 0 : in[i] = _mm256_add_epi32(in[i], rnding);
1892 0 : in[i] = _mm256_srai_epi32(in[i], shift);
1893 0 : i++;
1894 : }
1895 0 : }
1896 :
1897 0 : static void write_buffer_32x32(__m256i *in,
1898 : uint16_t *output_r, int32_t stride_r,
1899 : uint16_t *output_w, int32_t stride_w,
1900 : int32_t fliplr, int32_t flipud, int32_t bd) {
1901 : __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3;
1902 0 : const __m256i zero = _mm256_setzero_si256();
1903 0 : int32_t i = 0;
1904 : (void)fliplr;
1905 : (void)flipud;
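     : // The flip arguments are unused: the 32x32 path below only handles
     : // DCT_DCT and IDTX, neither of which flips the output.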
1906 :
1907 0 : while (i < 128) {
1908 0 : u0 = _mm256_loadu_si256((const __m256i *)output_r);
1909 0 : u1 = _mm256_loadu_si256((const __m256i *)(output_r + 16));
1910 :
1911 0 : x0 = _mm256_unpacklo_epi16(u0, zero);
1912 0 : x1 = _mm256_unpackhi_epi16(u0, zero);
1913 0 : x2 = _mm256_unpacklo_epi16(u1, zero);
1914 0 : x3 = _mm256_unpackhi_epi16(u1, zero);
1915 :
1916 0 : v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20);
1917 0 : v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31);
1918 0 : v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20);
1919 0 : v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31);
1920 :
1921 0 : v0 = _mm256_add_epi32(v0, x0);
1922 0 : v1 = _mm256_add_epi32(v1, x1);
1923 0 : v2 = _mm256_add_epi32(v2, x2);
1924 0 : v3 = _mm256_add_epi32(v3, x3);
1925 :
1926 0 : highbd_clamp_epi32(&v0, bd);
1927 0 : highbd_clamp_epi32(&v1, bd);
1928 0 : highbd_clamp_epi32(&v2, bd);
1929 0 : highbd_clamp_epi32(&v3, bd);
1930 :
1931 0 : v0 = _mm256_packus_epi32(v0, v1);
1932 0 : v2 = _mm256_packus_epi32(v2, v3);
1933 :
1934 0 : _mm256_storeu_si256((__m256i *)output_w, v0);
1935 0 : _mm256_storeu_si256((__m256i *)(output_w + 16), v2);
1936 0 : output_r += stride_r;
1937 0 : output_w += stride_w;
1938 0 : i += 4;
1939 : }
1940 0 : }
1941 :
1942 0 : static void idct32_avx2(__m256i *in, __m256i *out, int32_t bit) {
1943 0 : const int32_t *cospi = cospi_arr(bit);
1944 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1945 0 : const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
1946 0 : const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
1947 0 : const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
1948 0 : const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
1949 0 : const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
1950 0 : const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
1951 0 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
1952 0 : const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
1953 0 : const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
1954 0 : const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
1955 0 : const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
1956 0 : const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
1957 0 : const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
1958 0 : const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
1959 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1960 0 : const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
1961 0 : const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
1962 0 : const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
1963 0 : const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
1964 0 : const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
1965 0 : const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
1966 0 : const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
1967 0 : const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
1968 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1969 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1970 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1971 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1972 0 : const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
1973 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1974 0 : const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
1975 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1976 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1977 0 : const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
1978 0 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1979 0 : const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
1980 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1981 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1982 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1983 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1984 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1985 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
1986 0 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
1987 0 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
1988 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1989 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
1990 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1991 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1992 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1993 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1994 0 : const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
1995 : __m256i bf1[32], bf0[32];
1996 : int32_t col;
1997 :
1998 0 : for (col = 0; col < 4; ++col) {
1999 : // stage 0
2000 : // stage 1
2001 0 : bf1[0] = in[0 * 4 + col];
2002 0 : bf1[1] = in[16 * 4 + col];
2003 0 : bf1[2] = in[8 * 4 + col];
2004 0 : bf1[3] = in[24 * 4 + col];
2005 0 : bf1[4] = in[4 * 4 + col];
2006 0 : bf1[5] = in[20 * 4 + col];
2007 0 : bf1[6] = in[12 * 4 + col];
2008 0 : bf1[7] = in[28 * 4 + col];
2009 0 : bf1[8] = in[2 * 4 + col];
2010 0 : bf1[9] = in[18 * 4 + col];
2011 0 : bf1[10] = in[10 * 4 + col];
2012 0 : bf1[11] = in[26 * 4 + col];
2013 0 : bf1[12] = in[6 * 4 + col];
2014 0 : bf1[13] = in[22 * 4 + col];
2015 0 : bf1[14] = in[14 * 4 + col];
2016 0 : bf1[15] = in[30 * 4 + col];
2017 0 : bf1[16] = in[1 * 4 + col];
2018 0 : bf1[17] = in[17 * 4 + col];
2019 0 : bf1[18] = in[9 * 4 + col];
2020 0 : bf1[19] = in[25 * 4 + col];
2021 0 : bf1[20] = in[5 * 4 + col];
2022 0 : bf1[21] = in[21 * 4 + col];
2023 0 : bf1[22] = in[13 * 4 + col];
2024 0 : bf1[23] = in[29 * 4 + col];
2025 0 : bf1[24] = in[3 * 4 + col];
2026 0 : bf1[25] = in[19 * 4 + col];
2027 0 : bf1[26] = in[11 * 4 + col];
2028 0 : bf1[27] = in[27 * 4 + col];
2029 0 : bf1[28] = in[7 * 4 + col];
2030 0 : bf1[29] = in[23 * 4 + col];
2031 0 : bf1[30] = in[15 * 4 + col];
2032 0 : bf1[31] = in[31 * 4 + col];
2033 :
2034 : // stage 2
2035 0 : bf0[0] = bf1[0];
2036 0 : bf0[1] = bf1[1];
2037 0 : bf0[2] = bf1[2];
2038 0 : bf0[3] = bf1[3];
2039 0 : bf0[4] = bf1[4];
2040 0 : bf0[5] = bf1[5];
2041 0 : bf0[6] = bf1[6];
2042 0 : bf0[7] = bf1[7];
2043 0 : bf0[8] = bf1[8];
2044 0 : bf0[9] = bf1[9];
2045 0 : bf0[10] = bf1[10];
2046 0 : bf0[11] = bf1[11];
2047 0 : bf0[12] = bf1[12];
2048 0 : bf0[13] = bf1[13];
2049 0 : bf0[14] = bf1[14];
2050 0 : bf0[15] = bf1[15];
2051 0 : bf0[16] =
2052 0 : half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
2053 0 : bf0[17] =
2054 0 : half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
2055 0 : bf0[18] =
2056 0 : half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
2057 0 : bf0[19] =
2058 0 : half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
2059 0 : bf0[20] =
2060 0 : half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
2061 0 : bf0[21] =
2062 0 : half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
2063 0 : bf0[22] =
2064 0 : half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
2065 0 : bf0[23] =
2066 0 : half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
2067 0 : bf0[24] =
2068 0 : half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
2069 0 : bf0[25] =
2070 0 : half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
2071 0 : bf0[26] =
2072 0 : half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
2073 0 : bf0[27] =
2074 0 : half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
2075 0 : bf0[28] =
2076 0 : half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
2077 0 : bf0[29] =
2078 0 : half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
2079 0 : bf0[30] =
2080 0 : half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
2081 0 : bf0[31] =
2082 0 : half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
2083 :
2084 : // stage 3
2085 0 : bf1[0] = bf0[0];
2086 0 : bf1[1] = bf0[1];
2087 0 : bf1[2] = bf0[2];
2088 0 : bf1[3] = bf0[3];
2089 0 : bf1[4] = bf0[4];
2090 0 : bf1[5] = bf0[5];
2091 0 : bf1[6] = bf0[6];
2092 0 : bf1[7] = bf0[7];
2093 0 : bf1[8] =
2094 0 : half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
2095 0 : bf1[9] =
2096 0 : half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
2097 0 : bf1[10] =
2098 0 : half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
2099 0 : bf1[11] =
2100 0 : half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
2101 0 : bf1[12] =
2102 0 : half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
2103 0 : bf1[13] =
2104 0 : half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
2105 0 : bf1[14] =
2106 0 : half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
2107 0 : bf1[15] =
2108 0 : half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
2109 0 : bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]);
2110 0 : bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]);
2111 0 : bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]);
2112 0 : bf1[19] = _mm256_add_epi32(bf0[18], bf0[19]);
2113 0 : bf1[20] = _mm256_add_epi32(bf0[20], bf0[21]);
2114 0 : bf1[21] = _mm256_sub_epi32(bf0[20], bf0[21]);
2115 0 : bf1[22] = _mm256_sub_epi32(bf0[23], bf0[22]);
2116 0 : bf1[23] = _mm256_add_epi32(bf0[22], bf0[23]);
2117 0 : bf1[24] = _mm256_add_epi32(bf0[24], bf0[25]);
2118 0 : bf1[25] = _mm256_sub_epi32(bf0[24], bf0[25]);
2119 0 : bf1[26] = _mm256_sub_epi32(bf0[27], bf0[26]);
2120 0 : bf1[27] = _mm256_add_epi32(bf0[26], bf0[27]);
2121 0 : bf1[28] = _mm256_add_epi32(bf0[28], bf0[29]);
2122 0 : bf1[29] = _mm256_sub_epi32(bf0[28], bf0[29]);
2123 0 : bf1[30] = _mm256_sub_epi32(bf0[31], bf0[30]);
2124 0 : bf1[31] = _mm256_add_epi32(bf0[30], bf0[31]);
2125 :
2126 : // stage 4
2127 0 : bf0[0] = bf1[0];
2128 0 : bf0[1] = bf1[1];
2129 0 : bf0[2] = bf1[2];
2130 0 : bf0[3] = bf1[3];
2131 0 : bf0[4] =
2132 0 : half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
2133 0 : bf0[5] =
2134 0 : half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
2135 0 : bf0[6] =
2136 0 : half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
2137 0 : bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
2138 0 : bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]);
2139 0 : bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]);
2140 0 : bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]);
2141 0 : bf0[11] = _mm256_add_epi32(bf1[10], bf1[11]);
2142 0 : bf0[12] = _mm256_add_epi32(bf1[12], bf1[13]);
2143 0 : bf0[13] = _mm256_sub_epi32(bf1[12], bf1[13]);
2144 0 : bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]);
2145 0 : bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]);
2146 0 : bf0[16] = bf1[16];
2147 0 : bf0[17] =
2148 0 : half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
2149 0 : bf0[18] =
2150 0 : half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
2151 0 : bf0[19] = bf1[19];
2152 0 : bf0[20] = bf1[20];
2153 0 : bf0[21] =
2154 0 : half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
2155 0 : bf0[22] =
2156 0 : half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
2157 0 : bf0[23] = bf1[23];
2158 0 : bf0[24] = bf1[24];
2159 0 : bf0[25] =
2160 0 : half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
2161 0 : bf0[26] =
2162 0 : half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
2163 0 : bf0[27] = bf1[27];
2164 0 : bf0[28] = bf1[28];
2165 0 : bf0[29] =
2166 0 : half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
2167 0 : bf0[30] =
2168 0 : half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
2169 0 : bf0[31] = bf1[31];
2170 :
2171 : // stage 5
2172 0 : bf1[0] =
2173 0 : half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
2174 0 : bf1[1] =
2175 0 : half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
2176 0 : bf1[2] =
2177 0 : half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
2178 0 : bf1[3] =
2179 0 : half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
2180 0 : bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]);
2181 0 : bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]);
2182 0 : bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]);
2183 0 : bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]);
2184 0 : bf1[8] = bf0[8];
2185 0 : bf1[9] =
2186 0 : half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
2187 0 : bf1[10] =
2188 0 : half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
2189 0 : bf1[11] = bf0[11];
2190 0 : bf1[12] = bf0[12];
2191 0 : bf1[13] =
2192 0 : half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
2193 0 : bf1[14] =
2194 0 : half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
2195 0 : bf1[15] = bf0[15];
2196 0 : bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]);
2197 0 : bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]);
2198 0 : bf1[18] = _mm256_sub_epi32(bf0[17], bf0[18]);
2199 0 : bf1[19] = _mm256_sub_epi32(bf0[16], bf0[19]);
2200 0 : bf1[20] = _mm256_sub_epi32(bf0[23], bf0[20]);
2201 0 : bf1[21] = _mm256_sub_epi32(bf0[22], bf0[21]);
2202 0 : bf1[22] = _mm256_add_epi32(bf0[21], bf0[22]);
2203 0 : bf1[23] = _mm256_add_epi32(bf0[20], bf0[23]);
2204 0 : bf1[24] = _mm256_add_epi32(bf0[24], bf0[27]);
2205 0 : bf1[25] = _mm256_add_epi32(bf0[25], bf0[26]);
2206 0 : bf1[26] = _mm256_sub_epi32(bf0[25], bf0[26]);
2207 0 : bf1[27] = _mm256_sub_epi32(bf0[24], bf0[27]);
2208 0 : bf1[28] = _mm256_sub_epi32(bf0[31], bf0[28]);
2209 0 : bf1[29] = _mm256_sub_epi32(bf0[30], bf0[29]);
2210 0 : bf1[30] = _mm256_add_epi32(bf0[29], bf0[30]);
2211 0 : bf1[31] = _mm256_add_epi32(bf0[28], bf0[31]);
2212 :
2213 : // stage 6
2214 0 : bf0[0] = _mm256_add_epi32(bf1[0], bf1[3]);
2215 0 : bf0[1] = _mm256_add_epi32(bf1[1], bf1[2]);
2216 0 : bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]);
2217 0 : bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]);
2218 0 : bf0[4] = bf1[4];
2219 0 : bf0[5] =
2220 0 : half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
2221 0 : bf0[6] =
2222 0 : half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
2223 0 : bf0[7] = bf1[7];
2224 0 : bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]);
2225 0 : bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]);
2226 0 : bf0[10] = _mm256_sub_epi32(bf1[9], bf1[10]);
2227 0 : bf0[11] = _mm256_sub_epi32(bf1[8], bf1[11]);
2228 0 : bf0[12] = _mm256_sub_epi32(bf1[15], bf1[12]);
2229 0 : bf0[13] = _mm256_sub_epi32(bf1[14], bf1[13]);
2230 0 : bf0[14] = _mm256_add_epi32(bf1[13], bf1[14]);
2231 0 : bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]);
2232 0 : bf0[16] = bf1[16];
2233 0 : bf0[17] = bf1[17];
2234 0 : bf0[18] =
2235 0 : half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
2236 0 : bf0[19] =
2237 0 : half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
2238 0 : bf0[20] =
2239 0 : half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
2240 0 : bf0[21] =
2241 0 : half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
2242 0 : bf0[22] = bf1[22];
2243 0 : bf0[23] = bf1[23];
2244 0 : bf0[24] = bf1[24];
2245 0 : bf0[25] = bf1[25];
2246 0 : bf0[26] =
2247 0 : half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
2248 0 : bf0[27] =
2249 0 : half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
2250 0 : bf0[28] =
2251 0 : half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
2252 0 : bf0[29] =
2253 0 : half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
2254 0 : bf0[30] = bf1[30];
2255 0 : bf0[31] = bf1[31];
2256 :
2257 : // stage 7
2258 0 : bf1[0] = _mm256_add_epi32(bf0[0], bf0[7]);
2259 0 : bf1[1] = _mm256_add_epi32(bf0[1], bf0[6]);
2260 0 : bf1[2] = _mm256_add_epi32(bf0[2], bf0[5]);
2261 0 : bf1[3] = _mm256_add_epi32(bf0[3], bf0[4]);
2262 0 : bf1[4] = _mm256_sub_epi32(bf0[3], bf0[4]);
2263 0 : bf1[5] = _mm256_sub_epi32(bf0[2], bf0[5]);
2264 0 : bf1[6] = _mm256_sub_epi32(bf0[1], bf0[6]);
2265 0 : bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]);
2266 0 : bf1[8] = bf0[8];
2267 0 : bf1[9] = bf0[9];
2268 0 : bf1[10] =
2269 0 : half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
2270 0 : bf1[11] =
2271 0 : half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
2272 0 : bf1[12] =
2273 0 : half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
2274 0 : bf1[13] =
2275 0 : half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
2276 0 : bf1[14] = bf0[14];
2277 0 : bf1[15] = bf0[15];
2278 0 : bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]);
2279 0 : bf1[17] = _mm256_add_epi32(bf0[17], bf0[22]);
2280 0 : bf1[18] = _mm256_add_epi32(bf0[18], bf0[21]);
2281 0 : bf1[19] = _mm256_add_epi32(bf0[19], bf0[20]);
2282 0 : bf1[20] = _mm256_sub_epi32(bf0[19], bf0[20]);
2283 0 : bf1[21] = _mm256_sub_epi32(bf0[18], bf0[21]);
2284 0 : bf1[22] = _mm256_sub_epi32(bf0[17], bf0[22]);
2285 0 : bf1[23] = _mm256_sub_epi32(bf0[16], bf0[23]);
2286 0 : bf1[24] = _mm256_sub_epi32(bf0[31], bf0[24]);
2287 0 : bf1[25] = _mm256_sub_epi32(bf0[30], bf0[25]);
2288 0 : bf1[26] = _mm256_sub_epi32(bf0[29], bf0[26]);
2289 0 : bf1[27] = _mm256_sub_epi32(bf0[28], bf0[27]);
2290 0 : bf1[28] = _mm256_add_epi32(bf0[27], bf0[28]);
2291 0 : bf1[29] = _mm256_add_epi32(bf0[26], bf0[29]);
2292 0 : bf1[30] = _mm256_add_epi32(bf0[25], bf0[30]);
2293 0 : bf1[31] = _mm256_add_epi32(bf0[24], bf0[31]);
2294 :
2295 : // stage 8
2296 0 : bf0[0] = _mm256_add_epi32(bf1[0], bf1[15]);
2297 0 : bf0[1] = _mm256_add_epi32(bf1[1], bf1[14]);
2298 0 : bf0[2] = _mm256_add_epi32(bf1[2], bf1[13]);
2299 0 : bf0[3] = _mm256_add_epi32(bf1[3], bf1[12]);
2300 0 : bf0[4] = _mm256_add_epi32(bf1[4], bf1[11]);
2301 0 : bf0[5] = _mm256_add_epi32(bf1[5], bf1[10]);
2302 0 : bf0[6] = _mm256_add_epi32(bf1[6], bf1[9]);
2303 0 : bf0[7] = _mm256_add_epi32(bf1[7], bf1[8]);
2304 0 : bf0[8] = _mm256_sub_epi32(bf1[7], bf1[8]);
2305 0 : bf0[9] = _mm256_sub_epi32(bf1[6], bf1[9]);
2306 0 : bf0[10] = _mm256_sub_epi32(bf1[5], bf1[10]);
2307 0 : bf0[11] = _mm256_sub_epi32(bf1[4], bf1[11]);
2308 0 : bf0[12] = _mm256_sub_epi32(bf1[3], bf1[12]);
2309 0 : bf0[13] = _mm256_sub_epi32(bf1[2], bf1[13]);
2310 0 : bf0[14] = _mm256_sub_epi32(bf1[1], bf1[14]);
2311 0 : bf0[15] = _mm256_sub_epi32(bf1[0], bf1[15]);
2312 0 : bf0[16] = bf1[16];
2313 0 : bf0[17] = bf1[17];
2314 0 : bf0[18] = bf1[18];
2315 0 : bf0[19] = bf1[19];
2316 0 : bf0[20] =
2317 0 : half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
2318 0 : bf0[21] =
2319 0 : half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
2320 0 : bf0[22] =
2321 0 : half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
2322 0 : bf0[23] =
2323 0 : half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
2324 0 : bf0[24] =
2325 0 : half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
2326 0 : bf0[25] =
2327 0 : half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
2328 0 : bf0[26] =
2329 0 : half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
2330 0 : bf0[27] =
2331 0 : half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
2332 0 : bf0[28] = bf1[28];
2333 0 : bf0[29] = bf1[29];
2334 0 : bf0[30] = bf1[30];
2335 0 : bf0[31] = bf1[31];
2336 :
2337 : // stage 9
2338 0 : out[0 * 4 + col] = _mm256_add_epi32(bf0[0], bf0[31]);
2339 0 : out[1 * 4 + col] = _mm256_add_epi32(bf0[1], bf0[30]);
2340 0 : out[2 * 4 + col] = _mm256_add_epi32(bf0[2], bf0[29]);
2341 0 : out[3 * 4 + col] = _mm256_add_epi32(bf0[3], bf0[28]);
2342 0 : out[4 * 4 + col] = _mm256_add_epi32(bf0[4], bf0[27]);
2343 0 : out[5 * 4 + col] = _mm256_add_epi32(bf0[5], bf0[26]);
2344 0 : out[6 * 4 + col] = _mm256_add_epi32(bf0[6], bf0[25]);
2345 0 : out[7 * 4 + col] = _mm256_add_epi32(bf0[7], bf0[24]);
2346 0 : out[8 * 4 + col] = _mm256_add_epi32(bf0[8], bf0[23]);
2347 0 : out[9 * 4 + col] = _mm256_add_epi32(bf0[9], bf0[22]);
2348 0 : out[10 * 4 + col] = _mm256_add_epi32(bf0[10], bf0[21]);
2349 0 : out[11 * 4 + col] = _mm256_add_epi32(bf0[11], bf0[20]);
2350 0 : out[12 * 4 + col] = _mm256_add_epi32(bf0[12], bf0[19]);
2351 0 : out[13 * 4 + col] = _mm256_add_epi32(bf0[13], bf0[18]);
2352 0 : out[14 * 4 + col] = _mm256_add_epi32(bf0[14], bf0[17]);
2353 0 : out[15 * 4 + col] = _mm256_add_epi32(bf0[15], bf0[16]);
2354 0 : out[16 * 4 + col] = _mm256_sub_epi32(bf0[15], bf0[16]);
2355 0 : out[17 * 4 + col] = _mm256_sub_epi32(bf0[14], bf0[17]);
2356 0 : out[18 * 4 + col] = _mm256_sub_epi32(bf0[13], bf0[18]);
2357 0 : out[19 * 4 + col] = _mm256_sub_epi32(bf0[12], bf0[19]);
2358 0 : out[20 * 4 + col] = _mm256_sub_epi32(bf0[11], bf0[20]);
2359 0 : out[21 * 4 + col] = _mm256_sub_epi32(bf0[10], bf0[21]);
2360 0 : out[22 * 4 + col] = _mm256_sub_epi32(bf0[9], bf0[22]);
2361 0 : out[23 * 4 + col] = _mm256_sub_epi32(bf0[8], bf0[23]);
2362 0 : out[24 * 4 + col] = _mm256_sub_epi32(bf0[7], bf0[24]);
2363 0 : out[25 * 4 + col] = _mm256_sub_epi32(bf0[6], bf0[25]);
2364 0 : out[26 * 4 + col] = _mm256_sub_epi32(bf0[5], bf0[26]);
2365 0 : out[27 * 4 + col] = _mm256_sub_epi32(bf0[4], bf0[27]);
2366 0 : out[28 * 4 + col] = _mm256_sub_epi32(bf0[3], bf0[28]);
2367 0 : out[29 * 4 + col] = _mm256_sub_epi32(bf0[2], bf0[29]);
2368 0 : out[30 * 4 + col] = _mm256_sub_epi32(bf0[1], bf0[30]);
2369 0 : out[31 * 4 + col] = _mm256_sub_epi32(bf0[0], bf0[31]);
2370 : }
2371 0 : }
2372 :
2373 0 : void eb_av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff,
2374 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
2375 : TxType tx_type, int32_t bd) {
2376 : __m256i in[128], out[128];
2377 0 : const int8_t *shift = eb_inv_txfm_shift_ls[TX_32X32];
2378 0 : const int32_t txw_idx = get_txw_idx(TX_32X32);
2379 0 : const int32_t txh_idx = get_txh_idx(TX_32X32);
2380 :
2381 0 : switch (tx_type) {
2382 0 : case DCT_DCT:
2383 0 : load_buffer_32x32(coeff, in);
2384 0 : transpose_32x32(in, out);
2385 0 : idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
2386 0 : round_shift_32x32(in, -shift[0]);
2387 0 : transpose_32x32(in, out);
2388 0 : idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
2389 0 : round_shift_32x32(in, -shift[1]);
2390 0 : write_buffer_32x32(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
2391 0 : break;
2392 0 : case IDTX:
2393 0 : load_buffer_32x32(coeff, in);
2394 : // Operations can be joined together without losing precision:
2395 : // eb_av1_iidentity32_c() shifts left by 2 bits
2396 : // round_shift_32x32(, -shift[0]) shifts right by 2 bits
2397 : // eb_av1_iidentity32_c() shifts left by 2 bits
2398 : // round_shift_32x32(, -shift[1]) shifts right by 4 bits with rounding
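     : // Net effect: the coefficients are scaled by 16 and shifted right by
     : // (-shift[0]) + (-shift[1]) bits, which folds into the single
     : // round_shift_32x32(in, -shift[0] - shift[1] - 4) call below.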
2399 0 : round_shift_32x32(in, -shift[0] - shift[1] - 4);
2400 0 : write_buffer_32x32(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
2401 0 : break;
2402 0 : default: assert(0);
2403 : }
2404 0 : }
2405 :
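     : // DC-only fast path: with only in[0] non-zero, the 8x8 inverse DCT
     : // reduces to a single cospi32 scaling that is broadcast to all eight
     : // output rows.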
2406 0 : static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
2407 : int32_t bd, int32_t out_shift) {
2408 0 : const int32_t *cospi = cospi_arr(bit);
2409 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2410 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2411 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2412 : __m256i x;
2413 :
2414 : // stage 0
2415 : // stage 1
2416 : // stage 2
2417 : // stage 3
2418 0 : x = _mm256_mullo_epi32(in[0], cospi32);
2419 0 : x = _mm256_add_epi32(x, rnding);
2420 0 : x = _mm256_srai_epi32(x, bit);
2421 :
2422 : // stage 4
2423 : // stage 5
2424 0 : if (!do_cols) {
2425 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
2426 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
2427 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2428 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
2429 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2430 :
2431 0 : __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2432 0 : x = _mm256_add_epi32(x, offset);
2433 0 : x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2434 0 : x = _mm256_max_epi32(x, clamp_lo_out);
2435 0 : x = _mm256_min_epi32(x, clamp_hi_out);
2436 : }
2437 :
2438 0 : out[0] = x;
2439 0 : out[1] = x;
2440 0 : out[2] = x;
2441 0 : out[3] = x;
2442 0 : out[4] = x;
2443 0 : out[5] = x;
2444 0 : out[6] = x;
2445 0 : out[7] = x;
2446 0 : }
2447 :
2448 0 : static void idct8x8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
2449 : int32_t bd, int32_t out_shift) {
2450 0 : const int32_t *cospi = cospi_arr(bit);
2451 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2452 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
2453 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2454 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
2455 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2456 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2457 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2458 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2459 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
2460 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2461 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2462 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2463 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2464 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2465 : __m256i u0, u1, u2, u3, u4, u5, u6, u7;
2466 : __m256i v0, v1, v2, v3, v4, v5, v6, v7;
2467 : __m256i x, y;
2468 :
2469 : // stage 0
2470 : // stage 1
2471 : // stage 2
2472 0 : u0 = in[0];
2473 0 : u1 = in[4];
2474 0 : u2 = in[2];
2475 0 : u3 = in[6];
2476 :
2477 0 : x = _mm256_mullo_epi32(in[1], cospi56);
2478 0 : y = _mm256_mullo_epi32(in[7], cospim8);
2479 0 : u4 = _mm256_add_epi32(x, y);
2480 0 : u4 = _mm256_add_epi32(u4, rnding);
2481 0 : u4 = _mm256_srai_epi32(u4, bit);
2482 :
2483 0 : x = _mm256_mullo_epi32(in[1], cospi8);
2484 0 : y = _mm256_mullo_epi32(in[7], cospi56);
2485 0 : u7 = _mm256_add_epi32(x, y);
2486 0 : u7 = _mm256_add_epi32(u7, rnding);
2487 0 : u7 = _mm256_srai_epi32(u7, bit);
2488 :
2489 0 : x = _mm256_mullo_epi32(in[5], cospi24);
2490 0 : y = _mm256_mullo_epi32(in[3], cospim40);
2491 0 : u5 = _mm256_add_epi32(x, y);
2492 0 : u5 = _mm256_add_epi32(u5, rnding);
2493 0 : u5 = _mm256_srai_epi32(u5, bit);
2494 :
2495 0 : x = _mm256_mullo_epi32(in[5], cospi40);
2496 0 : y = _mm256_mullo_epi32(in[3], cospi24);
2497 0 : u6 = _mm256_add_epi32(x, y);
2498 0 : u6 = _mm256_add_epi32(u6, rnding);
2499 0 : u6 = _mm256_srai_epi32(u6, bit);
2500 :
2501 : // stage 3
2502 0 : x = _mm256_mullo_epi32(u0, cospi32);
2503 0 : y = _mm256_mullo_epi32(u1, cospi32);
2504 0 : v0 = _mm256_add_epi32(x, y);
2505 0 : v0 = _mm256_add_epi32(v0, rnding);
2506 0 : v0 = _mm256_srai_epi32(v0, bit);
2507 :
2508 0 : v1 = _mm256_sub_epi32(x, y);
2509 0 : v1 = _mm256_add_epi32(v1, rnding);
2510 0 : v1 = _mm256_srai_epi32(v1, bit);
2511 :
2512 0 : x = _mm256_mullo_epi32(u2, cospi48);
2513 0 : y = _mm256_mullo_epi32(u3, cospim16);
2514 0 : v2 = _mm256_add_epi32(x, y);
2515 0 : v2 = _mm256_add_epi32(v2, rnding);
2516 0 : v2 = _mm256_srai_epi32(v2, bit);
2517 :
2518 0 : x = _mm256_mullo_epi32(u2, cospi16);
2519 0 : y = _mm256_mullo_epi32(u3, cospi48);
2520 0 : v3 = _mm256_add_epi32(x, y);
2521 0 : v3 = _mm256_add_epi32(v3, rnding);
2522 0 : v3 = _mm256_srai_epi32(v3, bit);
2523 :
2524 0 : addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
2525 0 : addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
2526 :
2527 : // stage 4
2528 0 : addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
2529 0 : addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
2530 0 : u4 = v4;
2531 0 : u7 = v7;
2532 :
2533 0 : x = _mm256_mullo_epi32(v5, cospi32);
2534 0 : y = _mm256_mullo_epi32(v6, cospi32);
2535 0 : u6 = _mm256_add_epi32(y, x);
2536 0 : u6 = _mm256_add_epi32(u6, rnding);
2537 0 : u6 = _mm256_srai_epi32(u6, bit);
2538 :
2539 0 : u5 = _mm256_sub_epi32(y, x);
2540 0 : u5 = _mm256_add_epi32(u5, rnding);
2541 0 : u5 = _mm256_srai_epi32(u5, bit);
2542 :
2543 : // stage 5
2544 0 : if (do_cols) {
2545 0 : addsub_no_clamp_avx2(u0, u7, out + 0, out + 7);
2546 0 : addsub_no_clamp_avx2(u1, u6, out + 1, out + 6);
2547 0 : addsub_no_clamp_avx2(u2, u5, out + 2, out + 5);
2548 0 : addsub_no_clamp_avx2(u3, u4, out + 3, out + 4);
2549 : }
2550 : else {
2551 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
2552 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
2553 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2554 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
2555 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2556 0 : addsub_shift_avx2(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
2557 : out_shift);
2558 0 : addsub_shift_avx2(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
2559 : out_shift);
2560 0 : addsub_shift_avx2(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
2561 : out_shift);
2562 0 : addsub_shift_avx2(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
2563 : out_shift);
2564 : }
2565 0 : }
2566 :
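// iadst8x8_low1_avx2: reduced 8-point inverse ADST that assumes only in[0] is
// non-zero, so most butterfly stages collapse to copies and single multiplies.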
2567 0 : static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
2568 : int32_t bd, int32_t out_shift) {
2569 0 : const int32_t *cospi = cospi_arr(bit);
2570 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2571 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2572 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2573 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2574 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2575 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2576 0 : const __m256i kZero = _mm256_setzero_si256();
2577 : __m256i u[8], x;
2578 :
2579 : // stage 0
2580 : // stage 1
2581 : // stage 2
2582 :
2583 0 : x = _mm256_mullo_epi32(in[0], cospi60);
2584 0 : u[0] = _mm256_add_epi32(x, rnding);
2585 0 : u[0] = _mm256_srai_epi32(u[0], bit);
2586 :
2587 0 : x = _mm256_mullo_epi32(in[0], cospi4);
2588 0 : u[1] = _mm256_sub_epi32(kZero, x);
2589 0 : u[1] = _mm256_add_epi32(u[1], rnding);
2590 0 : u[1] = _mm256_srai_epi32(u[1], bit);
2591 :
2592 : // stage 3
2593 : // stage 4
2594 : __m256i temp1, temp2;
2595 0 : temp1 = _mm256_mullo_epi32(u[0], cospi16);
2596 0 : x = _mm256_mullo_epi32(u[1], cospi48);
2597 0 : temp1 = _mm256_add_epi32(temp1, x);
2598 0 : temp1 = _mm256_add_epi32(temp1, rnding);
2599 0 : temp1 = _mm256_srai_epi32(temp1, bit);
2600 0 : u[4] = temp1;
2601 :
2602 0 : temp2 = _mm256_mullo_epi32(u[0], cospi48);
2603 0 : x = _mm256_mullo_epi32(u[1], cospi16);
2604 0 : u[5] = _mm256_sub_epi32(temp2, x);
2605 0 : u[5] = _mm256_add_epi32(u[5], rnding);
2606 0 : u[5] = _mm256_srai_epi32(u[5], bit);
2607 :
2608 : // stage 5
2609 : // stage 6
2610 0 : temp1 = _mm256_mullo_epi32(u[0], cospi32);
2611 0 : x = _mm256_mullo_epi32(u[1], cospi32);
2612 0 : u[2] = _mm256_add_epi32(temp1, x);
2613 0 : u[2] = _mm256_add_epi32(u[2], rnding);
2614 0 : u[2] = _mm256_srai_epi32(u[2], bit);
2615 :
2616 0 : u[3] = _mm256_sub_epi32(temp1, x);
2617 0 : u[3] = _mm256_add_epi32(u[3], rnding);
2618 0 : u[3] = _mm256_srai_epi32(u[3], bit);
2619 :
2620 0 : temp1 = _mm256_mullo_epi32(u[4], cospi32);
2621 0 : x = _mm256_mullo_epi32(u[5], cospi32);
2622 0 : u[6] = _mm256_add_epi32(temp1, x);
2623 0 : u[6] = _mm256_add_epi32(u[6], rnding);
2624 0 : u[6] = _mm256_srai_epi32(u[6], bit);
2625 :
2626 0 : u[7] = _mm256_sub_epi32(temp1, x);
2627 0 : u[7] = _mm256_add_epi32(u[7], rnding);
2628 0 : u[7] = _mm256_srai_epi32(u[7], bit);
2629 :
2630 : // stage 7
2631 0 : if (do_cols) {
2632 0 : out[0] = u[0];
2633 0 : out[1] = _mm256_sub_epi32(kZero, u[4]);
2634 0 : out[2] = u[6];
2635 0 : out[3] = _mm256_sub_epi32(kZero, u[2]);
2636 0 : out[4] = u[3];
2637 0 : out[5] = _mm256_sub_epi32(kZero, u[7]);
2638 0 : out[6] = u[5];
2639 0 : out[7] = _mm256_sub_epi32(kZero, u[1]);
2640 : }
2641 : else {
2642 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
2643 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2644 0 : const __m256i clamp_hi_out =
2645 0 : _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2646 :
2647 0 : neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2648 : out_shift);
2649 0 : neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2650 : out_shift);
2651 0 : neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2652 : out_shift);
2653 0 : neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2654 : out_shift);
2655 : }
2656 0 : }
2657 :
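// iadst8x8_avx2: full 8-point inverse ADST; stages 2-6 are rounded butterflies,
// and stage 7 negates the odd-indexed outputs (or routes each output pair through
// neg_shift_avx2 with out_shift when do_cols is 0).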
2658 0 : static void iadst8x8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
2659 : int32_t bd, int32_t out_shift) {
2660 0 : const int32_t *cospi = cospi_arr(bit);
2661 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2662 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2663 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
2664 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
2665 0 : const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
2666 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
2667 0 : const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
2668 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
2669 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2670 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2671 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2672 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2673 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2674 0 : const __m256i kZero = _mm256_setzero_si256();
2675 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2676 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2677 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2678 : __m256i u[8], v[8], x;
2679 :
2680 : // stage 0
2681 : // stage 1
2682 : // stage 2
2683 :
2684 0 : u[0] = _mm256_mullo_epi32(in[7], cospi4);
2685 0 : x = _mm256_mullo_epi32(in[0], cospi60);
2686 0 : u[0] = _mm256_add_epi32(u[0], x);
2687 0 : u[0] = _mm256_add_epi32(u[0], rnding);
2688 0 : u[0] = _mm256_srai_epi32(u[0], bit);
2689 :
2690 0 : u[1] = _mm256_mullo_epi32(in[7], cospi60);
2691 0 : x = _mm256_mullo_epi32(in[0], cospi4);
2692 0 : u[1] = _mm256_sub_epi32(u[1], x);
2693 0 : u[1] = _mm256_add_epi32(u[1], rnding);
2694 0 : u[1] = _mm256_srai_epi32(u[1], bit);
2695 :
2696 0 : u[2] = _mm256_mullo_epi32(in[5], cospi20);
2697 0 : x = _mm256_mullo_epi32(in[2], cospi44);
2698 0 : u[2] = _mm256_add_epi32(u[2], x);
2699 0 : u[2] = _mm256_add_epi32(u[2], rnding);
2700 0 : u[2] = _mm256_srai_epi32(u[2], bit);
2701 :
2702 0 : u[3] = _mm256_mullo_epi32(in[5], cospi44);
2703 0 : x = _mm256_mullo_epi32(in[2], cospi20);
2704 0 : u[3] = _mm256_sub_epi32(u[3], x);
2705 0 : u[3] = _mm256_add_epi32(u[3], rnding);
2706 0 : u[3] = _mm256_srai_epi32(u[3], bit);
2707 :
2708 0 : u[4] = _mm256_mullo_epi32(in[3], cospi36);
2709 0 : x = _mm256_mullo_epi32(in[4], cospi28);
2710 0 : u[4] = _mm256_add_epi32(u[4], x);
2711 0 : u[4] = _mm256_add_epi32(u[4], rnding);
2712 0 : u[4] = _mm256_srai_epi32(u[4], bit);
2713 :
2714 0 : u[5] = _mm256_mullo_epi32(in[3], cospi28);
2715 0 : x = _mm256_mullo_epi32(in[4], cospi36);
2716 0 : u[5] = _mm256_sub_epi32(u[5], x);
2717 0 : u[5] = _mm256_add_epi32(u[5], rnding);
2718 0 : u[5] = _mm256_srai_epi32(u[5], bit);
2719 :
2720 0 : u[6] = _mm256_mullo_epi32(in[1], cospi52);
2721 0 : x = _mm256_mullo_epi32(in[6], cospi12);
2722 0 : u[6] = _mm256_add_epi32(u[6], x);
2723 0 : u[6] = _mm256_add_epi32(u[6], rnding);
2724 0 : u[6] = _mm256_srai_epi32(u[6], bit);
2725 :
2726 0 : u[7] = _mm256_mullo_epi32(in[1], cospi12);
2727 0 : x = _mm256_mullo_epi32(in[6], cospi52);
2728 0 : u[7] = _mm256_sub_epi32(u[7], x);
2729 0 : u[7] = _mm256_add_epi32(u[7], rnding);
2730 0 : u[7] = _mm256_srai_epi32(u[7], bit);
2731 :
2732 : // stage 3
2733 0 : addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
2734 0 : addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
2735 0 : addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
2736 0 : addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
2737 :
2738 : // stage 4
2739 0 : u[0] = v[0];
2740 0 : u[1] = v[1];
2741 0 : u[2] = v[2];
2742 0 : u[3] = v[3];
2743 :
2744 0 : u[4] = _mm256_mullo_epi32(v[4], cospi16);
2745 0 : x = _mm256_mullo_epi32(v[5], cospi48);
2746 0 : u[4] = _mm256_add_epi32(u[4], x);
2747 0 : u[4] = _mm256_add_epi32(u[4], rnding);
2748 0 : u[4] = _mm256_srai_epi32(u[4], bit);
2749 :
2750 0 : u[5] = _mm256_mullo_epi32(v[4], cospi48);
2751 0 : x = _mm256_mullo_epi32(v[5], cospi16);
2752 0 : u[5] = _mm256_sub_epi32(u[5], x);
2753 0 : u[5] = _mm256_add_epi32(u[5], rnding);
2754 0 : u[5] = _mm256_srai_epi32(u[5], bit);
2755 :
2756 0 : u[6] = _mm256_mullo_epi32(v[6], cospim48);
2757 0 : x = _mm256_mullo_epi32(v[7], cospi16);
2758 0 : u[6] = _mm256_add_epi32(u[6], x);
2759 0 : u[6] = _mm256_add_epi32(u[6], rnding);
2760 0 : u[6] = _mm256_srai_epi32(u[6], bit);
2761 :
2762 0 : u[7] = _mm256_mullo_epi32(v[6], cospi16);
2763 0 : x = _mm256_mullo_epi32(v[7], cospim48);
2764 0 : u[7] = _mm256_sub_epi32(u[7], x);
2765 0 : u[7] = _mm256_add_epi32(u[7], rnding);
2766 0 : u[7] = _mm256_srai_epi32(u[7], bit);
2767 :
2768 : // stage 5
2769 0 : addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
2770 0 : addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
2771 0 : addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
2772 0 : addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
2773 :
2774 : // stage 6
2775 0 : u[0] = v[0];
2776 0 : u[1] = v[1];
2777 0 : u[4] = v[4];
2778 0 : u[5] = v[5];
2779 :
2780 0 : v[0] = _mm256_mullo_epi32(v[2], cospi32);
2781 0 : x = _mm256_mullo_epi32(v[3], cospi32);
2782 0 : u[2] = _mm256_add_epi32(v[0], x);
2783 0 : u[2] = _mm256_add_epi32(u[2], rnding);
2784 0 : u[2] = _mm256_srai_epi32(u[2], bit);
2785 :
2786 0 : u[3] = _mm256_sub_epi32(v[0], x);
2787 0 : u[3] = _mm256_add_epi32(u[3], rnding);
2788 0 : u[3] = _mm256_srai_epi32(u[3], bit);
2789 :
2790 0 : v[0] = _mm256_mullo_epi32(v[6], cospi32);
2791 0 : x = _mm256_mullo_epi32(v[7], cospi32);
2792 0 : u[6] = _mm256_add_epi32(v[0], x);
2793 0 : u[6] = _mm256_add_epi32(u[6], rnding);
2794 0 : u[6] = _mm256_srai_epi32(u[6], bit);
2795 :
2796 0 : u[7] = _mm256_sub_epi32(v[0], x);
2797 0 : u[7] = _mm256_add_epi32(u[7], rnding);
2798 0 : u[7] = _mm256_srai_epi32(u[7], bit);
2799 :
2800 : // stage 7
2801 0 : if (do_cols) {
2802 0 : out[0] = u[0];
2803 0 : out[1] = _mm256_sub_epi32(kZero, u[4]);
2804 0 : out[2] = u[6];
2805 0 : out[3] = _mm256_sub_epi32(kZero, u[2]);
2806 0 : out[4] = u[3];
2807 0 : out[5] = _mm256_sub_epi32(kZero, u[7]);
2808 0 : out[6] = u[5];
2809 0 : out[7] = _mm256_sub_epi32(kZero, u[1]);
2810 : }
2811 : else {
2812 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
2813 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2814 0 : const __m256i clamp_hi_out =
2815 0 : _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2816 :
2817 0 : neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2818 : out_shift);
2819 0 : neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2820 : out_shift);
2821 0 : neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2822 : out_shift);
2823 0 : neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2824 : out_shift);
2825 : }
2826 0 : }
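
// highbd_clamp_epi32_avx2: clamp 'size' vectors to [clamp_lo, clamp_hi]. The loop
// is unrolled by four, so callers pass a multiple of four (iidentity8_avx2 below
// passes 8).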
2827 0 : static void highbd_clamp_epi32_avx2(const __m256i *in, __m256i *out,
2828 : const __m256i *clamp_lo,
2829 : const __m256i *clamp_hi, int32_t size) {
2830 : __m256i a0, a1;
2831 0 : for (int32_t i = 0; i < size; i += 4) {
2832 0 : a0 = _mm256_max_epi32(in[i], *clamp_lo);
2833 0 : out[i] = _mm256_min_epi32(a0, *clamp_hi);
2834 :
2835 0 : a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
2836 0 : out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
2837 :
2838 0 : a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
2839 0 : out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
2840 :
2841 0 : a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
2842 0 : out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
2843 : }
2844 0 : }
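
// shift_avx2: round (add half of 1 << shift), arithmetic-shift right by 'shift',
// then clamp; four vectors are processed per iteration, as in
// highbd_clamp_epi32_avx2 above.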
2845 0 : static void shift_avx2(const __m256i *in, __m256i *out,
2846 : const __m256i *clamp_lo, const __m256i *clamp_hi,
2847 : int32_t shift, int32_t size) {
2848 0 : __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
2849 0 : __m128i shift_vec = _mm_cvtsi32_si128(shift);
2850 : __m256i a0, a1;
2851 0 : for (int32_t i = 0; i < size; i += 4) {
2852 0 : a0 = _mm256_add_epi32(in[i], offset);
2853 0 : a1 = _mm256_add_epi32(in[i + 1], offset);
2854 0 : a0 = _mm256_sra_epi32(a0, shift_vec);
2855 0 : a1 = _mm256_sra_epi32(a1, shift_vec);
2856 0 : a0 = _mm256_max_epi32(a0, *clamp_lo);
2857 0 : a1 = _mm256_max_epi32(a1, *clamp_lo);
2858 0 : out[i] = _mm256_min_epi32(a0, *clamp_hi);
2859 0 : out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
2860 :
2861 0 : a0 = _mm256_add_epi32(in[i + 2], offset);
2862 0 : a1 = _mm256_add_epi32(in[i + 3], offset);
2863 0 : a0 = _mm256_sra_epi32(a0, shift_vec);
2864 0 : a1 = _mm256_sra_epi32(a1, shift_vec);
2865 0 : a0 = _mm256_max_epi32(a0, *clamp_lo);
2866 0 : a1 = _mm256_max_epi32(a1, *clamp_lo);
2867 0 : out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
2868 0 : out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
2869 : }
2870 0 : }
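
// iidentity8_avx2: 8-point identity transform with a scale factor of 2
// (in[i] + in[i]); only clamping, plus the rounding shift when do_cols is 0,
// is applied afterwards.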
2871 0 : static void iidentity8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
2872 : int32_t bd, int32_t out_shift) {
2873 : (void)bit;
2874 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2875 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2876 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2877 : __m256i v[8];
2878 0 : v[0] = _mm256_add_epi32(in[0], in[0]);
2879 0 : v[1] = _mm256_add_epi32(in[1], in[1]);
2880 0 : v[2] = _mm256_add_epi32(in[2], in[2]);
2881 0 : v[3] = _mm256_add_epi32(in[3], in[3]);
2882 0 : v[4] = _mm256_add_epi32(in[4], in[4]);
2883 0 : v[5] = _mm256_add_epi32(in[5], in[5]);
2884 0 : v[6] = _mm256_add_epi32(in[6], in[6]);
2885 0 : v[7] = _mm256_add_epi32(in[7], in[7]);
2886 :
2887 0 : if (!do_cols) {
2888 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
2889 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
2890 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2891 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
2892 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2893 :
2894 0 : shift_avx2(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 8);
2895 : }
2896 : else
2897 0 : highbd_clamp_epi32_avx2(v, out, &clamp_lo, &clamp_hi, 8);
2898 0 : }
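
// idct16_low1_avx2: DC-only 16-point inverse DCT; in[0] is scaled by cospi32,
// optionally shifted and clamped, and then broadcast to all 16 outputs.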
2899 0 : static void idct16_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
2900 : int32_t bd, int32_t out_shift) {
2901 0 : const int32_t *cospi = cospi_arr(bit);
2902 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2903 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2904 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2905 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2906 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2907 :
2908 : {
2909 : // stage 0
2910 : // stage 1
2911 : // stage 2
2912 : // stage 3
2913 : // stage 4
2914 0 : in[0] = _mm256_mullo_epi32(in[0], cospi32);
2915 0 : in[0] = _mm256_add_epi32(in[0], rnding);
2916 0 : in[0] = _mm256_srai_epi32(in[0], bit);
2917 :
2918 : // stage 5
2919 : // stage 6
2920 : // stage 7
2921 0 : if (do_cols) {
2922 0 : in[0] = _mm256_max_epi32(in[0], clamp_lo);
2923 0 : in[0] = _mm256_min_epi32(in[0], clamp_hi);
2924 : }
2925 : else {
2926 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
2927 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
2928 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2929 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
2930 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2931 0 : __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2932 0 : in[0] = _mm256_add_epi32(in[0], offset);
2933 0 : in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
2934 0 : in[0] = _mm256_max_epi32(in[0], clamp_lo_out);
2935 0 : in[0] = _mm256_min_epi32(in[0], clamp_hi_out);
2936 : }
2937 :
2938 0 : out[0] = in[0];
2939 0 : out[1] = in[0];
2940 0 : out[2] = in[0];
2941 0 : out[3] = in[0];
2942 0 : out[4] = in[0];
2943 0 : out[5] = in[0];
2944 0 : out[6] = in[0];
2945 0 : out[7] = in[0];
2946 0 : out[8] = in[0];
2947 0 : out[9] = in[0];
2948 0 : out[10] = in[0];
2949 0 : out[11] = in[0];
2950 0 : out[12] = in[0];
2951 0 : out[13] = in[0];
2952 0 : out[14] = in[0];
2953 0 : out[15] = in[0];
2954 : }
2955 0 : }
2956 :
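// idct16_low8_avx2: 16-point inverse DCT specialized for inputs whose non-zero
// coefficients fit in in[0..7]; the stage-2/3 butterflies therefore reduce to
// single-operand half_btf_0_avx2 calls.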
2957 0 : static void idct16_low8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
2958 : int32_t bd, int32_t out_shift) {
2959 0 : const int32_t *cospi = cospi_arr(bit);
2960 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2961 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
2962 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
2963 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
2964 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
2965 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2966 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2967 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2968 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
2969 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2970 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2971 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2972 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2973 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
2974 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2975 0 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
2976 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
2977 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2978 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2979 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2980 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2981 : __m256i u[16], x, y;
2982 :
2983 : {
2984 : // stage 0
2985 : // stage 1
2986 0 : u[0] = in[0];
2987 0 : u[2] = in[4];
2988 0 : u[4] = in[2];
2989 0 : u[6] = in[6];
2990 0 : u[8] = in[1];
2991 0 : u[10] = in[5];
2992 0 : u[12] = in[3];
2993 0 : u[14] = in[7];
2994 :
2995 : // stage 2
2996 0 : u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
2997 0 : u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
2998 :
2999 0 : u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
3000 0 : u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
3001 :
3002 0 : u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
3003 0 : u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
3004 :
3005 0 : u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3006 0 : u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3007 :
3008 : // stage 3
3009 0 : u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
3010 0 : u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
3011 0 : u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
3012 0 : u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
3013 :
3014 0 : addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
3015 0 : addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
3016 0 : addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
3017 0 : addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
3018 :
3019 : // stage 4
3020 0 : x = _mm256_mullo_epi32(u[0], cospi32);
3021 0 : u[0] = _mm256_add_epi32(x, rnding);
3022 0 : u[0] = _mm256_srai_epi32(u[0], bit);
3023 0 : u[1] = u[0];
3024 :
3025 0 : u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
3026 0 : u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
3027 :
3028 0 : addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
3029 0 : addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
3030 :
3031 0 : x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3032 0 : u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3033 0 : u[9] = x;
3034 0 : y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3035 0 : u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3036 0 : u[10] = y;
3037 :
3038 : // stage 5
3039 0 : addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
3040 0 : addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
3041 :
3042 0 : x = _mm256_mullo_epi32(u[5], cospi32);
3043 0 : y = _mm256_mullo_epi32(u[6], cospi32);
3044 0 : u[5] = _mm256_sub_epi32(y, x);
3045 0 : u[5] = _mm256_add_epi32(u[5], rnding);
3046 0 : u[5] = _mm256_srai_epi32(u[5], bit);
3047 :
3048 0 : u[6] = _mm256_add_epi32(y, x);
3049 0 : u[6] = _mm256_add_epi32(u[6], rnding);
3050 0 : u[6] = _mm256_srai_epi32(u[6], bit);
3051 :
3052 0 : addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3053 0 : addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3054 0 : addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3055 0 : addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3056 :
3057 : // stage 6
3058 0 : addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
3059 0 : addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
3060 0 : addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
3061 0 : addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
3062 :
3063 0 : x = _mm256_mullo_epi32(u[10], cospi32);
3064 0 : y = _mm256_mullo_epi32(u[13], cospi32);
3065 0 : u[10] = _mm256_sub_epi32(y, x);
3066 0 : u[10] = _mm256_add_epi32(u[10], rnding);
3067 0 : u[10] = _mm256_srai_epi32(u[10], bit);
3068 :
3069 0 : u[13] = _mm256_add_epi32(x, y);
3070 0 : u[13] = _mm256_add_epi32(u[13], rnding);
3071 0 : u[13] = _mm256_srai_epi32(u[13], bit);
3072 :
3073 0 : x = _mm256_mullo_epi32(u[11], cospi32);
3074 0 : y = _mm256_mullo_epi32(u[12], cospi32);
3075 0 : u[11] = _mm256_sub_epi32(y, x);
3076 0 : u[11] = _mm256_add_epi32(u[11], rnding);
3077 0 : u[11] = _mm256_srai_epi32(u[11], bit);
3078 :
3079 0 : u[12] = _mm256_add_epi32(x, y);
3080 0 : u[12] = _mm256_add_epi32(u[12], rnding);
3081 0 : u[12] = _mm256_srai_epi32(u[12], bit);
3082 : // stage 7
3083 0 : if (do_cols) {
3084 0 : addsub_no_clamp_avx2(u[0], u[15], out + 0, out + 15);
3085 0 : addsub_no_clamp_avx2(u[1], u[14], out + 1, out + 14);
3086 0 : addsub_no_clamp_avx2(u[2], u[13], out + 2, out + 13);
3087 0 : addsub_no_clamp_avx2(u[3], u[12], out + 3, out + 12);
3088 0 : addsub_no_clamp_avx2(u[4], u[11], out + 4, out + 11);
3089 0 : addsub_no_clamp_avx2(u[5], u[10], out + 5, out + 10);
3090 0 : addsub_no_clamp_avx2(u[6], u[9], out + 6, out + 9);
3091 0 : addsub_no_clamp_avx2(u[7], u[8], out + 7, out + 8);
3092 : }
3093 : else {
3094 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
3095 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
3096 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3097 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
3098 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3099 :
3100 0 : addsub_shift_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
3101 : &clamp_hi_out, out_shift);
3102 0 : addsub_shift_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
3103 : &clamp_hi_out, out_shift);
3104 0 : addsub_shift_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
3105 : &clamp_hi_out, out_shift);
3106 0 : addsub_shift_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
3107 : &clamp_hi_out, out_shift);
3108 0 : addsub_shift_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
3109 : &clamp_hi_out, out_shift);
3110 0 : addsub_shift_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
3111 : &clamp_hi_out, out_shift);
3112 0 : addsub_shift_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
3113 : &clamp_hi_out, out_shift);
3114 0 : addsub_shift_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
3115 : &clamp_hi_out, out_shift);
3116 : }
3117 : }
3118 0 : }
3119 :
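// idct16_avx2: full 16-point inverse DCT; butterfly stages with clamping between
// them, and a final stage that pairs output k with output 15 - k.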
3120 0 : static void idct16_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols, int32_t bd,
3121 : int32_t out_shift) {
3122 0 : const int32_t *cospi = cospi_arr(bit);
3123 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3124 0 : const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3125 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3126 0 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3127 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3128 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3129 0 : const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3130 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3131 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3132 0 : const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3133 0 : const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3134 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3135 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3136 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3137 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3138 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3139 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3140 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3141 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3142 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3143 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3144 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3145 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3146 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3147 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3148 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3149 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3150 : __m256i u[16], v[16], x, y;
3151 :
3152 : {
3153 : // stage 0
3154 : // stage 1
3155 0 : u[0] = in[0];
3156 0 : u[1] = in[8];
3157 0 : u[2] = in[4];
3158 0 : u[3] = in[12];
3159 0 : u[4] = in[2];
3160 0 : u[5] = in[10];
3161 0 : u[6] = in[6];
3162 0 : u[7] = in[14];
3163 0 : u[8] = in[1];
3164 0 : u[9] = in[9];
3165 0 : u[10] = in[5];
3166 0 : u[11] = in[13];
3167 0 : u[12] = in[3];
3168 0 : u[13] = in[11];
3169 0 : u[14] = in[7];
3170 0 : u[15] = in[15];
3171 :
3172 : // stage 2
3173 0 : v[0] = u[0];
3174 0 : v[1] = u[1];
3175 0 : v[2] = u[2];
3176 0 : v[3] = u[3];
3177 0 : v[4] = u[4];
3178 0 : v[5] = u[5];
3179 0 : v[6] = u[6];
3180 0 : v[7] = u[7];
3181 :
3182 0 : v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
3183 0 : v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
3184 0 : v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
3185 0 : v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
3186 0 : v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
3187 0 : v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
3188 0 : v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
3189 0 : v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
3190 :
3191 : // stage 3
3192 0 : u[0] = v[0];
3193 0 : u[1] = v[1];
3194 0 : u[2] = v[2];
3195 0 : u[3] = v[3];
3196 0 : u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
3197 0 : u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
3198 0 : u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
3199 0 : u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
3200 0 : addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
3201 0 : addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
3202 0 : addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
3203 0 : addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
3204 :
3205 : // stage 4
3206 0 : x = _mm256_mullo_epi32(u[0], cospi32);
3207 0 : y = _mm256_mullo_epi32(u[1], cospi32);
3208 0 : v[0] = _mm256_add_epi32(x, y);
3209 0 : v[0] = _mm256_add_epi32(v[0], rnding);
3210 0 : v[0] = _mm256_srai_epi32(v[0], bit);
3211 :
3212 0 : v[1] = _mm256_sub_epi32(x, y);
3213 0 : v[1] = _mm256_add_epi32(v[1], rnding);
3214 0 : v[1] = _mm256_srai_epi32(v[1], bit);
3215 :
3216 0 : v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
3217 0 : v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
3218 0 : addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
3219 0 : addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
3220 0 : v[8] = u[8];
3221 0 : v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3222 0 : v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3223 0 : v[11] = u[11];
3224 0 : v[12] = u[12];
3225 0 : v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3226 0 : v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3227 0 : v[15] = u[15];
3228 :
3229 : // stage 5
3230 0 : addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
3231 0 : addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
3232 0 : u[4] = v[4];
3233 :
3234 0 : x = _mm256_mullo_epi32(v[5], cospi32);
3235 0 : y = _mm256_mullo_epi32(v[6], cospi32);
3236 0 : u[5] = _mm256_sub_epi32(y, x);
3237 0 : u[5] = _mm256_add_epi32(u[5], rnding);
3238 0 : u[5] = _mm256_srai_epi32(u[5], bit);
3239 :
3240 0 : u[6] = _mm256_add_epi32(y, x);
3241 0 : u[6] = _mm256_add_epi32(u[6], rnding);
3242 0 : u[6] = _mm256_srai_epi32(u[6], bit);
3243 :
3244 0 : u[7] = v[7];
3245 0 : addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3246 0 : addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3247 0 : addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3248 0 : addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3249 :
3250 : // stage 6
3251 0 : addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
3252 0 : addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
3253 0 : addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
3254 0 : addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
3255 0 : v[8] = u[8];
3256 0 : v[9] = u[9];
3257 :
3258 0 : x = _mm256_mullo_epi32(u[10], cospi32);
3259 0 : y = _mm256_mullo_epi32(u[13], cospi32);
3260 0 : v[10] = _mm256_sub_epi32(y, x);
3261 0 : v[10] = _mm256_add_epi32(v[10], rnding);
3262 0 : v[10] = _mm256_srai_epi32(v[10], bit);
3263 :
3264 0 : v[13] = _mm256_add_epi32(x, y);
3265 0 : v[13] = _mm256_add_epi32(v[13], rnding);
3266 0 : v[13] = _mm256_srai_epi32(v[13], bit);
3267 :
3268 0 : x = _mm256_mullo_epi32(u[11], cospi32);
3269 0 : y = _mm256_mullo_epi32(u[12], cospi32);
3270 0 : v[11] = _mm256_sub_epi32(y, x);
3271 0 : v[11] = _mm256_add_epi32(v[11], rnding);
3272 0 : v[11] = _mm256_srai_epi32(v[11], bit);
3273 :
3274 0 : v[12] = _mm256_add_epi32(x, y);
3275 0 : v[12] = _mm256_add_epi32(v[12], rnding);
3276 0 : v[12] = _mm256_srai_epi32(v[12], bit);
3277 :
3278 0 : v[14] = u[14];
3279 0 : v[15] = u[15];
3280 :
3281 : // stage 7
3282 0 : if (do_cols) {
3283 0 : addsub_no_clamp_avx2(v[0], v[15], out + 0, out + 15);
3284 0 : addsub_no_clamp_avx2(v[1], v[14], out + 1, out + 14);
3285 0 : addsub_no_clamp_avx2(v[2], v[13], out + 2, out + 13);
3286 0 : addsub_no_clamp_avx2(v[3], v[12], out + 3, out + 12);
3287 0 : addsub_no_clamp_avx2(v[4], v[11], out + 4, out + 11);
3288 0 : addsub_no_clamp_avx2(v[5], v[10], out + 5, out + 10);
3289 0 : addsub_no_clamp_avx2(v[6], v[9], out + 6, out + 9);
3290 0 : addsub_no_clamp_avx2(v[7], v[8], out + 7, out + 8);
3291 : }
3292 : else {
3293 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
3294 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
3295 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3296 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
3297 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3298 :
3299 0 : addsub_shift_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
3300 : &clamp_hi_out, out_shift);
3301 0 : addsub_shift_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
3302 : &clamp_hi_out, out_shift);
3303 0 : addsub_shift_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
3304 : &clamp_hi_out, out_shift);
3305 0 : addsub_shift_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
3306 : &clamp_hi_out, out_shift);
3307 0 : addsub_shift_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
3308 : &clamp_hi_out, out_shift);
3309 0 : addsub_shift_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
3310 : &clamp_hi_out, out_shift);
3311 0 : addsub_shift_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
3312 : &clamp_hi_out, out_shift);
3313 0 : addsub_shift_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
3314 : &clamp_hi_out, out_shift);
3315 : }
3316 : }
3317 0 : }
3318 :
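// iadst16_low1_avx2: 16-point inverse ADST that assumes only in[0] is non-zero;
// intermediate values are fanned out by copying instead of full butterflies.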
3319 0 : static void iadst16_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
3320 : int32_t bd, int32_t out_shift) {
3321 0 : const int32_t *cospi = cospi_arr(bit);
3322 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3323 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3324 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3325 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3326 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3327 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3328 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3329 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3330 0 : const __m256i zero = _mm256_setzero_si256();
3331 : __m256i v[16], x, y, temp1, temp2;
3332 :
3333             :     // Calculate all packed columns in one pass
3334 : {
3335 : // stage 0
3336 : // stage 1
3337 : // stage 2
3338 0 : x = _mm256_mullo_epi32(in[0], cospi62);
3339 0 : v[0] = _mm256_add_epi32(x, rnding);
3340 0 : v[0] = _mm256_srai_epi32(v[0], bit);
3341 :
3342 0 : x = _mm256_mullo_epi32(in[0], cospi2);
3343 0 : v[1] = _mm256_sub_epi32(zero, x);
3344 0 : v[1] = _mm256_add_epi32(v[1], rnding);
3345 0 : v[1] = _mm256_srai_epi32(v[1], bit);
3346 :
3347 : // stage 3
3348 0 : v[8] = v[0];
3349 0 : v[9] = v[1];
3350 :
3351 : // stage 4
3352 0 : temp1 = _mm256_mullo_epi32(v[8], cospi8);
3353 0 : x = _mm256_mullo_epi32(v[9], cospi56);
3354 0 : temp1 = _mm256_add_epi32(temp1, x);
3355 0 : temp1 = _mm256_add_epi32(temp1, rnding);
3356 0 : temp1 = _mm256_srai_epi32(temp1, bit);
3357 :
3358 0 : temp2 = _mm256_mullo_epi32(v[8], cospi56);
3359 0 : x = _mm256_mullo_epi32(v[9], cospi8);
3360 0 : temp2 = _mm256_sub_epi32(temp2, x);
3361 0 : temp2 = _mm256_add_epi32(temp2, rnding);
3362 0 : temp2 = _mm256_srai_epi32(temp2, bit);
3363 0 : v[8] = temp1;
3364 0 : v[9] = temp2;
3365 :
3366 : // stage 5
3367 0 : v[4] = v[0];
3368 0 : v[5] = v[1];
3369 0 : v[12] = v[8];
3370 0 : v[13] = v[9];
3371 :
3372 : // stage 6
3373 0 : temp1 = _mm256_mullo_epi32(v[4], cospi16);
3374 0 : x = _mm256_mullo_epi32(v[5], cospi48);
3375 0 : temp1 = _mm256_add_epi32(temp1, x);
3376 0 : temp1 = _mm256_add_epi32(temp1, rnding);
3377 0 : temp1 = _mm256_srai_epi32(temp1, bit);
3378 :
3379 0 : temp2 = _mm256_mullo_epi32(v[4], cospi48);
3380 0 : x = _mm256_mullo_epi32(v[5], cospi16);
3381 0 : temp2 = _mm256_sub_epi32(temp2, x);
3382 0 : temp2 = _mm256_add_epi32(temp2, rnding);
3383 0 : temp2 = _mm256_srai_epi32(temp2, bit);
3384 0 : v[4] = temp1;
3385 0 : v[5] = temp2;
3386 :
3387 0 : temp1 = _mm256_mullo_epi32(v[12], cospi16);
3388 0 : x = _mm256_mullo_epi32(v[13], cospi48);
3389 0 : temp1 = _mm256_add_epi32(temp1, x);
3390 0 : temp1 = _mm256_add_epi32(temp1, rnding);
3391 0 : temp1 = _mm256_srai_epi32(temp1, bit);
3392 :
3393 0 : temp2 = _mm256_mullo_epi32(v[12], cospi48);
3394 0 : x = _mm256_mullo_epi32(v[13], cospi16);
3395 0 : temp2 = _mm256_sub_epi32(temp2, x);
3396 0 : temp2 = _mm256_add_epi32(temp2, rnding);
3397 0 : temp2 = _mm256_srai_epi32(temp2, bit);
3398 0 : v[12] = temp1;
3399 0 : v[13] = temp2;
3400 :
3401 : // stage 7
3402 0 : v[2] = v[0];
3403 0 : v[3] = v[1];
3404 0 : v[6] = v[4];
3405 0 : v[7] = v[5];
3406 0 : v[10] = v[8];
3407 0 : v[11] = v[9];
3408 0 : v[14] = v[12];
3409 0 : v[15] = v[13];
3410 :
3411 : // stage 8
3412 0 : y = _mm256_mullo_epi32(v[2], cospi32);
3413 0 : x = _mm256_mullo_epi32(v[3], cospi32);
3414 0 : v[2] = _mm256_add_epi32(y, x);
3415 0 : v[2] = _mm256_add_epi32(v[2], rnding);
3416 0 : v[2] = _mm256_srai_epi32(v[2], bit);
3417 :
3418 0 : v[3] = _mm256_sub_epi32(y, x);
3419 0 : v[3] = _mm256_add_epi32(v[3], rnding);
3420 0 : v[3] = _mm256_srai_epi32(v[3], bit);
3421 :
3422 0 : y = _mm256_mullo_epi32(v[6], cospi32);
3423 0 : x = _mm256_mullo_epi32(v[7], cospi32);
3424 0 : v[6] = _mm256_add_epi32(y, x);
3425 0 : v[6] = _mm256_add_epi32(v[6], rnding);
3426 0 : v[6] = _mm256_srai_epi32(v[6], bit);
3427 :
3428 0 : v[7] = _mm256_sub_epi32(y, x);
3429 0 : v[7] = _mm256_add_epi32(v[7], rnding);
3430 0 : v[7] = _mm256_srai_epi32(v[7], bit);
3431 :
3432 0 : y = _mm256_mullo_epi32(v[10], cospi32);
3433 0 : x = _mm256_mullo_epi32(v[11], cospi32);
3434 0 : v[10] = _mm256_add_epi32(y, x);
3435 0 : v[10] = _mm256_add_epi32(v[10], rnding);
3436 0 : v[10] = _mm256_srai_epi32(v[10], bit);
3437 :
3438 0 : v[11] = _mm256_sub_epi32(y, x);
3439 0 : v[11] = _mm256_add_epi32(v[11], rnding);
3440 0 : v[11] = _mm256_srai_epi32(v[11], bit);
3441 :
3442 0 : y = _mm256_mullo_epi32(v[14], cospi32);
3443 0 : x = _mm256_mullo_epi32(v[15], cospi32);
3444 0 : v[14] = _mm256_add_epi32(y, x);
3445 0 : v[14] = _mm256_add_epi32(v[14], rnding);
3446 0 : v[14] = _mm256_srai_epi32(v[14], bit);
3447 :
3448 0 : v[15] = _mm256_sub_epi32(y, x);
3449 0 : v[15] = _mm256_add_epi32(v[15], rnding);
3450 0 : v[15] = _mm256_srai_epi32(v[15], bit);
3451 :
3452 : // stage 9
3453 0 : if (do_cols) {
3454 0 : out[0] = v[0];
3455 0 : out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
3456 0 : out[2] = v[12];
3457 0 : out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
3458 0 : out[4] = v[6];
3459 0 : out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
3460 0 : out[6] = v[10];
3461 0 : out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
3462 0 : out[8] = v[3];
3463 0 : out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
3464 0 : out[10] = v[15];
3465 0 : out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
3466 0 : out[12] = v[5];
3467 0 : out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
3468 0 : out[14] = v[9];
3469 0 : out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
3470 : }
3471 : else {
3472 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
3473 0 : const __m256i clamp_lo_out =
3474 0 : _mm256_set1_epi32(-(1 << (log_range_out - 1)));
3475 0 : const __m256i clamp_hi_out =
3476 0 : _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
3477 :
3478 0 : neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
3479 : out_shift);
3480 0 : neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
3481 : &clamp_hi_out, out_shift);
3482 0 : neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
3483 : &clamp_hi_out, out_shift);
3484 0 : neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
3485 : &clamp_hi_out, out_shift);
3486 0 : neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
3487 : &clamp_hi_out, out_shift);
3488 0 : neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
3489 : &clamp_hi_out, out_shift);
3490 0 : neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
3491 : &clamp_hi_out, out_shift);
3492 0 : neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
3493 : &clamp_hi_out, out_shift);
3494 : }
3495 : }
3496 0 : }
3497 :
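// iadst16_low8_avx2: 16-point inverse ADST for inputs confined to in[0..7];
// stage 2 reduces to single multiplies, and the later stages run the full
// butterfly network with clamped add/sub pairs.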
3498 0 : static void iadst16_low8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
3499 : int32_t bd, int32_t out_shift) {
3500 0 : const int32_t *cospi = cospi_arr(bit);
3501 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3502 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3503 0 : const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3504 0 : const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3505 0 : const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
3506 0 : const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
3507 0 : const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
3508 0 : const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
3509 0 : const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
3510 0 : const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
3511 0 : const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
3512 0 : const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
3513 0 : const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
3514 0 : const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3515 0 : const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
3516 0 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3517 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3518 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3519 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3520 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3521 0 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3522 0 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3523 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3524 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3525 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3526 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3527 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3528 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3529 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3530 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3531 : __m256i u[16], x, y;
3532 :
3533 : {
3534 : // stage 0
3535 : // stage 1
3536 : // stage 2
3537 0 : __m256i zero = _mm256_setzero_si256();
3538 0 : x = _mm256_mullo_epi32(in[0], cospi62);
3539 0 : u[0] = _mm256_add_epi32(x, rnding);
3540 0 : u[0] = _mm256_srai_epi32(u[0], bit);
3541 :
3542 0 : x = _mm256_mullo_epi32(in[0], cospi2);
3543 0 : u[1] = _mm256_sub_epi32(zero, x);
3544 0 : u[1] = _mm256_add_epi32(u[1], rnding);
3545 0 : u[1] = _mm256_srai_epi32(u[1], bit);
3546 :
3547 0 : x = _mm256_mullo_epi32(in[2], cospi54);
3548 0 : u[2] = _mm256_add_epi32(x, rnding);
3549 0 : u[2] = _mm256_srai_epi32(u[2], bit);
3550 :
3551 0 : x = _mm256_mullo_epi32(in[2], cospi10);
3552 0 : u[3] = _mm256_sub_epi32(zero, x);
3553 0 : u[3] = _mm256_add_epi32(u[3], rnding);
3554 0 : u[3] = _mm256_srai_epi32(u[3], bit);
3555 :
3556 0 : x = _mm256_mullo_epi32(in[4], cospi46);
3557 0 : u[4] = _mm256_add_epi32(x, rnding);
3558 0 : u[4] = _mm256_srai_epi32(u[4], bit);
3559 :
3560 0 : x = _mm256_mullo_epi32(in[4], cospi18);
3561 0 : u[5] = _mm256_sub_epi32(zero, x);
3562 0 : u[5] = _mm256_add_epi32(u[5], rnding);
3563 0 : u[5] = _mm256_srai_epi32(u[5], bit);
3564 :
3565 0 : x = _mm256_mullo_epi32(in[6], cospi38);
3566 0 : u[6] = _mm256_add_epi32(x, rnding);
3567 0 : u[6] = _mm256_srai_epi32(u[6], bit);
3568 :
3569 0 : x = _mm256_mullo_epi32(in[6], cospi26);
3570 0 : u[7] = _mm256_sub_epi32(zero, x);
3571 0 : u[7] = _mm256_add_epi32(u[7], rnding);
3572 0 : u[7] = _mm256_srai_epi32(u[7], bit);
3573 :
3574 0 : u[8] = _mm256_mullo_epi32(in[7], cospi34);
3575 0 : u[8] = _mm256_add_epi32(u[8], rnding);
3576 0 : u[8] = _mm256_srai_epi32(u[8], bit);
3577 :
3578 0 : u[9] = _mm256_mullo_epi32(in[7], cospi30);
3579 0 : u[9] = _mm256_add_epi32(u[9], rnding);
3580 0 : u[9] = _mm256_srai_epi32(u[9], bit);
3581 :
3582 0 : u[10] = _mm256_mullo_epi32(in[5], cospi42);
3583 0 : u[10] = _mm256_add_epi32(u[10], rnding);
3584 0 : u[10] = _mm256_srai_epi32(u[10], bit);
3585 :
3586 0 : u[11] = _mm256_mullo_epi32(in[5], cospi22);
3587 0 : u[11] = _mm256_add_epi32(u[11], rnding);
3588 0 : u[11] = _mm256_srai_epi32(u[11], bit);
3589 :
3590 0 : u[12] = _mm256_mullo_epi32(in[3], cospi50);
3591 0 : u[12] = _mm256_add_epi32(u[12], rnding);
3592 0 : u[12] = _mm256_srai_epi32(u[12], bit);
3593 :
3594 0 : u[13] = _mm256_mullo_epi32(in[3], cospi14);
3595 0 : u[13] = _mm256_add_epi32(u[13], rnding);
3596 0 : u[13] = _mm256_srai_epi32(u[13], bit);
3597 :
3598 0 : u[14] = _mm256_mullo_epi32(in[1], cospi58);
3599 0 : u[14] = _mm256_add_epi32(u[14], rnding);
3600 0 : u[14] = _mm256_srai_epi32(u[14], bit);
3601 :
3602 0 : u[15] = _mm256_mullo_epi32(in[1], cospi6);
3603 0 : u[15] = _mm256_add_epi32(u[15], rnding);
3604 0 : u[15] = _mm256_srai_epi32(u[15], bit);
3605 :
3606 : // stage 3
3607 0 : addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
3608 0 : addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
3609 0 : addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
3610 0 : addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
3611 0 : addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
3612 0 : addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
3613 0 : addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
3614 0 : addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
3615 :
3616 : // stage 4
3617 0 : y = _mm256_mullo_epi32(u[8], cospi56);
3618 0 : x = _mm256_mullo_epi32(u[9], cospi56);
3619 0 : u[8] = _mm256_mullo_epi32(u[8], cospi8);
3620 0 : u[8] = _mm256_add_epi32(u[8], x);
3621 0 : u[8] = _mm256_add_epi32(u[8], rnding);
3622 0 : u[8] = _mm256_srai_epi32(u[8], bit);
3623 :
3624 0 : x = _mm256_mullo_epi32(u[9], cospi8);
3625 0 : u[9] = _mm256_sub_epi32(y, x);
3626 0 : u[9] = _mm256_add_epi32(u[9], rnding);
3627 0 : u[9] = _mm256_srai_epi32(u[9], bit);
3628 :
3629 0 : x = _mm256_mullo_epi32(u[11], cospi24);
3630 0 : y = _mm256_mullo_epi32(u[10], cospi24);
3631 0 : u[10] = _mm256_mullo_epi32(u[10], cospi40);
3632 0 : u[10] = _mm256_add_epi32(u[10], x);
3633 0 : u[10] = _mm256_add_epi32(u[10], rnding);
3634 0 : u[10] = _mm256_srai_epi32(u[10], bit);
3635 :
3636 0 : x = _mm256_mullo_epi32(u[11], cospi40);
3637 0 : u[11] = _mm256_sub_epi32(y, x);
3638 0 : u[11] = _mm256_add_epi32(u[11], rnding);
3639 0 : u[11] = _mm256_srai_epi32(u[11], bit);
3640 :
3641 0 : x = _mm256_mullo_epi32(u[13], cospi8);
3642 0 : y = _mm256_mullo_epi32(u[12], cospi8);
3643 0 : u[12] = _mm256_mullo_epi32(u[12], cospim56);
3644 0 : u[12] = _mm256_add_epi32(u[12], x);
3645 0 : u[12] = _mm256_add_epi32(u[12], rnding);
3646 0 : u[12] = _mm256_srai_epi32(u[12], bit);
3647 :
3648 0 : x = _mm256_mullo_epi32(u[13], cospim56);
3649 0 : u[13] = _mm256_sub_epi32(y, x);
3650 0 : u[13] = _mm256_add_epi32(u[13], rnding);
3651 0 : u[13] = _mm256_srai_epi32(u[13], bit);
3652 :
3653 0 : x = _mm256_mullo_epi32(u[15], cospi40);
3654 0 : y = _mm256_mullo_epi32(u[14], cospi40);
3655 0 : u[14] = _mm256_mullo_epi32(u[14], cospim24);
3656 0 : u[14] = _mm256_add_epi32(u[14], x);
3657 0 : u[14] = _mm256_add_epi32(u[14], rnding);
3658 0 : u[14] = _mm256_srai_epi32(u[14], bit);
3659 :
3660 0 : x = _mm256_mullo_epi32(u[15], cospim24);
3661 0 : u[15] = _mm256_sub_epi32(y, x);
3662 0 : u[15] = _mm256_add_epi32(u[15], rnding);
3663 0 : u[15] = _mm256_srai_epi32(u[15], bit);
3664 :
3665 : // stage 5
3666 0 : addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
3667 0 : addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
3668 0 : addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
3669 0 : addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
3670 0 : addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
3671 0 : addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
3672 0 : addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
3673 0 : addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
3674 :
3675 : // stage 6
3676 0 : x = _mm256_mullo_epi32(u[5], cospi48);
3677 0 : y = _mm256_mullo_epi32(u[4], cospi48);
3678 0 : u[4] = _mm256_mullo_epi32(u[4], cospi16);
3679 0 : u[4] = _mm256_add_epi32(u[4], x);
3680 0 : u[4] = _mm256_add_epi32(u[4], rnding);
3681 0 : u[4] = _mm256_srai_epi32(u[4], bit);
3682 :
3683 0 : x = _mm256_mullo_epi32(u[5], cospi16);
3684 0 : u[5] = _mm256_sub_epi32(y, x);
3685 0 : u[5] = _mm256_add_epi32(u[5], rnding);
3686 0 : u[5] = _mm256_srai_epi32(u[5], bit);
3687 :
3688 0 : x = _mm256_mullo_epi32(u[7], cospi16);
3689 0 : y = _mm256_mullo_epi32(u[6], cospi16);
3690 0 : u[6] = _mm256_mullo_epi32(u[6], cospim48);
3691 0 : u[6] = _mm256_add_epi32(u[6], x);
3692 0 : u[6] = _mm256_add_epi32(u[6], rnding);
3693 0 : u[6] = _mm256_srai_epi32(u[6], bit);
3694 :
3695 0 : x = _mm256_mullo_epi32(u[7], cospim48);
3696 0 : u[7] = _mm256_sub_epi32(y, x);
3697 0 : u[7] = _mm256_add_epi32(u[7], rnding);
3698 0 : u[7] = _mm256_srai_epi32(u[7], bit);
3699 :
3700 0 : x = _mm256_mullo_epi32(u[13], cospi48);
3701 0 : y = _mm256_mullo_epi32(u[12], cospi48);
3702 0 : u[12] = _mm256_mullo_epi32(u[12], cospi16);
3703 0 : u[12] = _mm256_add_epi32(u[12], x);
3704 0 : u[12] = _mm256_add_epi32(u[12], rnding);
3705 0 : u[12] = _mm256_srai_epi32(u[12], bit);
3706 :
3707 0 : x = _mm256_mullo_epi32(u[13], cospi16);
3708 0 : u[13] = _mm256_sub_epi32(y, x);
3709 0 : u[13] = _mm256_add_epi32(u[13], rnding);
3710 0 : u[13] = _mm256_srai_epi32(u[13], bit);
3711 :
3712 0 : x = _mm256_mullo_epi32(u[15], cospi16);
3713 0 : y = _mm256_mullo_epi32(u[14], cospi16);
3714 0 : u[14] = _mm256_mullo_epi32(u[14], cospim48);
3715 0 : u[14] = _mm256_add_epi32(u[14], x);
3716 0 : u[14] = _mm256_add_epi32(u[14], rnding);
3717 0 : u[14] = _mm256_srai_epi32(u[14], bit);
3718 :
3719 0 : x = _mm256_mullo_epi32(u[15], cospim48);
3720 0 : u[15] = _mm256_sub_epi32(y, x);
3721 0 : u[15] = _mm256_add_epi32(u[15], rnding);
3722 0 : u[15] = _mm256_srai_epi32(u[15], bit);
3723 :
3724 : // stage 7
3725 0 : addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
3726 0 : addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
3727 0 : addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
3728 0 : addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
3729 0 : addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
3730 0 : addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
3731 0 : addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
3732 0 : addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
3733 :
3734 : // stage 8
3735 0 : y = _mm256_mullo_epi32(u[2], cospi32);
3736 0 : x = _mm256_mullo_epi32(u[3], cospi32);
3737 0 : u[2] = _mm256_add_epi32(y, x);
3738 0 : u[2] = _mm256_add_epi32(u[2], rnding);
3739 0 : u[2] = _mm256_srai_epi32(u[2], bit);
3740 :
3741 0 : u[3] = _mm256_sub_epi32(y, x);
3742 0 : u[3] = _mm256_add_epi32(u[3], rnding);
3743 0 : u[3] = _mm256_srai_epi32(u[3], bit);
3744 0 : y = _mm256_mullo_epi32(u[6], cospi32);
3745 0 : x = _mm256_mullo_epi32(u[7], cospi32);
3746 0 : u[6] = _mm256_add_epi32(y, x);
3747 0 : u[6] = _mm256_add_epi32(u[6], rnding);
3748 0 : u[6] = _mm256_srai_epi32(u[6], bit);
3749 :
3750 0 : u[7] = _mm256_sub_epi32(y, x);
3751 0 : u[7] = _mm256_add_epi32(u[7], rnding);
3752 0 : u[7] = _mm256_srai_epi32(u[7], bit);
3753 :
3754 0 : y = _mm256_mullo_epi32(u[10], cospi32);
3755 0 : x = _mm256_mullo_epi32(u[11], cospi32);
3756 0 : u[10] = _mm256_add_epi32(y, x);
3757 0 : u[10] = _mm256_add_epi32(u[10], rnding);
3758 0 : u[10] = _mm256_srai_epi32(u[10], bit);
3759 :
3760 0 : u[11] = _mm256_sub_epi32(y, x);
3761 0 : u[11] = _mm256_add_epi32(u[11], rnding);
3762 0 : u[11] = _mm256_srai_epi32(u[11], bit);
3763 :
3764 0 : y = _mm256_mullo_epi32(u[14], cospi32);
3765 0 : x = _mm256_mullo_epi32(u[15], cospi32);
3766 0 : u[14] = _mm256_add_epi32(y, x);
3767 0 : u[14] = _mm256_add_epi32(u[14], rnding);
3768 0 : u[14] = _mm256_srai_epi32(u[14], bit);
3769 :
3770 0 : u[15] = _mm256_sub_epi32(y, x);
3771 0 : u[15] = _mm256_add_epi32(u[15], rnding);
3772 0 : u[15] = _mm256_srai_epi32(u[15], bit);
3773 :
3774 : // stage 9
3775 0 : if (do_cols) {
3776 0 : out[0] = u[0];
3777 0 : out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
3778 0 : out[2] = u[12];
3779 0 : out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
3780 0 : out[4] = u[6];
3781 0 : out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
3782 0 : out[6] = u[10];
3783 0 : out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
3784 0 : out[8] = u[3];
3785 0 : out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
3786 0 : out[10] = u[15];
3787 0 : out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
3788 0 : out[12] = u[5];
3789 0 : out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
3790 0 : out[14] = u[9];
3791 0 : out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
3792 : }
3793 : else {
3794 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
3795 0 : const __m256i clamp_lo_out =
3796 0 : _mm256_set1_epi32(-(1 << (log_range_out - 1)));
3797 0 : const __m256i clamp_hi_out =
3798 0 : _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
3799 :
3800 0 : neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
3801 : out_shift);
3802 0 : neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
3803 : &clamp_hi_out, out_shift);
3804 0 : neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
3805 : &clamp_hi_out, out_shift);
3806 0 : neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
3807 : &clamp_hi_out, out_shift);
3808 0 : neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
3809 : &clamp_hi_out, out_shift);
3810 0 : neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
3811 : &clamp_hi_out, out_shift);
3812 0 : neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
3813 : &clamp_hi_out, out_shift);
3814 0 : neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
3815 : &clamp_hi_out, out_shift);
3816 : }
3817 : }
3818 0 : }
3819 :
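// iadst16_avx2: full 16-point inverse ADST; stage 2 forms cos/sin products from
// all sixteen inputs, after which the remaining stages follow the same butterfly
// structure as the low8 variant above.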
3820 0 : static void iadst16_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
3821 : int32_t bd, int32_t out_shift) {
3822 0 : const int32_t *cospi = cospi_arr(bit);
3823 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3824 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3825 0 : const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3826 0 : const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3827 0 : const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
3828 0 : const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
3829 0 : const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
3830 0 : const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
3831 0 : const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
3832 0 : const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
3833 0 : const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
3834 0 : const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
3835 0 : const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
3836 0 : const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3837 0 : const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
3838 0 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3839 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3840 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3841 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3842 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3843 0 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3844 0 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3845 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3846 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3847 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3848 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3849 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3850 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3851 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3852 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3853 : __m256i u[16], v[16], x, y;
3854 :
3855 : {
3856 : // stage 0
3857 : // stage 1
3858 : // stage 2
3859 0 : v[0] = _mm256_mullo_epi32(in[15], cospi2);
3860 0 : x = _mm256_mullo_epi32(in[0], cospi62);
3861 0 : v[0] = _mm256_add_epi32(v[0], x);
3862 0 : v[0] = _mm256_add_epi32(v[0], rnding);
3863 0 : v[0] = _mm256_srai_epi32(v[0], bit);
3864 :
3865 0 : v[1] = _mm256_mullo_epi32(in[15], cospi62);
3866 0 : x = _mm256_mullo_epi32(in[0], cospi2);
3867 0 : v[1] = _mm256_sub_epi32(v[1], x);
3868 0 : v[1] = _mm256_add_epi32(v[1], rnding);
3869 0 : v[1] = _mm256_srai_epi32(v[1], bit);
3870 :
3871 0 : v[2] = _mm256_mullo_epi32(in[13], cospi10);
3872 0 : x = _mm256_mullo_epi32(in[2], cospi54);
3873 0 : v[2] = _mm256_add_epi32(v[2], x);
3874 0 : v[2] = _mm256_add_epi32(v[2], rnding);
3875 0 : v[2] = _mm256_srai_epi32(v[2], bit);
3876 :
3877 0 : v[3] = _mm256_mullo_epi32(in[13], cospi54);
3878 0 : x = _mm256_mullo_epi32(in[2], cospi10);
3879 0 : v[3] = _mm256_sub_epi32(v[3], x);
3880 0 : v[3] = _mm256_add_epi32(v[3], rnding);
3881 0 : v[3] = _mm256_srai_epi32(v[3], bit);
3882 :
3883 0 : v[4] = _mm256_mullo_epi32(in[11], cospi18);
3884 0 : x = _mm256_mullo_epi32(in[4], cospi46);
3885 0 : v[4] = _mm256_add_epi32(v[4], x);
3886 0 : v[4] = _mm256_add_epi32(v[4], rnding);
3887 0 : v[4] = _mm256_srai_epi32(v[4], bit);
3888 :
3889 0 : v[5] = _mm256_mullo_epi32(in[11], cospi46);
3890 0 : x = _mm256_mullo_epi32(in[4], cospi18);
3891 0 : v[5] = _mm256_sub_epi32(v[5], x);
3892 0 : v[5] = _mm256_add_epi32(v[5], rnding);
3893 0 : v[5] = _mm256_srai_epi32(v[5], bit);
3894 :
3895 0 : v[6] = _mm256_mullo_epi32(in[9], cospi26);
3896 0 : x = _mm256_mullo_epi32(in[6], cospi38);
3897 0 : v[6] = _mm256_add_epi32(v[6], x);
3898 0 : v[6] = _mm256_add_epi32(v[6], rnding);
3899 0 : v[6] = _mm256_srai_epi32(v[6], bit);
3900 :
3901 0 : v[7] = _mm256_mullo_epi32(in[9], cospi38);
3902 0 : x = _mm256_mullo_epi32(in[6], cospi26);
3903 0 : v[7] = _mm256_sub_epi32(v[7], x);
3904 0 : v[7] = _mm256_add_epi32(v[7], rnding);
3905 0 : v[7] = _mm256_srai_epi32(v[7], bit);
3906 :
3907 0 : v[8] = _mm256_mullo_epi32(in[7], cospi34);
3908 0 : x = _mm256_mullo_epi32(in[8], cospi30);
3909 0 : v[8] = _mm256_add_epi32(v[8], x);
3910 0 : v[8] = _mm256_add_epi32(v[8], rnding);
3911 0 : v[8] = _mm256_srai_epi32(v[8], bit);
3912 :
3913 0 : v[9] = _mm256_mullo_epi32(in[7], cospi30);
3914 0 : x = _mm256_mullo_epi32(in[8], cospi34);
3915 0 : v[9] = _mm256_sub_epi32(v[9], x);
3916 0 : v[9] = _mm256_add_epi32(v[9], rnding);
3917 0 : v[9] = _mm256_srai_epi32(v[9], bit);
3918 :
3919 0 : v[10] = _mm256_mullo_epi32(in[5], cospi42);
3920 0 : x = _mm256_mullo_epi32(in[10], cospi22);
3921 0 : v[10] = _mm256_add_epi32(v[10], x);
3922 0 : v[10] = _mm256_add_epi32(v[10], rnding);
3923 0 : v[10] = _mm256_srai_epi32(v[10], bit);
3924 :
3925 0 : v[11] = _mm256_mullo_epi32(in[5], cospi22);
3926 0 : x = _mm256_mullo_epi32(in[10], cospi42);
3927 0 : v[11] = _mm256_sub_epi32(v[11], x);
3928 0 : v[11] = _mm256_add_epi32(v[11], rnding);
3929 0 : v[11] = _mm256_srai_epi32(v[11], bit);
3930 :
3931 0 : v[12] = _mm256_mullo_epi32(in[3], cospi50);
3932 0 : x = _mm256_mullo_epi32(in[12], cospi14);
3933 0 : v[12] = _mm256_add_epi32(v[12], x);
3934 0 : v[12] = _mm256_add_epi32(v[12], rnding);
3935 0 : v[12] = _mm256_srai_epi32(v[12], bit);
3936 :
3937 0 : v[13] = _mm256_mullo_epi32(in[3], cospi14);
3938 0 : x = _mm256_mullo_epi32(in[12], cospi50);
3939 0 : v[13] = _mm256_sub_epi32(v[13], x);
3940 0 : v[13] = _mm256_add_epi32(v[13], rnding);
3941 0 : v[13] = _mm256_srai_epi32(v[13], bit);
3942 :
3943 0 : v[14] = _mm256_mullo_epi32(in[1], cospi58);
3944 0 : x = _mm256_mullo_epi32(in[14], cospi6);
3945 0 : v[14] = _mm256_add_epi32(v[14], x);
3946 0 : v[14] = _mm256_add_epi32(v[14], rnding);
3947 0 : v[14] = _mm256_srai_epi32(v[14], bit);
3948 :
3949 0 : v[15] = _mm256_mullo_epi32(in[1], cospi6);
3950 0 : x = _mm256_mullo_epi32(in[14], cospi58);
3951 0 : v[15] = _mm256_sub_epi32(v[15], x);
3952 0 : v[15] = _mm256_add_epi32(v[15], rnding);
3953 0 : v[15] = _mm256_srai_epi32(v[15], bit);
3954 :
3955 : // stage 3
3956 0 : addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
3957 0 : addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
3958 0 : addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
3959 0 : addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
3960 0 : addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
3961 0 : addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
3962 0 : addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
3963 0 : addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
3964 :
3965 : // stage 4
3966 0 : v[0] = u[0];
3967 0 : v[1] = u[1];
3968 0 : v[2] = u[2];
3969 0 : v[3] = u[3];
3970 0 : v[4] = u[4];
3971 0 : v[5] = u[5];
3972 0 : v[6] = u[6];
3973 0 : v[7] = u[7];
3974 :
3975 0 : v[8] = _mm256_mullo_epi32(u[8], cospi8);
3976 0 : x = _mm256_mullo_epi32(u[9], cospi56);
3977 0 : v[8] = _mm256_add_epi32(v[8], x);
3978 0 : v[8] = _mm256_add_epi32(v[8], rnding);
3979 0 : v[8] = _mm256_srai_epi32(v[8], bit);
3980 :
3981 0 : v[9] = _mm256_mullo_epi32(u[8], cospi56);
3982 0 : x = _mm256_mullo_epi32(u[9], cospi8);
3983 0 : v[9] = _mm256_sub_epi32(v[9], x);
3984 0 : v[9] = _mm256_add_epi32(v[9], rnding);
3985 0 : v[9] = _mm256_srai_epi32(v[9], bit);
3986 :
3987 0 : v[10] = _mm256_mullo_epi32(u[10], cospi40);
3988 0 : x = _mm256_mullo_epi32(u[11], cospi24);
3989 0 : v[10] = _mm256_add_epi32(v[10], x);
3990 0 : v[10] = _mm256_add_epi32(v[10], rnding);
3991 0 : v[10] = _mm256_srai_epi32(v[10], bit);
3992 :
3993 0 : v[11] = _mm256_mullo_epi32(u[10], cospi24);
3994 0 : x = _mm256_mullo_epi32(u[11], cospi40);
3995 0 : v[11] = _mm256_sub_epi32(v[11], x);
3996 0 : v[11] = _mm256_add_epi32(v[11], rnding);
3997 0 : v[11] = _mm256_srai_epi32(v[11], bit);
3998 :
3999 0 : v[12] = _mm256_mullo_epi32(u[12], cospim56);
4000 0 : x = _mm256_mullo_epi32(u[13], cospi8);
4001 0 : v[12] = _mm256_add_epi32(v[12], x);
4002 0 : v[12] = _mm256_add_epi32(v[12], rnding);
4003 0 : v[12] = _mm256_srai_epi32(v[12], bit);
4004 :
4005 0 : v[13] = _mm256_mullo_epi32(u[12], cospi8);
4006 0 : x = _mm256_mullo_epi32(u[13], cospim56);
4007 0 : v[13] = _mm256_sub_epi32(v[13], x);
4008 0 : v[13] = _mm256_add_epi32(v[13], rnding);
4009 0 : v[13] = _mm256_srai_epi32(v[13], bit);
4010 :
4011 0 : v[14] = _mm256_mullo_epi32(u[14], cospim24);
4012 0 : x = _mm256_mullo_epi32(u[15], cospi40);
4013 0 : v[14] = _mm256_add_epi32(v[14], x);
4014 0 : v[14] = _mm256_add_epi32(v[14], rnding);
4015 0 : v[14] = _mm256_srai_epi32(v[14], bit);
4016 :
4017 0 : v[15] = _mm256_mullo_epi32(u[14], cospi40);
4018 0 : x = _mm256_mullo_epi32(u[15], cospim24);
4019 0 : v[15] = _mm256_sub_epi32(v[15], x);
4020 0 : v[15] = _mm256_add_epi32(v[15], rnding);
4021 0 : v[15] = _mm256_srai_epi32(v[15], bit);
4022 :
4023 : // stage 5
4024 0 : addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
4025 0 : addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
4026 0 : addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
4027 0 : addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
4028 0 : addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
4029 0 : addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
4030 0 : addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
4031 0 : addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
4032 :
4033 : // stage 6
4034 0 : v[0] = u[0];
4035 0 : v[1] = u[1];
4036 0 : v[2] = u[2];
4037 0 : v[3] = u[3];
4038 :
4039 0 : v[4] = _mm256_mullo_epi32(u[4], cospi16);
4040 0 : x = _mm256_mullo_epi32(u[5], cospi48);
4041 0 : v[4] = _mm256_add_epi32(v[4], x);
4042 0 : v[4] = _mm256_add_epi32(v[4], rnding);
4043 0 : v[4] = _mm256_srai_epi32(v[4], bit);
4044 :
4045 0 : v[5] = _mm256_mullo_epi32(u[4], cospi48);
4046 0 : x = _mm256_mullo_epi32(u[5], cospi16);
4047 0 : v[5] = _mm256_sub_epi32(v[5], x);
4048 0 : v[5] = _mm256_add_epi32(v[5], rnding);
4049 0 : v[5] = _mm256_srai_epi32(v[5], bit);
4050 :
4051 0 : v[6] = _mm256_mullo_epi32(u[6], cospim48);
4052 0 : x = _mm256_mullo_epi32(u[7], cospi16);
4053 0 : v[6] = _mm256_add_epi32(v[6], x);
4054 0 : v[6] = _mm256_add_epi32(v[6], rnding);
4055 0 : v[6] = _mm256_srai_epi32(v[6], bit);
4056 :
4057 0 : v[7] = _mm256_mullo_epi32(u[6], cospi16);
4058 0 : x = _mm256_mullo_epi32(u[7], cospim48);
4059 0 : v[7] = _mm256_sub_epi32(v[7], x);
4060 0 : v[7] = _mm256_add_epi32(v[7], rnding);
4061 0 : v[7] = _mm256_srai_epi32(v[7], bit);
4062 :
4063 0 : v[8] = u[8];
4064 0 : v[9] = u[9];
4065 0 : v[10] = u[10];
4066 0 : v[11] = u[11];
4067 :
4068 0 : v[12] = _mm256_mullo_epi32(u[12], cospi16);
4069 0 : x = _mm256_mullo_epi32(u[13], cospi48);
4070 0 : v[12] = _mm256_add_epi32(v[12], x);
4071 0 : v[12] = _mm256_add_epi32(v[12], rnding);
4072 0 : v[12] = _mm256_srai_epi32(v[12], bit);
4073 :
4074 0 : v[13] = _mm256_mullo_epi32(u[12], cospi48);
4075 0 : x = _mm256_mullo_epi32(u[13], cospi16);
4076 0 : v[13] = _mm256_sub_epi32(v[13], x);
4077 0 : v[13] = _mm256_add_epi32(v[13], rnding);
4078 0 : v[13] = _mm256_srai_epi32(v[13], bit);
4079 :
4080 0 : v[14] = _mm256_mullo_epi32(u[14], cospim48);
4081 0 : x = _mm256_mullo_epi32(u[15], cospi16);
4082 0 : v[14] = _mm256_add_epi32(v[14], x);
4083 0 : v[14] = _mm256_add_epi32(v[14], rnding);
4084 0 : v[14] = _mm256_srai_epi32(v[14], bit);
4085 :
4086 0 : v[15] = _mm256_mullo_epi32(u[14], cospi16);
4087 0 : x = _mm256_mullo_epi32(u[15], cospim48);
4088 0 : v[15] = _mm256_sub_epi32(v[15], x);
4089 0 : v[15] = _mm256_add_epi32(v[15], rnding);
4090 0 : v[15] = _mm256_srai_epi32(v[15], bit);
4091 :
4092 : // stage 7
4093 0 : addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
4094 0 : addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
4095 0 : addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
4096 0 : addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
4097 0 : addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
4098 0 : addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
4099 0 : addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
4100 0 : addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
4101 :
4102 : // stage 8
4103 0 : v[0] = u[0];
4104 0 : v[1] = u[1];
4105 :
4106 0 : y = _mm256_mullo_epi32(u[2], cospi32);
4107 0 : x = _mm256_mullo_epi32(u[3], cospi32);
4108 0 : v[2] = _mm256_add_epi32(y, x);
4109 0 : v[2] = _mm256_add_epi32(v[2], rnding);
4110 0 : v[2] = _mm256_srai_epi32(v[2], bit);
4111 :
4112 0 : v[3] = _mm256_sub_epi32(y, x);
4113 0 : v[3] = _mm256_add_epi32(v[3], rnding);
4114 0 : v[3] = _mm256_srai_epi32(v[3], bit);
4115 :
4116 0 : v[4] = u[4];
4117 0 : v[5] = u[5];
4118 :
4119 0 : y = _mm256_mullo_epi32(u[6], cospi32);
4120 0 : x = _mm256_mullo_epi32(u[7], cospi32);
4121 0 : v[6] = _mm256_add_epi32(y, x);
4122 0 : v[6] = _mm256_add_epi32(v[6], rnding);
4123 0 : v[6] = _mm256_srai_epi32(v[6], bit);
4124 :
4125 0 : v[7] = _mm256_sub_epi32(y, x);
4126 0 : v[7] = _mm256_add_epi32(v[7], rnding);
4127 0 : v[7] = _mm256_srai_epi32(v[7], bit);
4128 :
4129 0 : v[8] = u[8];
4130 0 : v[9] = u[9];
4131 :
4132 0 : y = _mm256_mullo_epi32(u[10], cospi32);
4133 0 : x = _mm256_mullo_epi32(u[11], cospi32);
4134 0 : v[10] = _mm256_add_epi32(y, x);
4135 0 : v[10] = _mm256_add_epi32(v[10], rnding);
4136 0 : v[10] = _mm256_srai_epi32(v[10], bit);
4137 :
4138 0 : v[11] = _mm256_sub_epi32(y, x);
4139 0 : v[11] = _mm256_add_epi32(v[11], rnding);
4140 0 : v[11] = _mm256_srai_epi32(v[11], bit);
4141 :
4142 0 : v[12] = u[12];
4143 0 : v[13] = u[13];
4144 :
4145 0 : y = _mm256_mullo_epi32(u[14], cospi32);
4146 0 : x = _mm256_mullo_epi32(u[15], cospi32);
4147 0 : v[14] = _mm256_add_epi32(y, x);
4148 0 : v[14] = _mm256_add_epi32(v[14], rnding);
4149 0 : v[14] = _mm256_srai_epi32(v[14], bit);
4150 :
4151 0 : v[15] = _mm256_sub_epi32(y, x);
4152 0 : v[15] = _mm256_add_epi32(v[15], rnding);
4153 0 : v[15] = _mm256_srai_epi32(v[15], bit);
4154 :
4155 : // stage 9
4156 0 : if (do_cols) {
4157 0 : out[0] = v[0];
4158 0 : out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
4159 0 : out[2] = v[12];
4160 0 : out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
4161 0 : out[4] = v[6];
4162 0 : out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
4163 0 : out[6] = v[10];
4164 0 : out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
4165 0 : out[8] = v[3];
4166 0 : out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
4167 0 : out[10] = v[15];
4168 0 : out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
4169 0 : out[12] = v[5];
4170 0 : out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
4171 0 : out[14] = v[9];
4172 0 : out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
4173 : }
4174 : else {
4175 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
4176 0 : const __m256i clamp_lo_out =
4177 0 : _mm256_set1_epi32(-(1 << (log_range_out - 1)));
4178 0 : const __m256i clamp_hi_out =
4179 0 : _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
4180 :
4181 0 : neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
4182 : out_shift);
4183 0 : neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
4184 : &clamp_hi_out, out_shift);
4185 0 : neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
4186 : &clamp_hi_out, out_shift);
4187 0 : neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
4188 : &clamp_hi_out, out_shift);
4189 0 : neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
4190 : &clamp_hi_out, out_shift);
4191 0 : neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
4192 : &clamp_hi_out, out_shift);
4193 0 : neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
4194 : &clamp_hi_out, out_shift);
4195 0 : neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
4196 : &clamp_hi_out, out_shift);
4197 : }
4198 : }
4199 0 : }
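
iadst16_avx2 above is built almost entirely from one primitive: a fixed-point rotation that combines two inputs with a cosine pair, rounds, and shifts by `bit`. A scalar sketch of that rotation, the pattern behind every v[2k]/v[2k+1] pair in stage 2 and the cospi8/cospi56-style pairs in later stages, is shown below; it models the arithmetic and is not code taken from this file.

/* y0 = (c0*x0 + c1*x1 + round) >> bit ; y1 = (c1*x0 - c0*x1 + round) >> bit */
static void btf_rotate_scalar(int64_t c0, int64_t c1, int32_t x0, int32_t x1,
                              int32_t *y0, int32_t *y1, int32_t bit) {
  const int64_t rnd = 1LL << (bit - 1);
  *y0 = (int32_t)((c0 * x0 + c1 * x1 + rnd) >> bit);
  *y1 = (int32_t)((c1 * x0 - c0 * x1 + rnd) >> bit);
}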
4200 0 : static void iidentity16_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
4201 : int32_t bd, int32_t out_shift) {
4202 : (void)bit;
4203 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4204 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
4205 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
4206 : __m256i v[16];
4207 0 : __m256i fact = _mm256_set1_epi32(2 * NewSqrt2);
4208 0 : __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
4209 : __m256i a0, a1, a2, a3;
4210 :
4211 0 : for (int32_t i = 0; i < 16; i += 8) {
4212 0 : a0 = _mm256_mullo_epi32(in[i], fact);
4213 0 : a1 = _mm256_mullo_epi32(in[i + 1], fact);
4214 0 : a0 = _mm256_add_epi32(a0, offset);
4215 0 : a1 = _mm256_add_epi32(a1, offset);
4216 0 : v[i] = _mm256_srai_epi32(a0, NewSqrt2Bits);
4217 0 : v[i + 1] = _mm256_srai_epi32(a1, NewSqrt2Bits);
4218 :
4219 0 : a2 = _mm256_mullo_epi32(in[i + 2], fact);
4220 0 : a3 = _mm256_mullo_epi32(in[i + 3], fact);
4221 0 : a2 = _mm256_add_epi32(a2, offset);
4222 0 : a3 = _mm256_add_epi32(a3, offset);
4223 0 : v[i + 2] = _mm256_srai_epi32(a2, NewSqrt2Bits);
4224 0 : v[i + 3] = _mm256_srai_epi32(a3, NewSqrt2Bits);
4225 :
4226 0 : a0 = _mm256_mullo_epi32(in[i + 4], fact);
4227 0 : a1 = _mm256_mullo_epi32(in[i + 5], fact);
4228 0 : a0 = _mm256_add_epi32(a0, offset);
4229 0 : a1 = _mm256_add_epi32(a1, offset);
4230 0 : v[i + 4] = _mm256_srai_epi32(a0, NewSqrt2Bits);
4231 0 : v[i + 5] = _mm256_srai_epi32(a1, NewSqrt2Bits);
4232 :
4233 0 : a2 = _mm256_mullo_epi32(in[i + 6], fact);
4234 0 : a3 = _mm256_mullo_epi32(in[i + 7], fact);
4235 0 : a2 = _mm256_add_epi32(a2, offset);
4236 0 : a3 = _mm256_add_epi32(a3, offset);
4237 0 : v[i + 6] = _mm256_srai_epi32(a2, NewSqrt2Bits);
4238 0 : v[i + 7] = _mm256_srai_epi32(a3, NewSqrt2Bits);
4239 : }
4240 :
4241 0 : if (!do_cols) {
4242 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
4243 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
4244 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
4245 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
4246 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
4247 :
4248 0 : shift_avx2(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 16);
4249 : }
4250 : else
4251 0 : highbd_clamp_epi32_avx2(v, out, &clamp_lo, &clamp_hi, 16);
4252 0 : }
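
iidentity16_avx2 performs no butterfly network at all: each coefficient is scaled by 2*sqrt(2) in fixed point. Assuming the usual AV1 convention that NewSqrt2 = round(sqrt(2) * 2^NewSqrt2Bits) with NewSqrt2Bits = 12 (an assumption; the constants are defined elsewhere), the per-coefficient operation reduces to the scalar sketch below.

/* Assumed constants: NewSqrt2Bits = 12, NewSqrt2 = 5793 = round(sqrt(2)*4096). */
static int32_t iidentity16_scalar(int32_t x) {
  const int32_t kNewSqrt2Bits = 12;
  const int32_t kNewSqrt2 = 5793;
  return (int32_t)(((int64_t)x * 2 * kNewSqrt2 + (1 << (kNewSqrt2Bits - 1))) >>
                   kNewSqrt2Bits);
}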
4253 0 : static void idct32_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
4254 : int32_t bd, int32_t out_shift) {
4255 0 : const int32_t *cospi = cospi_arr(bit);
4256 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
4257 0 : const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
4258 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4259 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
4260 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
4261 : __m256i x;
4262 : // stage 0
4263 : // stage 1
4264 : // stage 2
4265 : // stage 3
4266 : // stage 4
4267 : // stage 5
4268 0 : x = _mm256_mullo_epi32(in[0], cospi32);
4269 0 : x = _mm256_add_epi32(x, rounding);
4270 0 : x = _mm256_srai_epi32(x, bit);
4271 :
4272 : // stage 6
4273 : // stage 7
4274 : // stage 8
4275 : // stage 9
4276 0 : if (do_cols) {
4277 0 : x = _mm256_max_epi32(x, clamp_lo);
4278 0 : x = _mm256_min_epi32(x, clamp_hi);
4279 : }
4280 : else {
4281 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
4282 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
4283 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
4284 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
4285 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
4286 0 : __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
4287 0 : x = _mm256_add_epi32(offset, x);
4288 0 : x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
4289 0 : x = _mm256_max_epi32(x, clamp_lo_out);
4290 0 : x = _mm256_min_epi32(x, clamp_hi_out);
4291 : }
4292 :
4293 0 : out[0] = x;
4294 0 : out[1] = x;
4295 0 : out[2] = x;
4296 0 : out[3] = x;
4297 0 : out[4] = x;
4298 0 : out[5] = x;
4299 0 : out[6] = x;
4300 0 : out[7] = x;
4301 0 : out[8] = x;
4302 0 : out[9] = x;
4303 0 : out[10] = x;
4304 0 : out[11] = x;
4305 0 : out[12] = x;
4306 0 : out[13] = x;
4307 0 : out[14] = x;
4308 0 : out[15] = x;
4309 0 : out[16] = x;
4310 0 : out[17] = x;
4311 0 : out[18] = x;
4312 0 : out[19] = x;
4313 0 : out[20] = x;
4314 0 : out[21] = x;
4315 0 : out[22] = x;
4316 0 : out[23] = x;
4317 0 : out[24] = x;
4318 0 : out[25] = x;
4319 0 : out[26] = x;
4320 0 : out[27] = x;
4321 0 : out[28] = x;
4322 0 : out[29] = x;
4323 0 : out[30] = x;
4324 0 : out[31] = x;
4325 0 : }
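
idct32_low1_avx2 handles the DC-only case: with a single nonzero coefficient, every butterfly stage collapses to one multiply by cospi[32], an optional round/shift/clamp, and a broadcast to all 32 outputs. A scalar sketch of the column-pass path (do_cols != 0) follows; it is illustrative only.

static void idct32_dc_only_scalar(int32_t dc, int32_t cospi32, int32_t bit,
                                  int32_t lo, int32_t hi, int32_t out[32]) {
  int64_t x = ((int64_t)dc * cospi32 + (1LL << (bit - 1))) >> bit;
  if (x < lo) x = lo;
  if (x > hi) x = hi;
  for (int32_t i = 0; i < 32; ++i) out[i] = (int32_t)x;
}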
4326 :
4327 0 : static void idct32_low8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
4328 : int32_t bd, int32_t out_shift) {
4329 0 : const int32_t *cospi = cospi_arr(bit);
4330 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
4331 0 : const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
4332 0 : const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
4333 0 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
4334 0 : const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
4335 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
4336 0 : const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
4337 0 : const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
4338 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
4339 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
4340 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
4341 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
4342 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
4343 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
4344 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
4345 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
4346 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
4347 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
4348 0 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
4349 0 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
4350 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
4351 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
4352 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
4353 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
4354 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
4355 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
4356 0 : const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
4357 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4358 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
4359 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
4360 : __m256i bf1[32];
4361 :
4362 : {
4363 : // stage 0
4364 : // stage 1
4365 0 : bf1[0] = in[0];
4366 0 : bf1[4] = in[4];
4367 0 : bf1[8] = in[2];
4368 0 : bf1[12] = in[6];
4369 0 : bf1[16] = in[1];
4370 0 : bf1[20] = in[5];
4371 0 : bf1[24] = in[3];
4372 0 : bf1[28] = in[7];
4373 :
4374 : // stage 2
4375 0 : bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
4376 0 : bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
4377 0 : bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
4378 0 : bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
4379 0 : bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
4380 0 : bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
4381 0 : bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
4382 0 : bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
4383 :
4384 : // stage 3
4385 0 : bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
4386 0 : bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
4387 :
4388 0 : bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
4389 0 : bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
4390 0 : bf1[17] = bf1[16];
4391 0 : bf1[18] = bf1[19];
4392 0 : bf1[21] = bf1[20];
4393 0 : bf1[22] = bf1[23];
4394 0 : bf1[25] = bf1[24];
4395 0 : bf1[26] = bf1[27];
4396 0 : bf1[29] = bf1[28];
4397 0 : bf1[30] = bf1[31];
4398 :
4399 : // stage 4
4400 0 : bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
4401 0 : bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
4402 :
4403 0 : bf1[9] = bf1[8];
4404 0 : bf1[10] = bf1[11];
4405 0 : bf1[13] = bf1[12];
4406 0 : bf1[14] = bf1[15];
4407 :
4408 0 : idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
4409 : &cospi24, &cospi40, &cospim24, &rounding, bit);
4410 :
4411 : // stage 5
4412 0 : bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
4413 0 : bf1[1] = bf1[0];
4414 0 : bf1[5] = bf1[4];
4415 0 : bf1[6] = bf1[7];
4416 :
4417 0 : idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
4418 : &clamp_hi, &rounding, bit);
4419 :
4420 : // stage 6
4421 0 : bf1[3] = bf1[0];
4422 0 : bf1[2] = bf1[1];
4423 :
4424 0 : idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
4425 : &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
4426 :
4427 : // stage 7
4428 0 : idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4429 : &rounding, bit);
4430 :
4431 : // stage 8
4432 0 : idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4433 : &rounding, bit);
4434 :
4435 : // stage 9
4436 0 : idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
4437 : }
4438 0 : }
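
Throughout idct32_low8_avx2 (and the other IDCT kernels in this file), addsub_avx2 implements the butterfly add/subtract while keeping intermediates inside a signed range of log_range bits, where log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)). A scalar model of that clamped butterfly is sketched below.

static void addsub_clamp_scalar(int32_t a, int32_t b, int32_t *sum,
                                int32_t *diff, int32_t log_range) {
  const int64_t lo = -(1LL << (log_range - 1));
  const int64_t hi = (1LL << (log_range - 1)) - 1;
  int64_t s = (int64_t)a + b;
  int64_t d = (int64_t)a - b;
  *sum = (int32_t)(s < lo ? lo : (s > hi ? hi : s));
  *diff = (int32_t)(d < lo ? lo : (d > hi ? hi : d));
}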
4439 :
4440 0 : static void idct32_low16_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
4441 : int32_t bd, int32_t out_shift) {
4442 0 : const int32_t *cospi = cospi_arr(bit);
4443 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
4444 0 : const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
4445 0 : const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
4446 0 : const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
4447 0 : const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
4448 0 : const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
4449 0 : const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
4450 0 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
4451 0 : const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
4452 0 : const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
4453 0 : const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
4454 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
4455 0 : const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
4456 0 : const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
4457 0 : const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
4458 0 : const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
4459 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
4460 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
4461 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
4462 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
4463 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
4464 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
4465 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
4466 0 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
4467 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
4468 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
4469 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
4470 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
4471 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
4472 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
4473 0 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
4474 0 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
4475 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
4476 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
4477 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
4478 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
4479 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
4480 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
4481 0 : const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
4482 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4483 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
4484 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
4485 : __m256i bf1[32];
4486 :
4487 : {
4488 : // stage 0
4489 : // stage 1
4490 0 : bf1[0] = in[0];
4491 0 : bf1[2] = in[8];
4492 0 : bf1[4] = in[4];
4493 0 : bf1[6] = in[12];
4494 0 : bf1[8] = in[2];
4495 0 : bf1[10] = in[10];
4496 0 : bf1[12] = in[6];
4497 0 : bf1[14] = in[14];
4498 0 : bf1[16] = in[1];
4499 0 : bf1[18] = in[9];
4500 0 : bf1[20] = in[5];
4501 0 : bf1[22] = in[13];
4502 0 : bf1[24] = in[3];
4503 0 : bf1[26] = in[11];
4504 0 : bf1[28] = in[7];
4505 0 : bf1[30] = in[15];
4506 :
4507 : // stage 2
4508 0 : bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
4509 0 : bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
4510 0 : bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
4511 0 : bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
4512 0 : bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
4513 0 : bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
4514 0 : bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
4515 0 : bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
4516 0 : bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
4517 0 : bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
4518 0 : bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
4519 0 : bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
4520 0 : bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
4521 0 : bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
4522 0 : bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
4523 0 : bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
4524 :
4525 : // stage 3
4526 0 : bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
4527 0 : bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
4528 0 : bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
4529 0 : bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
4530 0 : bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
4531 0 : bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
4532 0 : bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
4533 0 : bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
4534 :
4535 0 : addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4536 0 : addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4537 0 : addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4538 0 : addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4539 0 : addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4540 0 : addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4541 0 : addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4542 0 : addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4543 :
4544 : // stage 4
4545 0 : bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
4546 0 : bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
4547 0 : bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
4548 0 : bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
4549 :
4550 0 : addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
4551 0 : addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
4552 0 : addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
4553 0 : addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
4554 :
4555 0 : idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
4556 : &cospi24, &cospi40, &cospim24, &rounding, bit);
4557 :
4558 : // stage 5
4559 0 : bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
4560 0 : bf1[1] = bf1[0];
4561 0 : bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
4562 0 : bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
4563 :
4564 0 : addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4565 0 : addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4566 :
4567 0 : idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
4568 : &clamp_hi, &rounding, bit);
4569 :
4570 : // stage 6
4571 0 : addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
4572 0 : addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
4573 :
4574 0 : idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
4575 : &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
4576 :
4577 : // stage 7
4578 0 : idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4579 : &rounding, bit);
4580 :
4581 : // stage 8
4582 0 : idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4583 : &rounding, bit);
4584 :
4585 : // stage 9
4586 0 : idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
4587 : }
4588 0 : }
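
The stage-2 through stage-5 computations in idct32_low16_avx2 rely on half_btf_0_avx2, which multiplies a vector by a single cosine constant and then rounds and shifts: the one-input degenerate form of the two-input butterfly. A scalar model follows; it is a sketch of the arithmetic, not the file's own helper.

static int32_t half_btf_0_scalar(int32_t w, int32_t x, int32_t bit) {
  return (int32_t)(((int64_t)w * x + (1LL << (bit - 1))) >> bit);
}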
4589 :
4590 0 : static void idct32_avx2_new(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols, int32_t bd,
4591 : int32_t out_shift) {
4592 0 : const int32_t *cospi = cospi_arr(bit);
4593 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
4594 0 : const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
4595 0 : const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
4596 0 : const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
4597 0 : const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
4598 0 : const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
4599 0 : const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
4600 0 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
4601 0 : const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
4602 0 : const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
4603 0 : const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
4604 0 : const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
4605 0 : const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
4606 0 : const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
4607 0 : const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
4608 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
4609 0 : const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
4610 0 : const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
4611 0 : const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
4612 0 : const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
4613 0 : const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
4614 0 : const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
4615 0 : const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
4616 0 : const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
4617 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
4618 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
4619 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
4620 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
4621 0 : const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
4622 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
4623 0 : const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
4624 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
4625 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
4626 0 : const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
4627 0 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
4628 0 : const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
4629 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
4630 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
4631 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
4632 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
4633 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
4634 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
4635 0 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
4636 0 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
4637 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
4638 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
4639 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
4640 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
4641 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
4642 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
4643 0 : const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
4644 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4645 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
4646 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
4647 : __m256i bf1[32], bf0[32];
4648 :
4649 : {
4650 : // stage 0
4651 : // stage 1
4652 0 : bf1[0] = in[0];
4653 0 : bf1[1] = in[16];
4654 0 : bf1[2] = in[8];
4655 0 : bf1[3] = in[24];
4656 0 : bf1[4] = in[4];
4657 0 : bf1[5] = in[20];
4658 0 : bf1[6] = in[12];
4659 0 : bf1[7] = in[28];
4660 0 : bf1[8] = in[2];
4661 0 : bf1[9] = in[18];
4662 0 : bf1[10] = in[10];
4663 0 : bf1[11] = in[26];
4664 0 : bf1[12] = in[6];
4665 0 : bf1[13] = in[22];
4666 0 : bf1[14] = in[14];
4667 0 : bf1[15] = in[30];
4668 0 : bf1[16] = in[1];
4669 0 : bf1[17] = in[17];
4670 0 : bf1[18] = in[9];
4671 0 : bf1[19] = in[25];
4672 0 : bf1[20] = in[5];
4673 0 : bf1[21] = in[21];
4674 0 : bf1[22] = in[13];
4675 0 : bf1[23] = in[29];
4676 0 : bf1[24] = in[3];
4677 0 : bf1[25] = in[19];
4678 0 : bf1[26] = in[11];
4679 0 : bf1[27] = in[27];
4680 0 : bf1[28] = in[7];
4681 0 : bf1[29] = in[23];
4682 0 : bf1[30] = in[15];
4683 0 : bf1[31] = in[31];
4684 :
4685 : // stage 2
4686 0 : bf0[0] = bf1[0];
4687 0 : bf0[1] = bf1[1];
4688 0 : bf0[2] = bf1[2];
4689 0 : bf0[3] = bf1[3];
4690 0 : bf0[4] = bf1[4];
4691 0 : bf0[5] = bf1[5];
4692 0 : bf0[6] = bf1[6];
4693 0 : bf0[7] = bf1[7];
4694 0 : bf0[8] = bf1[8];
4695 0 : bf0[9] = bf1[9];
4696 0 : bf0[10] = bf1[10];
4697 0 : bf0[11] = bf1[11];
4698 0 : bf0[12] = bf1[12];
4699 0 : bf0[13] = bf1[13];
4700 0 : bf0[14] = bf1[14];
4701 0 : bf0[15] = bf1[15];
4702 0 : bf0[16] =
4703 0 : half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
4704 0 : bf0[17] =
4705 0 : half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
4706 0 : bf0[18] =
4707 0 : half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
4708 0 : bf0[19] =
4709 0 : half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
4710 0 : bf0[20] =
4711 0 : half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
4712 0 : bf0[21] =
4713 0 : half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
4714 0 : bf0[22] =
4715 0 : half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
4716 0 : bf0[23] =
4717 0 : half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
4718 0 : bf0[24] =
4719 0 : half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
4720 0 : bf0[25] =
4721 0 : half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
4722 0 : bf0[26] =
4723 0 : half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
4724 0 : bf0[27] =
4725 0 : half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
4726 0 : bf0[28] =
4727 0 : half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
4728 0 : bf0[29] =
4729 0 : half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
4730 0 : bf0[30] =
4731 0 : half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
4732 0 : bf0[31] =
4733 0 : half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
4734 :
4735 : // stage 3
4736 0 : bf1[0] = bf0[0];
4737 0 : bf1[1] = bf0[1];
4738 0 : bf1[2] = bf0[2];
4739 0 : bf1[3] = bf0[3];
4740 0 : bf1[4] = bf0[4];
4741 0 : bf1[5] = bf0[5];
4742 0 : bf1[6] = bf0[6];
4743 0 : bf1[7] = bf0[7];
4744 0 : bf1[8] =
4745 0 : half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
4746 0 : bf1[9] =
4747 0 : half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
4748 0 : bf1[10] =
4749 0 : half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
4750 0 : bf1[11] =
4751 0 : half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
4752 0 : bf1[12] =
4753 0 : half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
4754 0 : bf1[13] =
4755 0 : half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
4756 0 : bf1[14] =
4757 0 : half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
4758 0 : bf1[15] =
4759 0 : half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
4760 :
4761 0 : addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4762 0 : addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4763 0 : addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4764 0 : addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4765 0 : addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4766 0 : addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4767 0 : addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4768 0 : addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4769 :
4770 : // stage 4
4771 0 : bf0[0] = bf1[0];
4772 0 : bf0[1] = bf1[1];
4773 0 : bf0[2] = bf1[2];
4774 0 : bf0[3] = bf1[3];
4775 0 : bf0[4] =
4776 0 : half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
4777 0 : bf0[5] =
4778 0 : half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
4779 0 : bf0[6] =
4780 0 : half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
4781 0 : bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
4782 :
4783 0 : addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
4784 0 : addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
4785 0 : addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
4786 0 : addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
4787 :
4788 0 : bf0[16] = bf1[16];
4789 0 : bf0[17] =
4790 0 : half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
4791 0 : bf0[18] =
4792 0 : half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
4793 0 : bf0[19] = bf1[19];
4794 0 : bf0[20] = bf1[20];
4795 0 : bf0[21] =
4796 0 : half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
4797 0 : bf0[22] =
4798 0 : half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
4799 0 : bf0[23] = bf1[23];
4800 0 : bf0[24] = bf1[24];
4801 0 : bf0[25] =
4802 0 : half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
4803 0 : bf0[26] =
4804 0 : half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
4805 0 : bf0[27] = bf1[27];
4806 0 : bf0[28] = bf1[28];
4807 0 : bf0[29] =
4808 0 : half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
4809 0 : bf0[30] =
4810 0 : half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
4811 0 : bf0[31] = bf1[31];
4812 :
4813 : // stage 5
4814 0 : bf1[0] =
4815 0 : half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
4816 0 : bf1[1] =
4817 0 : half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
4818 0 : bf1[2] =
4819 0 : half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
4820 0 : bf1[3] =
4821 0 : half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
4822 0 : addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4823 0 : addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4824 0 : bf1[8] = bf0[8];
4825 0 : bf1[9] =
4826 0 : half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
4827 0 : bf1[10] =
4828 0 : half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
4829 0 : bf1[11] = bf0[11];
4830 0 : bf1[12] = bf0[12];
4831 0 : bf1[13] =
4832 0 : half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
4833 0 : bf1[14] =
4834 0 : half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
4835 0 : bf1[15] = bf0[15];
4836 0 : addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
4837 0 : addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
4838 0 : addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
4839 0 : addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
4840 0 : addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
4841 0 : addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
4842 0 : addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
4843 0 : addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
4844 :
4845 : // stage 6
4846 0 : addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
4847 0 : addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
4848 0 : bf0[4] = bf1[4];
4849 0 : bf0[5] =
4850 0 : half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
4851 0 : bf0[6] =
4852 0 : half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
4853 0 : bf0[7] = bf1[7];
4854 0 : addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
4855 0 : addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
4856 0 : addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
4857 0 : addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
4858 0 : bf0[16] = bf1[16];
4859 0 : bf0[17] = bf1[17];
4860 0 : bf0[18] =
4861 0 : half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
4862 0 : bf0[19] =
4863 0 : half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
4864 0 : bf0[20] =
4865 0 : half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
4866 0 : bf0[21] =
4867 0 : half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
4868 0 : bf0[22] = bf1[22];
4869 0 : bf0[23] = bf1[23];
4870 0 : bf0[24] = bf1[24];
4871 0 : bf0[25] = bf1[25];
4872 0 : bf0[26] =
4873 0 : half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
4874 0 : bf0[27] =
4875 0 : half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
4876 0 : bf0[28] =
4877 0 : half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
4878 0 : bf0[29] =
4879 0 : half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
4880 0 : bf0[30] = bf1[30];
4881 0 : bf0[31] = bf1[31];
4882 :
4883 : // stage 7
4884 0 : addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
4885 0 : addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
4886 0 : addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
4887 0 : addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
4888 0 : bf1[8] = bf0[8];
4889 0 : bf1[9] = bf0[9];
4890 0 : bf1[10] =
4891 0 : half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
4892 0 : bf1[11] =
4893 0 : half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
4894 0 : bf1[12] =
4895 0 : half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
4896 0 : bf1[13] =
4897 0 : half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
4898 0 : bf1[14] = bf0[14];
4899 0 : bf1[15] = bf0[15];
4900 0 : addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
4901 0 : addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
4902 0 : addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
4903 0 : addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
4904 0 : addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
4905 0 : addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
4906 0 : addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
4907 0 : addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
4908 :
4909 : // stage 8
4910 0 : addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
4911 0 : addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
4912 0 : addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
4913 0 : addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
4914 0 : addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
4915 0 : addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
4916 0 : addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
4917 0 : addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
4918 0 : bf0[16] = bf1[16];
4919 0 : bf0[17] = bf1[17];
4920 0 : bf0[18] = bf1[18];
4921 0 : bf0[19] = bf1[19];
4922 0 : bf0[20] =
4923 0 : half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
4924 0 : bf0[21] =
4925 0 : half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
4926 0 : bf0[22] =
4927 0 : half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
4928 0 : bf0[23] =
4929 0 : half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
4930 0 : bf0[24] =
4931 0 : half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
4932 0 : bf0[25] =
4933 0 : half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
4934 0 : bf0[26] =
4935 0 : half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
4936 0 : bf0[27] =
4937 0 : half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
4938 0 : bf0[28] = bf1[28];
4939 0 : bf0[29] = bf1[29];
4940 0 : bf0[30] = bf1[30];
4941 0 : bf0[31] = bf1[31];
4942 :
4943 : // stage 9
4944 0 : if (do_cols) {
4945 0 : addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
4946 0 : addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
4947 0 : addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
4948 0 : addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
4949 0 : addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
4950 0 : addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
4951 0 : addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
4952 0 : addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
4953 0 : addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
4954 0 : addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
4955 0 : addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
4956 0 : addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
4957 0 : addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
4958 0 : addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
4959 0 : addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
4960 0 : addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
4961 : }
4962 : else {
4963 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
4964 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
4965 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
4966 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
4967 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
4968 :
4969 0 : addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
4970 : &clamp_hi_out, out_shift);
4971 0 : addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
4972 : &clamp_hi_out, out_shift);
4973 0 : addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
4974 : &clamp_hi_out, out_shift);
4975 0 : addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
4976 : &clamp_hi_out, out_shift);
4977 0 : addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
4978 : &clamp_hi_out, out_shift);
4979 0 : addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
4980 : &clamp_hi_out, out_shift);
4981 0 : addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
4982 : &clamp_hi_out, out_shift);
4983 0 : addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
4984 : &clamp_hi_out, out_shift);
4985 0 : addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
4986 : &clamp_hi_out, out_shift);
4987 0 : addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
4988 : &clamp_hi_out, out_shift);
4989 0 : addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
4990 : &clamp_hi_out, out_shift);
4991 0 : addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
4992 : &clamp_hi_out, out_shift);
4993 0 : addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
4994 : &clamp_hi_out, out_shift);
4995 0 : addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
4996 : &clamp_hi_out, out_shift);
4997 0 : addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
4998 : &clamp_hi_out, out_shift);
4999 0 : addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
5000 : &clamp_hi_out, out_shift);
5001 : }
5002 : }
5003 0 : }
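
idct32_avx2_new is the full 32-point kernel; the _low1/_low8/_low16 variants above exist so a caller can pick the cheapest kernel based on how many coefficients are actually nonzero. A hypothetical dispatch sketch is shown below; the eob thresholds are illustrative, not the thresholds this library actually uses.

typedef void (*idct32_kernel_fn)(__m256i *in, __m256i *out, int32_t bit,
                                 int32_t do_cols, int32_t bd,
                                 int32_t out_shift);

static idct32_kernel_fn select_idct32_kernel(int32_t eob) {
  if (eob == 1) return idct32_low1_avx2;   /* DC-only input */
  if (eob <= 8) return idct32_low8_avx2;
  if (eob <= 16) return idct32_low16_avx2;
  return idct32_avx2_new;
}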
5004 0 : static void iidentity32_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
5005 : int32_t bd, int32_t out_shift) {
5006 : (void)bit;
5007 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
5008 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
5009 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
5010 : __m256i v[64];
5011 0 : for (int32_t i = 0; i < 32; i += 16) {
5012 0 : v[i] = _mm256_slli_epi32(in[i], 2);
5013 0 : v[i + 1] = _mm256_slli_epi32(in[i + 1], 2);
5014 0 : v[i + 2] = _mm256_slli_epi32(in[i + 2], 2);
5015 0 : v[i + 3] = _mm256_slli_epi32(in[i + 3], 2);
5016 0 : v[i + 4] = _mm256_slli_epi32(in[i + 4], 2);
5017 0 : v[i + 5] = _mm256_slli_epi32(in[i + 5], 2);
5018 0 : v[i + 6] = _mm256_slli_epi32(in[i + 6], 2);
5019 0 : v[i + 7] = _mm256_slli_epi32(in[i + 7], 2);
5020 0 : v[i + 8] = _mm256_slli_epi32(in[i + 8], 2);
5021 0 : v[i + 9] = _mm256_slli_epi32(in[i + 9], 2);
5022 0 : v[i + 10] = _mm256_slli_epi32(in[i + 10], 2);
5023 0 : v[i + 11] = _mm256_slli_epi32(in[i + 11], 2);
5024 0 : v[i + 12] = _mm256_slli_epi32(in[i + 12], 2);
5025 0 : v[i + 13] = _mm256_slli_epi32(in[i + 13], 2);
5026 0 : v[i + 14] = _mm256_slli_epi32(in[i + 14], 2);
5027 0 : v[i + 15] = _mm256_slli_epi32(in[i + 15], 2);
5028 : }
5029 :
5030 0 : if (!do_cols) {
5031 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
5032 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
5033 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
5034 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
5035 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
5036 0 : shift_avx2(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 32);
5037 : }
5038 : else
5039 0 : highbd_clamp_epi32_avx2(v, out, &clamp_lo, &clamp_hi, 32);
5040 0 : }
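
iidentity32_avx2 is simpler still: the 32-point identity transform scales every coefficient by exactly 4 (a left shift by 2) and then goes through the same shift/clamp epilogue as the other kernels. In scalar form:

static int32_t iidentity32_scalar(int32_t x) {
  return x * 4;  /* equivalent to the _mm256_slli_epi32(in, 2) above */
}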
5041 0 : static void idct64_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
5042 : int32_t bd, int32_t out_shift) {
5043 0 : const int32_t *cospi = cospi_arr(bit);
5044 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
5045 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
5046 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
5047 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
5048 :
5049 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
5050 :
5051 : {
5052 : __m256i x;
5053 :
5054 : // stage 1
5055 : // stage 2
5056 : // stage 3
5057 : // stage 4
5058 : // stage 5
5059 : // stage 6
5060 0 : x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
5061 :
5062 : // stage 8
5063 : // stage 9
5064 : // stage 10
5065 : // stage 11
5066 0 : if (do_cols) {
5067 0 : x = _mm256_max_epi32(x, clamp_lo);
5068 0 : x = _mm256_min_epi32(x, clamp_hi);
5069 : }
5070 : else {
5071 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
5072 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
5073 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
5074 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
5075 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
5076 :
5077 0 : __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
5078 0 : x = _mm256_add_epi32(x, offset);
5079 0 : x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
5080 :
5081 0 : x = _mm256_max_epi32(x, clamp_lo_out);
5082 0 : x = _mm256_min_epi32(x, clamp_hi_out);
5083 : }
5084 :
5085 0 : out[0] = x;
5086 0 : out[1] = x;
5087 0 : out[2] = x;
5088 0 : out[3] = x;
5089 0 : out[4] = x;
5090 0 : out[5] = x;
5091 0 : out[6] = x;
5092 0 : out[7] = x;
5093 0 : out[8] = x;
5094 0 : out[9] = x;
5095 0 : out[10] = x;
5096 0 : out[11] = x;
5097 0 : out[12] = x;
5098 0 : out[13] = x;
5099 0 : out[14] = x;
5100 0 : out[15] = x;
5101 0 : out[16] = x;
5102 0 : out[17] = x;
5103 0 : out[18] = x;
5104 0 : out[19] = x;
5105 0 : out[20] = x;
5106 0 : out[21] = x;
5107 0 : out[22] = x;
5108 0 : out[23] = x;
5109 0 : out[24] = x;
5110 0 : out[25] = x;
5111 0 : out[26] = x;
5112 0 : out[27] = x;
5113 0 : out[28] = x;
5114 0 : out[29] = x;
5115 0 : out[30] = x;
5116 0 : out[31] = x;
5117 0 : out[32] = x;
5118 0 : out[33] = x;
5119 0 : out[34] = x;
5120 0 : out[35] = x;
5121 0 : out[36] = x;
5122 0 : out[37] = x;
5123 0 : out[38] = x;
5124 0 : out[39] = x;
5125 0 : out[40] = x;
5126 0 : out[41] = x;
5127 0 : out[42] = x;
5128 0 : out[43] = x;
5129 0 : out[44] = x;
5130 0 : out[45] = x;
5131 0 : out[46] = x;
5132 0 : out[47] = x;
5133 0 : out[48] = x;
5134 0 : out[49] = x;
5135 0 : out[50] = x;
5136 0 : out[51] = x;
5137 0 : out[52] = x;
5138 0 : out[53] = x;
5139 0 : out[54] = x;
5140 0 : out[55] = x;
5141 0 : out[56] = x;
5142 0 : out[57] = x;
5143 0 : out[58] = x;
5144 0 : out[59] = x;
5145 0 : out[60] = x;
5146 0 : out[61] = x;
5147 0 : out[62] = x;
5148 0 : out[63] = x;
5149 : }
5150 0 : }
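
The non-column branch of idct64_low1_avx2 (and of the other kernels above) tightens the output clamp to the intersection of the output range and the shifted intermediate range via the AOMMAX/AOMMIN pair. A scalar sketch of how those bounds are derived from the code above:

static void output_clamp_bounds(int32_t bd, int32_t log_range,
                                int32_t out_shift, int32_t *lo, int32_t *hi) {
  const int32_t log_range_out = (bd + 6 > 16) ? bd + 6 : 16;
  const int32_t lo_out = -(1 << (log_range_out - 1));
  const int32_t hi_out = (1 << (log_range_out - 1)) - 1;
  const int32_t lo_mid = -(1 << (log_range - 1 - out_shift));
  const int32_t hi_mid = (1 << (log_range - 1 - out_shift));
  *lo = (lo_out > lo_mid) ? lo_out : lo_mid; /* AOMMAX */
  *hi = (hi_out < hi_mid) ? hi_out : hi_mid; /* AOMMIN */
}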
5151 :
5152 0 : static void idct64_low8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
5153 : int32_t bd, int32_t out_shift) {
5154 : int32_t i, j;
5155 0 : const int32_t *cospi = cospi_arr(bit);
5156 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
5157 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
5158 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
5159 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
5160 :
5161 0 : const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
5162 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
5163 0 : const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
5164 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
5165 0 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
5166 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
5167 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
5168 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
5169 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
5170 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
5171 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
5172 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
5173 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
5174 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
5175 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
5176 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
5177 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
5178 0 : const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
5179 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
5180 0 : const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
5181 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
5182 0 : const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
5183 0 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
5184 0 : const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
5185 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
5186 0 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
5187 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
5188 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
5189 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
5190 0 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
5191 0 : const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
5192 0 : const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
5193 0 : const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
5194 0 : const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
5195 0 : const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
5196 0 : const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
5197 0 : const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
5198 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
5199 :
5200 : {
5201 : __m256i u[64];
5202 :
5203 : // stage 1
5204 0 : u[0] = in[0];
5205 0 : u[8] = in[4];
5206 0 : u[16] = in[2];
5207 0 : u[24] = in[6];
5208 0 : u[32] = in[1];
5209 0 : u[40] = in[5];
5210 0 : u[48] = in[3];
5211 0 : u[56] = in[7];
5212 :
5213 : // stage 2
5214 0 : u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
5215 0 : u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
5216 0 : u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
5217 0 : u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
5218 0 : u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
5219 0 : u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
5220 0 : u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
5221 0 : u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
5222 :
5223 : // stage 3
5224 0 : u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
5225 0 : u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
5226 0 : u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
5227 0 : u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
5228 0 : u[33] = u[32];
5229 0 : u[38] = u[39];
5230 0 : u[41] = u[40];
5231 0 : u[46] = u[47];
5232 0 : u[49] = u[48];
5233 0 : u[54] = u[55];
5234 0 : u[57] = u[56];
5235 0 : u[62] = u[63];
5236 :
5237 : // stage 4
5238 : __m256i temp1, temp2;
5239 0 : u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
5240 0 : u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
5241 0 : u[17] = u[16];
5242 0 : u[22] = u[23];
5243 0 : u[25] = u[24];
5244 0 : u[30] = u[31];
5245 :
5246 0 : temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
5247 0 : u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
5248 0 : u[33] = temp1;
5249 :
5250 0 : temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
5251 0 : u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
5252 0 : u[57] = temp2;
5253 :
5254 0 : temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
5255 0 : u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
5256 0 : u[41] = temp1;
5257 :
5258 0 : temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
5259 0 : u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
5260 0 : u[46] = temp2;
5261 :
5262 : // stage 5
5263 0 : u[9] = u[8];
5264 0 : u[14] = u[15];
5265 :
5266 0 : temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
5267 0 : u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
5268 0 : u[17] = temp1;
5269 :
5270 0 : temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
5271 0 : u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
5272 0 : u[22] = temp2;
5273 :
5274 0 : u[35] = u[32];
5275 0 : u[34] = u[33];
5276 0 : u[36] = u[39];
5277 0 : u[37] = u[38];
5278 0 : u[43] = u[40];
5279 0 : u[42] = u[41];
5280 0 : u[44] = u[47];
5281 0 : u[45] = u[46];
5282 0 : u[51] = u[48];
5283 0 : u[50] = u[49];
5284 0 : u[52] = u[55];
5285 0 : u[53] = u[54];
5286 0 : u[59] = u[56];
5287 0 : u[58] = u[57];
5288 0 : u[60] = u[63];
5289 0 : u[61] = u[62];
5290 :
5291 : // stage 6
5292 0 : temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
5293 0 : u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
5294 0 : u[0] = temp1;
5295 :
5296 0 : temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
5297 0 : u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
5298 0 : u[9] = temp2;
5299 0 : u[19] = u[16];
5300 0 : u[18] = u[17];
5301 0 : u[20] = u[23];
5302 0 : u[21] = u[22];
5303 0 : u[27] = u[24];
5304 0 : u[26] = u[25];
5305 0 : u[28] = u[31];
5306 0 : u[29] = u[30];
5307 :
5308 0 : temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
5309 0 : u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
5310 0 : u[34] = temp1;
5311 0 : temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
5312 0 : u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
5313 0 : u[35] = temp2;
5314 0 : temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
5315 0 : u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
5316 0 : u[36] = temp1;
5317 0 : temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
5318 0 : u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
5319 0 : u[37] = temp2;
5320 0 : temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
5321 0 : u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
5322 0 : u[42] = temp1;
5323 0 : temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
5324 0 : u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
5325 0 : u[43] = temp2;
5326 0 : temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
5327 0 : u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
5328 0 : u[44] = temp1;
5329 0 : temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
5330 0 : u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
5331 0 : u[45] = temp2;
5332 :
5333 : // stage 7
5334 0 : u[3] = u[0];
5335 0 : u[2] = u[1];
5336 0 : u[11] = u[8];
5337 0 : u[10] = u[9];
5338 0 : u[12] = u[15];
5339 0 : u[13] = u[14];
5340 :
5341 0 : temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
5342 0 : u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
5343 0 : u[18] = temp1;
5344 0 : temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
5345 0 : u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
5346 0 : u[19] = temp2;
5347 0 : temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
5348 0 : u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
5349 0 : u[20] = temp1;
5350 0 : temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
5351 0 : u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
5352 0 : u[21] = temp2;
5353 0 : for (i = 32; i < 64; i += 16) {
5354 0 : for (j = i; j < i + 4; j++) {
5355 0 : addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
5356 0 : addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
5357 : &clamp_hi);
5358 : }
5359 : }
5360 :
5361 : // stage 8
5362 0 : u[7] = u[0];
5363 0 : u[6] = u[1];
5364 0 : u[5] = u[2];
5365 0 : u[4] = u[3];
5366 0 :         u[9] = u[9]; // no-op: u[9] already holds its stage-6 value
5367 :
5368 0 : idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
5369 : &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
5370 :
5371 : // stage 9
5372 0 : idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
5373 : bit);
5374 :
5375 : // stage 10
5376 0 : idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
5377 : bit);
5378 :
5379 : // stage 11
5380 0 : idct64_stage11_avx2(u, out, do_cols, bd, out_shift, log_range);
5381 : }
5382 0 : }
5383 :
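             : // idct64_low16_avx2: same structure as the low8 variant above, but it
             : // reads in[0..15] and therefore keeps more of the stage-2/3/4 butterflies.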
5384 0 : static void idct64_low16_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
5385 : int32_t bd, int32_t out_shift) {
5386 : int32_t i, j;
5387 0 : const int32_t *cospi = cospi_arr(bit);
5388 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
5389 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
5390 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
5391 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
5392 :
5393 0 : const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
5394 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
5395 0 : const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
5396 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
5397 0 : const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
5398 0 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
5399 0 : const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
5400 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
5401 0 : const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
5402 0 : const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
5403 0 : const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
5404 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
5405 0 : const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
5406 0 : const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
5407 0 : const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
5408 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
5409 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
5410 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
5411 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
5412 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
5413 0 : const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
5414 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
5415 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
5416 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
5417 0 : const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
5418 0 : const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
5419 0 : const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
5420 0 : const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
5421 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
5422 0 : const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
5423 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
5424 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
5425 0 : const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
5426 :
5427 0 : const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
5428 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
5429 0 : const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
5430 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
5431 0 : const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
5432 0 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
5433 0 : const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
5434 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
5435 0 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
5436 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
5437 0 : const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
5438 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
5439 0 : const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
5440 0 : const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
5441 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
5442 0 : const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
5443 0 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
5444 0 : const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
5445 0 : const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
5446 0 : const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
5447 0 : const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
5448 :
5449 : {
5450 : __m256i u[64];
5451 : __m256i tmp1, tmp2, tmp3, tmp4;
5452 : // stage 1
5453 0 : u[0] = in[0];
5454 0 : u[32] = in[1];
5455 0 : u[36] = in[9];
5456 0 : u[40] = in[5];
5457 0 : u[44] = in[13];
5458 0 : u[48] = in[3];
5459 0 : u[52] = in[11];
5460 0 : u[56] = in[7];
5461 0 : u[60] = in[15];
5462 0 : u[16] = in[2];
5463 0 : u[20] = in[10];
5464 0 : u[24] = in[6];
5465 0 : u[28] = in[14];
5466 0 : u[4] = in[8];
5467 0 : u[8] = in[4];
5468 0 : u[12] = in[12];
5469 :
5470 : // stage 2
5471 0 : u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
5472 0 : u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
5473 0 : u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
5474 0 : u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
5475 0 : u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
5476 0 : u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
5477 0 : u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
5478 0 : u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
5479 0 : u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
5480 0 : u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
5481 0 : u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
5482 0 : u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
5483 0 : u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
5484 0 : u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
5485 0 : u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
5486 0 : u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
5487 :
5488 : // stage 3
5489 0 : u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
5490 0 : u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
5491 0 : u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
5492 0 : u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
5493 0 : u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
5494 0 : u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
5495 0 : u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
5496 0 : u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
5497 0 : u[33] = u[32];
5498 0 : u[34] = u[35];
5499 0 : u[37] = u[36];
5500 0 : u[38] = u[39];
5501 0 : u[41] = u[40];
5502 0 : u[42] = u[43];
5503 0 : u[45] = u[44];
5504 0 : u[46] = u[47];
5505 0 : u[49] = u[48];
5506 0 : u[50] = u[51];
5507 0 : u[53] = u[52];
5508 0 : u[54] = u[55];
5509 0 : u[57] = u[56];
5510 0 : u[58] = u[59];
5511 0 : u[61] = u[60];
5512 0 : u[62] = u[63];
5513 :
5514 : // stage 4
5515 0 : u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
5516 0 : u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
5517 0 : u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
5518 0 : u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
5519 :
5520 0 : u[17] = u[16];
5521 0 : u[18] = u[19];
5522 0 : u[21] = u[20];
5523 0 : u[22] = u[23];
5524 0 : u[25] = u[24];
5525 0 : u[26] = u[27];
5526 0 : u[29] = u[28];
5527 0 : u[30] = u[31];
5528 :
5529 0 : tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
5530 0 : tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
5531 0 : tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
5532 0 : tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
5533 0 : u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
5534 0 : u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
5535 0 : u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
5536 0 : u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
5537 0 : u[33] = tmp1;
5538 0 : u[34] = tmp2;
5539 0 : u[37] = tmp3;
5540 0 : u[38] = tmp4;
5541 :
5542 0 : tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
5543 0 : tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
5544 0 : tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
5545 0 : tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
5546 0 : u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
5547 0 : u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
5548 0 : u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
5549 0 : u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
5550 0 : u[41] = tmp1;
5551 0 : u[42] = tmp2;
5552 0 : u[45] = tmp3;
5553 0 : u[46] = tmp4;
5554 :
5555 : // stage 5
5556 0 : u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
5557 0 : u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
5558 :
5559 0 : u[9] = u[8];
5560 0 : u[10] = u[11];
5561 0 : u[13] = u[12];
5562 0 : u[14] = u[15];
5563 :
5564 0 : tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
5565 0 : tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
5566 0 : tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
5567 0 : tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
5568 0 : u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
5569 0 : u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
5570 0 : u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
5571 0 : u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
5572 0 : u[17] = tmp1;
5573 0 : u[18] = tmp2;
5574 0 : u[21] = tmp3;
5575 0 : u[22] = tmp4;
5576 :
5577 0 : for (i = 32; i < 64; i += 8) {
5578 0 : addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
5579 : &clamp_hi);
5580 0 : addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
5581 : &clamp_hi);
5582 :
5583 0 : addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
5584 : &clamp_hi);
5585 0 : addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
5586 : &clamp_hi);
5587 : }
5588 :
5589 : // stage 6
5590 0 : tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
5591 0 : u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
5592 0 : u[0] = tmp1;
5593 0 : u[5] = u[4];
5594 0 : u[6] = u[7];
5595 :
5596 0 : tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
5597 0 : u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
5598 0 : u[9] = tmp1;
5599 0 : tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
5600 0 : u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
5601 0 : u[10] = tmp2;
5602 :
5603 0 : for (i = 16; i < 32; i += 8) {
5604 0 : addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
5605 : &clamp_hi);
5606 0 : addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
5607 : &clamp_hi);
5608 :
5609 0 : addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
5610 : &clamp_hi);
5611 0 : addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
5612 : &clamp_hi);
5613 : }
5614 :
5615 0 : tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
5616 0 : tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
5617 0 : tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
5618 0 : tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
5619 0 : u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
5620 0 : u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
5621 0 : u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
5622 0 : u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
5623 0 : u[34] = tmp1;
5624 0 : u[35] = tmp2;
5625 0 : u[36] = tmp3;
5626 0 : u[37] = tmp4;
5627 :
5628 0 : tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
5629 0 : tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
5630 0 : tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
5631 0 : tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
5632 0 : u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
5633 0 : u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
5634 0 : u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
5635 0 : u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
5636 0 : u[42] = tmp1;
5637 0 : u[43] = tmp2;
5638 0 : u[44] = tmp3;
5639 0 : u[45] = tmp4;
5640 :
5641 : // stage 7
5642 0 : u[3] = u[0];
5643 0 : u[2] = u[1];
5644 0 : tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
5645 0 : u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
5646 0 : u[5] = tmp1;
5647 0 : addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
5648 0 : addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
5649 0 : addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
5650 0 : addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
5651 :
5652 0 : tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
5653 0 : tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
5654 0 : tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
5655 0 : tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
5656 0 : u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
5657 0 : u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
5658 0 : u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
5659 0 : u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
5660 0 : u[18] = tmp1;
5661 0 : u[19] = tmp2;
5662 0 : u[20] = tmp3;
5663 0 : u[21] = tmp4;
5664 :
5665 0 : for (i = 32; i < 64; i += 16) {
5666 0 : for (j = i; j < i + 4; j++) {
5667 0 : addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
5668 0 : addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
5669 : &clamp_hi);
5670 : }
5671 : }
5672 :
5673 : // stage 8
5674 0 : for (i = 0; i < 4; ++i)
5675 0 : addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
5676 0 : idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
5677 : &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
5678 :
5679 : // stage 9
5680 0 : idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
5681 : bit);
5682 :
5683 : // stage 10
5684 0 : idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
5685 : bit);
5686 :
5687 : // stage 11
5688 0 : idct64_stage11_avx2(u, out, do_cols, bd, out_shift, log_range);
5689 : }
5690 0 : }
5691 :
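             : // idct64_avx2: general 64-point inverse DCT. Only the first 32
             : // coefficients are loaded (coefficients beyond 32 are always zero for
             : // AV1's 64-point transforms); u[] and v[] alternate as the working
             : // buffers between stages.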
5692 0 : static void idct64_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols, int32_t bd,
5693 : int32_t out_shift) {
5694 : int32_t i, j;
5695 0 : const int32_t *cospi = cospi_arr(bit);
5696 0 : const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
5697 0 : const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
5698 0 : const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
5699 0 : const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
5700 :
5701 0 : const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
5702 0 : const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
5703 0 : const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
5704 0 : const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
5705 0 : const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
5706 0 : const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
5707 0 : const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
5708 0 : const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
5709 0 : const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
5710 0 : const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
5711 0 : const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
5712 0 : const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
5713 0 : const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
5714 0 : const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
5715 0 : const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
5716 0 : const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
5717 0 : const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
5718 0 : const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
5719 0 : const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
5720 0 : const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
5721 0 : const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
5722 0 : const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
5723 0 : const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
5724 0 : const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
5725 0 : const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
5726 0 : const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
5727 0 : const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
5728 0 : const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
5729 0 : const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
5730 0 : const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
5731 0 : const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
5732 0 : const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
5733 0 : const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
5734 0 : const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
5735 0 : const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
5736 0 : const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
5737 0 : const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
5738 0 : const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
5739 0 : const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
5740 0 : const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
5741 0 : const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
5742 0 : const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
5743 0 : const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
5744 0 : const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
5745 0 : const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
5746 0 : const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
5747 0 : const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
5748 0 : const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
5749 0 : const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
5750 0 : const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
5751 0 : const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
5752 :
5753 0 : const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
5754 0 : const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
5755 0 : const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
5756 0 : const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
5757 0 : const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
5758 0 : const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
5759 0 : const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
5760 0 : const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
5761 0 : const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
5762 0 : const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
5763 0 : const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
5764 0 : const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
5765 0 : const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
5766 0 : const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
5767 0 : const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
5768 0 : const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
5769 0 : const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
5770 0 : const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
5771 0 : const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
5772 0 : const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
5773 0 : const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
5774 0 : const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
5775 0 : const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
5776 0 : const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
5777 0 : const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
5778 0 : const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
5779 0 : const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
5780 :
5781 : {
5782 : __m256i u[64], v[64];
5783 :
5784 : // stage 1
5785 0 : u[32] = in[1];
5786 0 : u[34] = in[17];
5787 0 : u[36] = in[9];
5788 0 : u[38] = in[25];
5789 0 : u[40] = in[5];
5790 0 : u[42] = in[21];
5791 0 : u[44] = in[13];
5792 0 : u[46] = in[29];
5793 0 : u[48] = in[3];
5794 0 : u[50] = in[19];
5795 0 : u[52] = in[11];
5796 0 : u[54] = in[27];
5797 0 : u[56] = in[7];
5798 0 : u[58] = in[23];
5799 0 : u[60] = in[15];
5800 0 : u[62] = in[31];
5801 :
5802 0 : v[16] = in[2];
5803 0 : v[18] = in[18];
5804 0 : v[20] = in[10];
5805 0 : v[22] = in[26];
5806 0 : v[24] = in[6];
5807 0 : v[26] = in[22];
5808 0 : v[28] = in[14];
5809 0 : v[30] = in[30];
5810 :
5811 0 : u[8] = in[4];
5812 0 : u[10] = in[20];
5813 0 : u[12] = in[12];
5814 0 : u[14] = in[28];
5815 :
5816 0 : v[4] = in[8];
5817 0 : v[6] = in[24];
5818 :
5819 0 : u[0] = in[0];
5820 0 : u[2] = in[16];
5821 :
5822 : // stage 2
5823 0 : v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
5824 0 : v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
5825 0 : v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
5826 0 : v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
5827 0 : v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
5828 0 : v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
5829 0 : v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
5830 0 : v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
5831 0 : v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
5832 0 : v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
5833 0 : v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
5834 0 : v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
5835 0 : v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
5836 0 : v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
5837 0 : v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
5838 0 : v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
5839 0 : v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
5840 0 : v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
5841 0 : v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
5842 0 : v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
5843 0 : v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
5844 0 : v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
5845 0 : v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
5846 0 : v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
5847 0 : v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
5848 0 : v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
5849 0 : v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
5850 0 : v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
5851 0 : v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
5852 0 : v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
5853 0 : v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
5854 0 : v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
5855 :
5856 : // stage 3
5857 0 : u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
5858 0 : u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
5859 0 : u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
5860 0 : u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
5861 0 : u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
5862 0 : u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
5863 0 : u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
5864 0 : u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
5865 0 : u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
5866 0 : u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
5867 0 : u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
5868 0 : u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
5869 0 : u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
5870 0 : u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
5871 0 : u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
5872 0 : u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
5873 :
5874 0 : for (i = 32; i < 64; i += 4) {
5875 0 : addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
5876 : &clamp_hi);
5877 0 : addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
5878 : &clamp_hi);
5879 : }
5880 :
5881 : // stage 4
5882 0 : v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
5883 0 : v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
5884 0 : v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
5885 0 : v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
5886 0 : v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
5887 0 : v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
5888 0 : v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
5889 0 : v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
5890 :
5891 0 : for (i = 16; i < 32; i += 4) {
5892 0 : addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
5893 : &clamp_hi);
5894 0 : addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
5895 : &clamp_hi);
5896 : }
5897 :
5898 0 : for (i = 32; i < 64; i += 4) {
5899 0 : v[i + 0] = u[i + 0];
5900 0 : v[i + 3] = u[i + 3];
5901 : }
5902 :
5903 0 : v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
5904 0 : v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
5905 0 : v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
5906 0 : v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
5907 0 : v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
5908 0 : v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
5909 0 : v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
5910 0 : v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
5911 0 : v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
5912 0 : v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
5913 0 : v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
5914 0 : v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
5915 0 : v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
5916 0 : v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
5917 0 : v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
5918 0 : v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
5919 :
5920 : // stage 5
5921 0 : u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
5922 0 : u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
5923 0 : u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
5924 0 : u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
5925 :
5926 0 : for (i = 8; i < 16; i += 4) {
5927 0 : addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
5928 : &clamp_hi);
5929 0 : addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
5930 : &clamp_hi);
5931 : }
5932 :
5933 0 : for (i = 16; i < 32; i += 4) {
5934 0 : u[i + 0] = v[i + 0];
5935 0 : u[i + 3] = v[i + 3];
5936 : }
5937 :
5938 0 : u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
5939 0 : u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
5940 0 : u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
5941 0 : u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
5942 0 : u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
5943 0 : u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
5944 0 : u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
5945 0 : u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
5946 :
5947 0 : for (i = 32; i < 64; i += 8) {
5948 0 : addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
5949 : &clamp_hi);
5950 0 : addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
5951 : &clamp_hi);
5952 :
5953 0 : addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
5954 : &clamp_hi);
5955 0 : addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
5956 : &clamp_hi);
5957 : }
5958 :
5959 : // stage 6
5960 0 : v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
5961 0 : v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
5962 0 : v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
5963 0 : v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
5964 :
5965 0 : addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
5966 0 : addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
5967 :
5968 0 : for (i = 8; i < 16; i += 4) {
5969 0 : v[i + 0] = u[i + 0];
5970 0 : v[i + 3] = u[i + 3];
5971 : }
5972 :
5973 0 : v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
5974 0 : v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
5975 0 : v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
5976 0 : v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
5977 :
5978 0 : for (i = 16; i < 32; i += 8) {
5979 0 : addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
5980 : &clamp_hi);
5981 0 : addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
5982 : &clamp_hi);
5983 :
5984 0 : addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
5985 : &clamp_hi);
5986 0 : addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
5987 : &clamp_hi);
5988 : }
5989 :
5990 0 : for (i = 32; i < 64; i += 8) {
5991 0 : v[i + 0] = u[i + 0];
5992 0 : v[i + 1] = u[i + 1];
5993 0 : v[i + 6] = u[i + 6];
5994 0 : v[i + 7] = u[i + 7];
5995 : }
5996 :
5997 0 : v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
5998 0 : v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
5999 0 : v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
6000 0 : v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
6001 0 : v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
6002 0 : v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
6003 0 : v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
6004 0 : v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
6005 0 : v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
6006 0 : v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
6007 0 : v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
6008 0 : v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
6009 0 : v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
6010 0 : v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
6011 0 : v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
6012 0 : v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
6013 :
6014 : // stage 7
6015 0 : addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
6016 0 : addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
6017 :
6018 0 : u[4] = v[4];
6019 0 : u[7] = v[7];
6020 0 : u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
6021 0 : u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
6022 :
6023 0 : addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
6024 0 : addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
6025 0 : addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
6026 0 : addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
6027 :
6028 0 : for (i = 16; i < 32; i += 8) {
6029 0 : u[i + 0] = v[i + 0];
6030 0 : u[i + 1] = v[i + 1];
6031 0 : u[i + 6] = v[i + 6];
6032 0 : u[i + 7] = v[i + 7];
6033 : }
6034 :
6035 0 : u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
6036 0 : u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
6037 0 : u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
6038 0 : u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
6039 0 : u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
6040 0 : u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
6041 0 : u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
6042 0 : u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
6043 :
6044 0 : for (i = 32; i < 64; i += 16) {
6045 0 : for (j = i; j < i + 4; j++) {
6046 0 : addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
6047 0 : addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
6048 : &clamp_hi);
6049 : }
6050 : }
6051 :
6052 : // stage 8
6053 0 : for (i = 0; i < 4; ++i)
6054 0 : addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
6055 0 : v[8] = u[8];
6056 0 : v[9] = u[9];
6057 0 : v[14] = u[14];
6058 0 : v[15] = u[15];
6059 :
6060 0 : v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
6061 0 : v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
6062 0 : v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
6063 0 : v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
6064 :
6065 0 : for (i = 16; i < 20; ++i) {
6066 0 : addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
6067 0 : addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
6068 : &clamp_hi);
6069 : }
6070 :
6071 0 : for (i = 32; i < 36; ++i) {
6072 0 : v[i] = u[i];
6073 0 : v[i + 12] = u[i + 12];
6074 0 : v[i + 16] = u[i + 16];
6075 0 : v[i + 28] = u[i + 28];
6076 : }
6077 :
6078 0 : v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
6079 0 : v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
6080 0 : v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
6081 0 : v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
6082 0 : v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
6083 0 : v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
6084 0 : v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
6085 0 : v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
6086 0 : v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
6087 0 : v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
6088 0 : v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
6089 0 : v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
6090 0 : v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
6091 0 : v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
6092 0 : v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
6093 0 : v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
6094 :
6095 : // stage 9
6096 0 : for (i = 0; i < 8; ++i)
6097 0 : addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
6098 0 : for (i = 16; i < 20; ++i) {
6099 0 : u[i] = v[i];
6100 0 : u[i + 12] = v[i + 12];
6101 : }
6102 :
6103 0 : u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
6104 0 : u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
6105 0 : u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
6106 0 : u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
6107 0 : u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
6108 0 : u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
6109 0 : u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
6110 0 : u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
6111 :
6112 0 : for (i = 32; i < 40; i++)
6113 0 : addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
6114 0 : for (i = 48; i < 56; i++)
6115 0 : addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
6116 : // stage 10
6117 0 : for (i = 0; i < 16; i++)
6118 0 : addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
6119 0 : for (i = 32; i < 40; i++) v[i] = u[i];
6120 :
6121 0 : v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
6122 0 : v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
6123 0 : v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
6124 0 : v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
6125 0 : v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
6126 0 : v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
6127 0 : v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
6128 0 : v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
6129 0 : v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
6130 0 : v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
6131 0 : v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
6132 0 : v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
6133 0 : v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
6134 0 : v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
6135 0 : v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
6136 0 : v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
6137 :
6138 0 : for (i = 56; i < 64; i++) v[i] = u[i];
6139 :
6140 : // stage 11
6141 0 : if (do_cols) {
6142 0 : for (i = 0; i < 32; i++)
6143 0 : addsub_no_clamp_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
6144 : }
6145 : else {
6146 0 : const int32_t log_range_out = AOMMAX(16, bd + 6);
6147 0 : const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
6148 : -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
6149 0 : const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
6150 : (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
6151 :
6152 0 : for (i = 0; i < 32; i++) {
6153 0 : addsub_shift_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
6154 : &clamp_lo_out, &clamp_hi_out, out_shift);
6155 : }
6156 : }
6157 : }
6158 0 : }
6159 :
6160 : typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int32_t bit,
6161 : int32_t do_cols, int32_t bd, int32_t out_shift);
6162 :
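             : // Kernel lookup table indexed by [transform-size class][1-D transform
             : // class][eob-derived index]. The four entries per class run from the
             : // sparsest kernel (low1) to the full one; NULL marks combinations this
             : // file does not provide.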
6163 : static const transform_1d_avx2
6164 : highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
6165 : {
6166 : { NULL, NULL, NULL, NULL },
6167 : { NULL, NULL, NULL, NULL },
6168 : { NULL, NULL, NULL, NULL },
6169 : },
6170 : {
6171 : { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
6172 : { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
6173 : { iidentity8_avx2, iidentity8_avx2, iidentity8_avx2, iidentity8_avx2 },
6174 : },
6175 : {
6176 : { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
6177 : { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
6178 : { iidentity16_avx2, iidentity16_avx2, iidentity16_avx2, iidentity16_avx2 },
6179 : },
6180 : { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2_new },
6181 : { NULL, NULL, NULL, NULL },
6182 : { iidentity32_avx2, iidentity32_avx2, iidentity32_avx2, iidentity32_avx2 } },
6183 : { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
6184 : { NULL, NULL, NULL, NULL },
6185 : { NULL, NULL, NULL, NULL } }
6186 : };
6187 :
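             : // highbd_inv_txfm2d_add_no_identity_avx2: two-pass 2-D inverse transform
             : // for the non-identity tx types. Pass 1 runs the row kernel on 8x8 tiles
             : // (transposing in and out, and rescaling rectangular sizes by 1/sqrt(2)
             : // via NewInvSqrt2); pass 2 runs the column kernel on each 8-wide stripe
             : // and round-shifts before the result is added into the output buffer.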
6188 0 : static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
6189 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
6190 : TxType tx_type,
6191 : TxSize tx_size, int32_t eob,
6192 : const int32_t bd) {
6193 : __m256i buf1[64 * 8];
6194 : int32_t eobx, eoby;
6195 0 : get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
6196 0 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
6197 0 : const int32_t txw_idx = get_txw_idx(tx_size);
6198 0 : const int32_t txh_idx = get_txh_idx(tx_size);
6199 0 : const int32_t txfm_size_col = tx_size_wide[tx_size];
6200 0 : const int32_t txfm_size_row = tx_size_high[tx_size];
6201 0 : const int32_t buf_size_w_div8 = txfm_size_col >> 3;
6202 0 : const int32_t buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
6203 0 : const int32_t buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
6204 0 : const int32_t input_stride = AOMMIN(32, txfm_size_col);
6205 0 : const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
6206 : ASSERT(eobx < 32);
6207 : ASSERT(eoby < 32);
6208 0 : const int32_t fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
6209 0 : const int32_t fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
6210 0 : const transform_1d_avx2 row_txfm =
6211 0 : highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
6212 0 : const transform_1d_avx2 col_txfm =
6213 0 : highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
6214 :
6215 0 : assert(col_txfm != NULL);
6216 0 : assert(row_txfm != NULL);
6217 : int32_t ud_flip, lr_flip;
6218 0 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
6219 :
6220 :     // 1st stage: row transform (row_txfm), one 8-row band per iteration
6221 0 : for (int32_t i = 0; i < buf_size_nonzero_h_div8; i++) {
6222 : __m256i buf0[64];
6223 0 : const int32_t *input_row = input + i * input_stride * 8;
6224 0 : for (int32_t j = 0; j < buf_size_nonzero_w_div8; ++j) {
6225 0 : __m256i *buf0_cur = buf0 + j * 8;
6226 0 : load_buffer_32x32_new(input_row + j * 8, buf0_cur, input_stride, 8);
6227 :
6228 0 : transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
6229 : }
6230 0 : if (rect_type == 1 || rect_type == -1) {
6231 0 : av1_round_shift_rect_array_32_avx2(
6232 : buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
6233 : }
6234 0 : row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
6235 :
6236 0 : __m256i *_buf1 = buf1 + i * 8;
6237 0 : if (lr_flip) {
6238 0 : for (int32_t j = 0; j < buf_size_w_div8; ++j) {
6239 0 : transpose_8x8_flip_avx2(
6240 0 : &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
6241 : }
6242 : }
6243 : else {
6244 0 : for (int32_t j = 0; j < buf_size_w_div8; ++j)
6245 0 : transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
6246 : }
6247 : }
6248 : // 2nd stage: column transform
6249 0 : for (int32_t i = 0; i < buf_size_w_div8; i++) {
6250 0 : col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
6251 0 : inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
6252 :
6253 0 : av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
6254 0 : buf1 + i * txfm_size_row, txfm_size_row,
6255 0 : -shift[1]);
6256 : }
6257 :
6258 : // write to buffer
6259 0 : if (txfm_size_col >= 16) {
6260 0 : for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
6261 0 : highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
6262 0 : output_r + 16 * i, stride_r,
6263 0 : output_w + 16 * i, stride_w,
6264 : ud_flip, txfm_size_row, bd);
6265 : }
6266 : }
6267 0 : else if (txfm_size_col == 8) {
6268 0 : highbd_write_buffer_8xn_avx2(buf1,
6269 : output_r, stride_r, output_w, stride_w,
6270 : ud_flip, txfm_size_row, bd);
6271 : }
6272 0 : }
6273 :
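             : // highbd_inv_txfm2d_add_idtx_avx2: fully-identity path. Both passes use
             : // the first table entry for their 1-D class, and no 8x8 transposes are
             : // performed between the passes since the identity kernels do not need them.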
6274 0 : static void highbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
6275 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
6276 : TxType tx_type, TxSize tx_size,
6277 : int32_t eob, const int8_t bd) {
6278 : (void)eob;
6279 : __m256i buf1[64 * 2];
6280 :
6281 0 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
6282 0 : const int32_t txw_idx = get_txw_idx(tx_size);
6283 0 : const int32_t txh_idx = get_txh_idx(tx_size);
6284 0 : const int32_t txfm_size_col = tx_size_wide[tx_size];
6285 0 : const int32_t txfm_size_row = tx_size_high[tx_size];
6286 :
6287 0 : const int32_t input_stride = AOMMIN(32, txfm_size_col);
6288 0 : const int32_t row_max = AOMMIN(32, txfm_size_row);
6289 0 : const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
6290 :
6291 0 : const transform_1d_avx2 row_txfm =
6292 0 : highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
6293 0 : const transform_1d_avx2 col_txfm =
6294 0 : highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
6295 :
6296 : int32_t ud_flip, lr_flip, j;
6297 0 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
6298 :
6299 : // 1st stage: row transform
6300 0 : for (int32_t i = 0; i < (row_max >> 3); ++i) {
6301 : __m256i buf0[32];
6302 0 : const int32_t *input_row = input + i * input_stride * 8;
6303 0 : for (int32_t j = 0; j < (input_stride >> 3); ++j) {
6304 0 : __m256i *buf0_cur = buf0 + j * 8;
6305 0 : load_buffer_32x32_new(input_row + j * 8, buf0_cur, input_stride, 8);
6306 : }
6307 0 : if (rect_type == 1 || rect_type == -1) {
6308 0 : av1_round_shift_rect_array_32_avx2(buf0, buf0, input_stride, 0,
6309 : NewInvSqrt2);
6310 : }
6311 0 : row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
6312 :
6313 0 : __m256i *_buf1 = buf1 + i * 8;
6314 :
6315 0 : for (j = 0; j < (input_stride >> 3); ++j) {
6316 0 : _buf1[j * txfm_size_row + 0] = buf0[j * 8 + 0];
6317 0 : _buf1[j * txfm_size_row + 1] = buf0[j * 8 + 1];
6318 0 : _buf1[j * txfm_size_row + 2] = buf0[j * 8 + 2];
6319 0 : _buf1[j * txfm_size_row + 3] = buf0[j * 8 + 3];
6320 0 : _buf1[j * txfm_size_row + 4] = buf0[j * 8 + 4];
6321 0 : _buf1[j * txfm_size_row + 5] = buf0[j * 8 + 5];
6322 0 : _buf1[j * txfm_size_row + 6] = buf0[j * 8 + 6];
6323 0 : _buf1[j * txfm_size_row + 7] = buf0[j * 8 + 7];
6324 : }
6325 : }
6326 : // 2nd stage: column transform
6327 0 : for (int32_t i = 0; i < (input_stride >> 3); i++) {
6328 0 : col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
6329 0 : inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
6330 :
6331 0 : av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
6332 0 : buf1 + i * txfm_size_row, txfm_size_row,
6333 0 : -shift[1]);
6334 : }
6335 :
6336 : // write to buffer
6337 0 : if (txfm_size_col >= 16) {
6338 0 : for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
6339 0 : highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
6340 0 : output_r + 16 * i, stride_r,
6341 0 : output_w + 16 * i, stride_w,
6342 : ud_flip, txfm_size_row, bd);
6343 : }
6344 : }
6345 0 : else if (txfm_size_col == 8) {
6346 0 : highbd_write_buffer_8xn_avx2(buf1,
6347 : output_r, stride_r, output_w, stride_w,
6348 : ud_flip, txfm_size_row,
6349 : bd);
6350 : }
6351 0 : }
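             : // highbd_inv_txfm2d_add_v_identity_avx2: mixed path where only one
             : // direction needs a full transform. The row kernel is taken from the [0]
             : // slot and the column kernel from the eoby class; as in the idtx path,
             : // no transposes are performed between passes.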
6352 0 : static void highbd_inv_txfm2d_add_v_identity_avx2(const int32_t *input,
6353 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
6354 : TxType tx_type, TxSize tx_size,
6355              :     int32_t eob, const int32_t bd) {
6356 : __m256i buf1[64];
6357 : int32_t eobx, eoby;
6358 0 : get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
6359 0 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
6360 0 : const int32_t txw_idx = get_txw_idx(tx_size);
6361 0 : const int32_t txh_idx = get_txh_idx(tx_size);
6362 0 : const int32_t txfm_size_col = tx_size_wide[tx_size];
6363 0 : const int32_t txfm_size_row = tx_size_high[tx_size];
6364 0 : const int32_t input_stride = AOMMIN(32, txfm_size_col);
6365            0 :     const int32_t buf_size_w_div8 = input_stride >> 3;
6366 0 : const int32_t buf_size_h_div8 = (eoby + 8) >> 3;
6367 0 : const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
6368 0 : const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
6369 0 : const transform_1d_avx2 row_txfm =
6370 0 : highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
6371 0 : const transform_1d_avx2 col_txfm =
6372 0 : highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
6373 : int32_t ud_flip, lr_flip;
6374 0 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
6375 :
6376            0 :     for (int32_t i = 0; i < buf_size_h_div8; ++i) {
6377 : __m256i buf0[16];
6378 0 : const int32_t *input_row = input + i * input_stride * 8;
6379            0 :         for (int32_t j = 0; j < buf_size_w_div8; ++j) {
6380 0 : __m256i *buf0_cur = buf0 + j * 8;
6381 0 : load_buffer_32x32_new(input_row + j * 8, buf0_cur, input_stride, 8);
6382 : }
6383 0 : if (rect_type == 1 || rect_type == -1) {
6384 0 : av1_round_shift_rect_array_32_avx2(buf0, buf0, input_stride, 0,
6385 : NewInvSqrt2);
6386 : }
6387 0 : row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
6388 :
6389 0 : __m256i *_buf1 = buf1 + i * 8;
6390 :
6391            0 :         for (int32_t j = 0; j < buf_size_w_div8; ++j) {
6392 0 : _buf1[j * txfm_size_row + 0] = buf0[j * 8 + 0];
6393 0 : _buf1[j * txfm_size_row + 1] = buf0[j * 8 + 1];
6394 0 : _buf1[j * txfm_size_row + 2] = buf0[j * 8 + 2];
6395 0 : _buf1[j * txfm_size_row + 3] = buf0[j * 8 + 3];
6396 0 : _buf1[j * txfm_size_row + 4] = buf0[j * 8 + 4];
6397 0 : _buf1[j * txfm_size_row + 5] = buf0[j * 8 + 5];
6398 0 : _buf1[j * txfm_size_row + 6] = buf0[j * 8 + 6];
6399 0 : _buf1[j * txfm_size_row + 7] = buf0[j * 8 + 7];
6400 : }
6401 : }
6402            0 :     for (int32_t i = 0; i < buf_size_w_div8; i++) {
6403 0 : col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
6404 0 : inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
6405 :
6406 0 : av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
6407 0 : buf1 + i * txfm_size_row, txfm_size_row,
6408 0 : -shift[1]);
6409 : }
6410 :
6411 : // write to buffer
6412 0 : if (txfm_size_col >= 16) {
6413 0 : for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
6414 0 : highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
6415 0 : output_r + 16 * i, stride_r,
6416 0 : output_w + 16 * i, stride_w,
6417 : ud_flip, txfm_size_row, bd);
6418 : }
6419 : }
6420 0 : else if (txfm_size_col == 8) {
6421 0 : highbd_write_buffer_8xn_avx2(buf1,
6422 : output_r, stride_r, output_w, stride_w,
6423 : ud_flip, txfm_size_row, bd);
6424 : }
6425 0 : }
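/*
 * Note (editor): the eob-driven trimming above avoids transforming rows that
 * are known to hold only zero coefficients. For example, with eoby == 10:
 *
 *   buf_size_h_div8 = (eoby + 8) >> 3 = (10 + 8) >> 3 = 2
 *
 * so only the first two strips of 8 rows (16 rows of coefficients) go through
 * the row pass, and fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby] selects the
 * matching column kernel variant from highbd_txfm_all_1d_zeros_w8_arr.
 */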
6426 0 : static void highbd_inv_txfm2d_add_h_identity_avx2(const int32_t *input,
6427 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
6428 : TxType tx_type, TxSize tx_size, int32_t eob, const int32_t bd) {
6429 : __m256i buf1[32];
6430 : int32_t eobx, eoby;
6431 0 : get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
6432 0 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
6433 0 : const int32_t txw_idx = get_txw_idx(tx_size);
6434 0 : const int32_t txh_idx = get_txh_idx(tx_size);
6435 0 : const int32_t txfm_size_col = tx_size_wide[tx_size];
6436 0 : const int32_t txfm_size_row = tx_size_high[tx_size];
6437 0 : const int32_t input_stride = AOMMIN(32, txfm_size_col);
6438 0 : const int32_t buf_size_w_div8 = input_stride >> 3;
6439 0 : const int32_t row_max = AOMMIN(32, txfm_size_row);
6440 0 : const int32_t buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
6441 0 : const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
6442 0 : const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
6443 0 : const transform_1d_avx2 row_txfm =
6444 0 : highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
6445 0 : const transform_1d_avx2 col_txfm =
6446 0 : highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
6447 : int32_t ud_flip, lr_flip;
6448 0 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
6449 :
6450 0 : for (int32_t i = 0; i < (row_max >> 3); ++i) {
6451 : __m256i buf0[32];
6452 0 : const int32_t *input_row = input + i * input_stride * 8;
6453 0 : for (int32_t j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
6454 0 : __m256i *buf0_cur = buf0 + j * 8;
6455 0 : load_buffer_32x32_new(input_row + j * 8, buf0_cur, input_stride, 8);
6456 :
6457 0 : transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
6458 : }
6459 0 : if (rect_type == 1 || rect_type == -1) {
6460 0 : av1_round_shift_rect_array_32_avx2(
6461 : buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
6462 : }
6463 0 : row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
6464 :
6465 0 : __m256i *_buf1 = buf1 + i * 8;
6466 0 : if (lr_flip) {
6467 0 : for (int32_t j = 0; j < buf_size_w_div8; ++j) {
6468 0 : transpose_8x8_flip_avx2(
6469 0 : &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
6470 : }
6471 : }
6472 : else {
6473 0 : for (int32_t j = 0; j < buf_size_w_div8; ++j)
6474 0 : transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
6475 : }
6476 : }
6477 0 : for (int32_t i = 0; i < buf_size_w_div8; i++) {
6478 0 : col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
6479 0 : inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
6480 :
6481 0 : av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
6482 0 : buf1 + i * txfm_size_row, txfm_size_row,
6483 0 : -shift[1]);
6484 : }
6485 :
6486 : // write to buffer
6487 0 : if (txfm_size_col >= 16) {
6488 0 : for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
6489 0 : highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
6490 0 : output_r + 16 * i, stride_r,
6491 0 : output_w + 16 * i, stride_w,
6492 : ud_flip, txfm_size_row, bd);
6493 : }
6494 : }
6495 0 : else if (txfm_size_col == 8) {
6496 0 : highbd_write_buffer_8xn_avx2(buf1,
6497 : output_r, stride_r, output_w, stride_w,
6498 : ud_flip, txfm_size_row, bd);
6499 : }
6500 0 : }
6501 0 : void eb_av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
6502 : uint16_t *output_r, int32_t stride_r,
6503 : uint16_t *output_w, int32_t stride_w,
6504 : TxType tx_type, TxSize tx_size,
6505 : int32_t eob, const int32_t bd) {
6506 0 : switch (tx_type)
6507 : {
6508 0 : case DCT_DCT:
6509 : case ADST_DCT:
6510 : case DCT_ADST:
6511 : case ADST_ADST:
6512 : case FLIPADST_DCT:
6513 : case DCT_FLIPADST:
6514 : case FLIPADST_FLIPADST:
6515 : case ADST_FLIPADST:
6516 : case FLIPADST_ADST:
6517 0 : highbd_inv_txfm2d_add_no_identity_avx2(input,
6518 : output_r, stride_r, output_w, stride_w,
6519 : tx_type, tx_size, eob, bd);
6520 0 : break;
6521 0 : case IDTX:
6522 0 : highbd_inv_txfm2d_add_idtx_avx2(input,
6523 : output_r, stride_r, output_w, stride_w,
6524 : tx_type, tx_size, eob, bd);
6525 0 : break;
6526 0 : case V_DCT:
6527 : case V_ADST:
6528 : case V_FLIPADST:
6529 0 : highbd_inv_txfm2d_add_v_identity_avx2(input,
6530 : output_r, stride_r, output_w, stride_w,
6531 : tx_type, tx_size, eob, bd);
6532 0 : break;
6533 0 : case H_DCT:
6534 : case H_ADST:
6535 : case H_FLIPADST:
6536 0 : highbd_inv_txfm2d_add_h_identity_avx2(input,
6537 : output_r, stride_r, output_w, stride_w,
6538 : tx_type, tx_size, eob, bd);
6539 0 : break;
6540 0 : default: break;
6541 : }
6542 0 : }
6543 0 : void eb_av1_highbd_inv_txfm_add_avx2(const int32_t *input,
6544 : uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
6545 : TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd) {
6546 : //assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
6547 :
6548 0 : eb_av1_highbd_inv_txfm2d_add_universe_avx2(
6549 : input, output_r, stride_r, output_w, stride_w, tx_type, tx_size,
6550 : eob, bd);
6551 0 : }
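/*
 * Minimal usage sketch (editor's addition, illustrative only): invoking the
 * non-static entry point for a 16x16 DCT_DCT block at 10-bit depth. The
 * buffer setup is hypothetical; in the codec these pointers come from the
 * dequantized coefficient buffer and the prediction/reconstruction planes.
 * DCT_DCT is used by the dispatcher above; TX_16X16 is assumed to come from
 * the AV1 TxSize enum used elsewhere in this codebase.
 *
 *   int32_t coeffs[16 * 16];   // dequantized transform coefficients
 *   uint16_t pred[16 * 16];    // prediction samples (read side)
 *   uint16_t recon[16 * 16];   // reconstruction samples (write side)
 *   // ... fill coeffs and pred ...
 *   eb_av1_highbd_inv_txfm_add_avx2(coeffs,
 *                                   pred, 16,    // output_r, stride_r
 *                                   recon, 16,   // output_w, stride_w
 *                                   DCT_DCT, TX_16X16,
 *                                   16 * 16,     // eob: assume all coefficients nonzero
 *                                   10);         // bit depth
 */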