Line data Source code
1 : /*
2 : * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include "EbDefinitions.h"
13 : #include "aom_dsp_rtcd.h"
14 : #include <tmmintrin.h>
15 : #include "EbTransforms.h"
16 : #include "av1_inv_txfm_ssse3.h"
17 : #include "av1_txfm_sse2.h"
18 : #include "transpose_sse2.h"
19 :
20 : // TODO(binpengsmail@gmail.com): replace some for loop with do {} while
21 :
22 : // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
23 : static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
24 : 4 * 5793 };
25 :
26 9740620 : static void idct4_new_sse2(const __m128i *input, __m128i *output,
27 : int8_t cos_bit) {
28 : (void)cos_bit;
29 9740620 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
30 9740140 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
31 :
32 9740140 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
33 9740140 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
34 9740140 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
35 9740140 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
36 :
37 : // stage 1
38 : __m128i x[4];
39 9740140 : x[0] = input[0];
40 9740140 : x[1] = input[2];
41 9740140 : x[2] = input[1];
42 9740140 : x[3] = input[3];
43 :
44 : // stage 2
45 155842000 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
46 155842000 : btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
47 :
48 : // stage 3
49 19480300 : btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
50 19480300 : btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
51 9740140 : }
52 :
53 12536600 : static void idct4_w4_new_sse2(const __m128i *input, __m128i *output,
54 : int8_t cos_bit) {
55 : (void)cos_bit;
56 12536600 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
57 12536600 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
58 :
59 12536600 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
60 12536600 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
61 12536600 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
62 12536600 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
63 :
64 : // stage 1
65 : __m128i x[4];
66 12536600 : x[0] = input[0];
67 12536600 : x[1] = input[2];
68 12536600 : x[2] = input[1];
69 12536600 : x[3] = input[3];
70 :
71 : // stage 2
72 112829000 : btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
73 112829000 : btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
74 :
75 : // stage 3
76 25073200 : btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
77 25073200 : btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
78 12536600 : }
79 :
80 2550 : static void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
81 : int8_t cos_bit) {
82 : (void)cos_bit;
83 2550 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
84 :
85 : // stage 1
86 : __m128i x[2];
87 2550 : x[0] = input[0];
88 :
89 : // stage 2
90 : // stage 3
91 10200 : btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
92 :
93 : // stage 4
94 : // stage 5
95 2550 : output[0] = x[0];
96 2550 : output[7] = x[0];
97 2550 : output[1] = x[1];
98 2550 : output[6] = x[1];
99 2550 : output[2] = x[1];
100 2550 : output[5] = x[1];
101 2550 : output[3] = x[0];
102 2550 : output[4] = x[0];
103 2550 : }
104 :
105 24031100 : static void idct8_new_sse2(const __m128i *input, __m128i *output,
106 : int8_t cos_bit) {
107 : (void)cos_bit;
108 24031100 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
109 24029500 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
110 :
111 24029500 : const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
112 24029500 : const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
113 24029500 : const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
114 24029500 : const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
115 24029500 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
116 24029500 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
117 24029500 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
118 24029500 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
119 24029500 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
120 :
121 : // stage 1
122 : __m128i x[8];
123 24029500 : x[0] = input[0];
124 24029500 : x[1] = input[4];
125 24029500 : x[2] = input[2];
126 24029500 : x[3] = input[6];
127 24029500 : x[4] = input[1];
128 24029500 : x[5] = input[5];
129 24029500 : x[6] = input[3];
130 24029500 : x[7] = input[7];
131 :
132 : // stage 2
133 384472000 : btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
134 384472000 : btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
135 :
136 : // stage 3
137 384472000 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
138 384472000 : btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
139 48059000 : btf_16_adds_subs_sse2(x[4], x[5]);
140 48059000 : btf_16_subs_adds_sse2(x[7], x[6]);
141 :
142 : // stage 4
143 48059000 : btf_16_adds_subs_sse2(x[0], x[3]);
144 48059000 : btf_16_adds_subs_sse2(x[1], x[2]);
145 384472000 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
146 :
147 : // stage 5
148 48059000 : btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
149 48059000 : btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
150 48059000 : btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
151 48059000 : btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
152 24029500 : }
153 :
154 4168520 : static void idct8_w4_new_sse2(const __m128i *input, __m128i *output,
155 : int8_t cos_bit) {
156 : (void)cos_bit;
157 4168520 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
158 4168440 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
159 :
160 4168440 : const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
161 4168440 : const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
162 4168440 : const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
163 4168440 : const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
164 4168440 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
165 4168440 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
166 4168440 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
167 4168440 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
168 4168440 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
169 :
170 : // stage 1
171 : __m128i x[8];
172 4168440 : x[0] = input[0];
173 4168440 : x[1] = input[4];
174 4168440 : x[2] = input[2];
175 4168440 : x[3] = input[6];
176 4168440 : x[4] = input[1];
177 4168440 : x[5] = input[5];
178 4168440 : x[6] = input[3];
179 4168440 : x[7] = input[7];
180 :
181 : // stage 2
182 37515900 : btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
183 37515900 : btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
184 :
185 : // stage 3
186 37515900 : btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
187 37515900 : btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
188 8336880 : btf_16_adds_subs_sse2(x[4], x[5]);
189 8336880 : btf_16_subs_adds_sse2(x[7], x[6]);
190 :
191 : // stage 4
192 8336880 : btf_16_adds_subs_sse2(x[0], x[3]);
193 8336880 : btf_16_adds_subs_sse2(x[1], x[2]);
194 37515900 : btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
195 :
196 : // stage 5
197 8336880 : btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
198 8336880 : btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
199 8336880 : btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
200 8336880 : btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
201 4168440 : }
202 :
203 3768230 : static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
204 : const __m128i __rounding,
205 : int8_t cos_bit) {
206 3768230 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
207 3768230 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
208 7536470 : btf_16_adds_subs_sse2(x[0], x[3]);
209 7536470 : btf_16_adds_subs_sse2(x[1], x[2]);
210 60291700 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
211 7536470 : btf_16_adds_subs_sse2(x[8], x[11]);
212 7536470 : btf_16_adds_subs_sse2(x[9], x[10]);
213 7536470 : btf_16_subs_adds_sse2(x[15], x[12]);
214 7536470 : btf_16_subs_adds_sse2(x[14], x[13]);
215 3768230 : }
216 :
217 3768130 : static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
218 : const __m128i __rounding,
219 : int8_t cos_bit) {
220 3768130 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
221 3768130 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
222 7536260 : btf_16_adds_subs_sse2(x[0], x[7]);
223 7536260 : btf_16_adds_subs_sse2(x[1], x[6]);
224 7536260 : btf_16_adds_subs_sse2(x[2], x[5]);
225 7536260 : btf_16_adds_subs_sse2(x[3], x[4]);
226 60290100 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
227 60290100 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
228 3768130 : }
229 :
230 6508300 : static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
231 13016600 : btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
232 13016600 : btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
233 13016600 : btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
234 13016600 : btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
235 13016600 : btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
236 13016600 : btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
237 13016600 : btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
238 13016600 : btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
239 6508300 : }
240 :
241 62 : static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
242 : int8_t cos_bit) {
243 : (void)cos_bit;
244 62 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
245 :
246 : // stage 1
247 : __m128i x[2];
248 62 : x[0] = input[0];
249 :
250 : // stage 2
251 : // stage 3
252 : // stage 4
253 248 : btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
254 :
255 : // stage 5
256 : // stage 6
257 : // stage 7
258 62 : output[0] = x[0];
259 62 : output[15] = x[0];
260 62 : output[1] = x[1];
261 62 : output[14] = x[1];
262 62 : output[2] = x[1];
263 62 : output[13] = x[1];
264 62 : output[3] = x[0];
265 62 : output[12] = x[0];
266 62 : output[4] = x[0];
267 62 : output[11] = x[0];
268 62 : output[5] = x[1];
269 62 : output[10] = x[1];
270 62 : output[6] = x[1];
271 62 : output[9] = x[1];
272 62 : output[7] = x[0];
273 62 : output[8] = x[0];
274 62 : }
275 :
276 152 : static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
277 : int8_t cos_bit) {
278 : (void)cos_bit;
279 152 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
280 152 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
281 152 : const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
282 152 : const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
283 152 : const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
284 :
285 : // stage 1
286 : __m128i x[16];
287 152 : x[0] = input[0];
288 152 : x[2] = input[4];
289 152 : x[4] = input[2];
290 152 : x[6] = input[6];
291 152 : x[8] = input[1];
292 152 : x[10] = input[5];
293 152 : x[12] = input[3];
294 152 : x[14] = input[7];
295 :
296 : // stage 2
297 608 : btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
298 608 : btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
299 608 : btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
300 608 : btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
301 :
302 : // stage 3
303 608 : btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
304 608 : btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
305 304 : btf_16_adds_subs_sse2(x[8], x[9]);
306 304 : btf_16_subs_adds_sse2(x[11], x[10]);
307 304 : btf_16_adds_subs_sse2(x[12], x[13]);
308 304 : btf_16_subs_adds_sse2(x[15], x[14]);
309 :
310 : // stage 4
311 608 : btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
312 608 : btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
313 304 : btf_16_adds_subs_sse2(x[4], x[5]);
314 304 : btf_16_subs_adds_sse2(x[7], x[6]);
315 2432 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
316 2432 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
317 :
318 152 : idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
319 152 : idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
320 152 : idct16_stage7_sse2(output, x);
321 152 : }
322 :
323 3767880 : static void idct16_new_sse2(const __m128i *input, __m128i *output,
324 : int8_t cos_bit) {
325 : (void)cos_bit;
326 3767880 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
327 3767860 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
328 :
329 3767860 : const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
330 3767860 : const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
331 3767860 : const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
332 3767860 : const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
333 3767860 : const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
334 3767860 : const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
335 3767860 : const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
336 3767860 : const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
337 3767860 : const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
338 3767860 : const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
339 3767860 : const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
340 3767860 : const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
341 3767860 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
342 3767860 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
343 3767860 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
344 3767860 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
345 3767860 : const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
346 3767860 : const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
347 3767860 : const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
348 :
349 : // stage 1
350 : __m128i x[16];
351 3767860 : x[0] = input[0];
352 3767860 : x[1] = input[8];
353 3767860 : x[2] = input[4];
354 3767860 : x[3] = input[12];
355 3767860 : x[4] = input[2];
356 3767860 : x[5] = input[10];
357 3767860 : x[6] = input[6];
358 3767860 : x[7] = input[14];
359 3767860 : x[8] = input[1];
360 3767860 : x[9] = input[9];
361 3767860 : x[10] = input[5];
362 3767860 : x[11] = input[13];
363 3767860 : x[12] = input[3];
364 3767860 : x[13] = input[11];
365 3767860 : x[14] = input[7];
366 3767860 : x[15] = input[15];
367 :
368 : // stage 2
369 60285700 : btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
370 60285700 : btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
371 60285700 : btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
372 60285700 : btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
373 :
374 : // stage 3
375 60285700 : btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
376 60285700 : btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
377 7535720 : btf_16_adds_subs_sse2(x[8], x[9]);
378 7535720 : btf_16_subs_adds_sse2(x[11], x[10]);
379 7535720 : btf_16_adds_subs_sse2(x[12], x[13]);
380 7535720 : btf_16_subs_adds_sse2(x[15], x[14]);
381 :
382 : // stage 4
383 60285700 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
384 60285700 : btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
385 7535720 : btf_16_adds_subs_sse2(x[4], x[5]);
386 7535720 : btf_16_subs_adds_sse2(x[7], x[6]);
387 60285700 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
388 60285700 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
389 :
390 : // stage 5~7
391 3767860 : idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
392 3768020 : idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
393 3768020 : idct16_stage7_sse2(output, x);
394 3768110 : }
395 :
396 2740660 : static void idct16_w4_new_sse2(const __m128i *input, __m128i *output,
397 : int8_t cos_bit) {
398 : (void)cos_bit;
399 2740660 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
400 2740640 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
401 :
402 2740640 : const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
403 2740640 : const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
404 2740640 : const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
405 2740640 : const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
406 2740640 : const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
407 2740640 : const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
408 2740640 : const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
409 2740640 : const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
410 2740640 : const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
411 2740640 : const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
412 2740640 : const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
413 2740640 : const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
414 2740640 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
415 2740640 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
416 2740640 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
417 2740640 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
418 2740640 : const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
419 2740640 : const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
420 2740640 : const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
421 2740640 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
422 :
423 : // stage 1
424 : __m128i x[16];
425 2740640 : x[0] = input[0];
426 2740640 : x[1] = input[8];
427 2740640 : x[2] = input[4];
428 2740640 : x[3] = input[12];
429 2740640 : x[4] = input[2];
430 2740640 : x[5] = input[10];
431 2740640 : x[6] = input[6];
432 2740640 : x[7] = input[14];
433 2740640 : x[8] = input[1];
434 2740640 : x[9] = input[9];
435 2740640 : x[10] = input[5];
436 2740640 : x[11] = input[13];
437 2740640 : x[12] = input[3];
438 2740640 : x[13] = input[11];
439 2740640 : x[14] = input[7];
440 2740640 : x[15] = input[15];
441 :
442 : // stage 2
443 24665800 : btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
444 24665800 : btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
445 24665800 : btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
446 24665800 : btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
447 :
448 : // stage 3
449 24665800 : btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
450 24665800 : btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
451 5481280 : btf_16_adds_subs_sse2(x[8], x[9]);
452 5481280 : btf_16_subs_adds_sse2(x[11], x[10]);
453 5481280 : btf_16_adds_subs_sse2(x[12], x[13]);
454 5481280 : btf_16_subs_adds_sse2(x[15], x[14]);
455 :
456 : // stage 4
457 24665800 : btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
458 24665800 : btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
459 5481280 : btf_16_adds_subs_sse2(x[4], x[5]);
460 5481280 : btf_16_subs_adds_sse2(x[7], x[6]);
461 24665800 : btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
462 24665800 : btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
463 :
464 : // stage 5
465 5481280 : btf_16_adds_subs_sse2(x[0], x[3]);
466 5481280 : btf_16_adds_subs_sse2(x[1], x[2]);
467 24665800 : btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
468 5481280 : btf_16_adds_subs_sse2(x[8], x[11]);
469 5481280 : btf_16_adds_subs_sse2(x[9], x[10]);
470 5481280 : btf_16_subs_adds_sse2(x[15], x[12]);
471 5481280 : btf_16_subs_adds_sse2(x[14], x[13]);
472 :
473 : // stage 6
474 5481280 : btf_16_adds_subs_sse2(x[0], x[7]);
475 5481280 : btf_16_adds_subs_sse2(x[1], x[6]);
476 5481280 : btf_16_adds_subs_sse2(x[2], x[5]);
477 5481280 : btf_16_adds_subs_sse2(x[3], x[4]);
478 24665800 : btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
479 24665800 : btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
480 :
481 : // stage 7
482 2740640 : idct16_stage7_sse2(output, x);
483 2740760 : }
484 :
485 1278550 : static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
486 2557100 : btf_16_adds_subs_sse2(x[16], x[17]);
487 2557100 : btf_16_subs_adds_sse2(x[19], x[18]);
488 2557100 : btf_16_adds_subs_sse2(x[20], x[21]);
489 2557100 : btf_16_subs_adds_sse2(x[23], x[22]);
490 2557100 : btf_16_adds_subs_sse2(x[24], x[25]);
491 2557100 : btf_16_subs_adds_sse2(x[27], x[26]);
492 2557100 : btf_16_adds_subs_sse2(x[28], x[29]);
493 2557100 : btf_16_subs_adds_sse2(x[31], x[30]);
494 1278550 : }
495 :
496 1278580 : static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
497 : const __m128i __rounding,
498 : int8_t cos_bit) {
499 1278580 : const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
500 1278580 : const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
501 1278580 : const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
502 1278580 : const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
503 1278580 : const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
504 1278580 : const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
505 20457300 : btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
506 20457300 : btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
507 20457300 : btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
508 20457300 : btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
509 1278580 : }
510 :
511 1278590 : static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
512 : const __m128i __rounding,
513 : int8_t cos_bit) {
514 1278590 : const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
515 1278590 : const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
516 1278590 : const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
517 20457400 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
518 20457400 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
519 2557180 : btf_16_adds_subs_sse2(x[16], x[19]);
520 2557180 : btf_16_adds_subs_sse2(x[17], x[18]);
521 2557180 : btf_16_subs_adds_sse2(x[23], x[20]);
522 2557180 : btf_16_subs_adds_sse2(x[22], x[21]);
523 2557180 : btf_16_adds_subs_sse2(x[24], x[27]);
524 2557180 : btf_16_adds_subs_sse2(x[25], x[26]);
525 2557180 : btf_16_subs_adds_sse2(x[31], x[28]);
526 2557180 : btf_16_subs_adds_sse2(x[30], x[29]);
527 1278590 : }
528 :
529 1278590 : static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
530 : const __m128i __rounding,
531 : int8_t cos_bit) {
532 1278590 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
533 1278590 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
534 1278590 : const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
535 1278590 : const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
536 1278590 : const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
537 20457400 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
538 2557180 : btf_16_adds_subs_sse2(x[8], x[11]);
539 2557180 : btf_16_adds_subs_sse2(x[9], x[10]);
540 2557180 : btf_16_subs_adds_sse2(x[15], x[12]);
541 2557180 : btf_16_subs_adds_sse2(x[14], x[13]);
542 20457400 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
543 20457400 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
544 20457400 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
545 20457400 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
546 1278590 : }
547 :
548 1278590 : static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
549 : const __m128i __rounding,
550 : int8_t cos_bit) {
551 1278590 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
552 1278590 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
553 2557180 : btf_16_adds_subs_sse2(x[0], x[7]);
554 2557180 : btf_16_adds_subs_sse2(x[1], x[6]);
555 2557180 : btf_16_adds_subs_sse2(x[2], x[5]);
556 2557180 : btf_16_adds_subs_sse2(x[3], x[4]);
557 20457500 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
558 20457500 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
559 2557180 : btf_16_adds_subs_sse2(x[16], x[23]);
560 2557180 : btf_16_adds_subs_sse2(x[17], x[22]);
561 2557180 : btf_16_adds_subs_sse2(x[18], x[21]);
562 2557180 : btf_16_adds_subs_sse2(x[19], x[20]);
563 2557180 : btf_16_subs_adds_sse2(x[31], x[24]);
564 2557180 : btf_16_subs_adds_sse2(x[30], x[25]);
565 2557180 : btf_16_subs_adds_sse2(x[29], x[26]);
566 2557180 : btf_16_subs_adds_sse2(x[28], x[27]);
567 1278590 : }
568 :
569 1278590 : static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
570 : const __m128i __rounding,
571 : int8_t cos_bit) {
572 1278590 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
573 1278590 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
574 2557180 : btf_16_adds_subs_sse2(x[0], x[15]);
575 2557180 : btf_16_adds_subs_sse2(x[1], x[14]);
576 2557180 : btf_16_adds_subs_sse2(x[2], x[13]);
577 2557180 : btf_16_adds_subs_sse2(x[3], x[12]);
578 2557180 : btf_16_adds_subs_sse2(x[4], x[11]);
579 2557180 : btf_16_adds_subs_sse2(x[5], x[10]);
580 2557180 : btf_16_adds_subs_sse2(x[6], x[9]);
581 2557180 : btf_16_adds_subs_sse2(x[7], x[8]);
582 20457500 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
583 20457500 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
584 20457500 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
585 20457500 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
586 1278590 : }
587 :
588 1278590 : static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
589 2557180 : btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
590 2557180 : btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
591 2557180 : btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
592 2557180 : btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
593 2557180 : btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
594 2557180 : btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
595 2557180 : btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
596 2557180 : btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
597 2557180 : btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
598 2557180 : btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
599 2557180 : btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
600 2557180 : btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
601 2557180 : btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
602 2557180 : btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
603 2557180 : btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
604 2557180 : btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
605 1278590 : }
606 :
607 50 : static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
608 : int8_t cos_bit) {
609 : (void)cos_bit;
610 50 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
611 :
612 : // stage 1
613 : __m128i x[2];
614 50 : x[0] = input[0];
615 :
616 : // stage 2
617 : // stage 3
618 : // stage 4
619 : // stage 5
620 200 : btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
621 :
622 : // stage 6
623 : // stage 7
624 : // stage 8
625 : // stage 9
626 50 : output[0] = x[0];
627 50 : output[31] = x[0];
628 50 : output[1] = x[1];
629 50 : output[30] = x[1];
630 50 : output[2] = x[1];
631 50 : output[29] = x[1];
632 50 : output[3] = x[0];
633 50 : output[28] = x[0];
634 50 : output[4] = x[0];
635 50 : output[27] = x[0];
636 50 : output[5] = x[1];
637 50 : output[26] = x[1];
638 50 : output[6] = x[1];
639 50 : output[25] = x[1];
640 50 : output[7] = x[0];
641 50 : output[24] = x[0];
642 50 : output[8] = x[0];
643 50 : output[23] = x[0];
644 50 : output[9] = x[1];
645 50 : output[22] = x[1];
646 50 : output[10] = x[1];
647 50 : output[21] = x[1];
648 50 : output[11] = x[0];
649 50 : output[20] = x[0];
650 50 : output[12] = x[0];
651 50 : output[19] = x[0];
652 50 : output[13] = x[1];
653 50 : output[18] = x[1];
654 50 : output[14] = x[1];
655 50 : output[17] = x[1];
656 50 : output[15] = x[0];
657 50 : output[16] = x[0];
658 50 : }
659 :
660 30 : static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
661 : int8_t cos_bit) {
662 : (void)cos_bit;
663 30 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
664 30 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
665 :
666 : // stage 1
667 : __m128i x[32];
668 30 : x[0] = input[0];
669 30 : x[4] = input[4];
670 30 : x[8] = input[2];
671 30 : x[12] = input[6];
672 30 : x[16] = input[1];
673 30 : x[20] = input[5];
674 30 : x[24] = input[3];
675 30 : x[28] = input[7];
676 :
677 : // stage 2
678 120 : btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
679 120 : btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
680 120 : btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
681 120 : btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
682 :
683 : // stage 3
684 120 : btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
685 120 : btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
686 30 : x[17] = x[16];
687 30 : x[18] = x[19];
688 30 : x[21] = x[20];
689 30 : x[22] = x[23];
690 30 : x[25] = x[24];
691 30 : x[26] = x[27];
692 30 : x[29] = x[28];
693 30 : x[30] = x[31];
694 :
695 : // stage 4
696 120 : btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
697 30 : x[9] = x[8];
698 30 : x[10] = x[11];
699 30 : x[13] = x[12];
700 30 : x[14] = x[15];
701 30 : idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
702 :
703 : // stage 5
704 120 : btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
705 30 : x[5] = x[4];
706 30 : x[6] = x[7];
707 30 : idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
708 : // stage 6
709 30 : x[3] = x[0];
710 30 : x[2] = x[1];
711 30 : idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
712 :
713 30 : idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
714 30 : idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
715 30 : idct32_stage9_sse2(output, x);
716 30 : }
717 :
718 8 : static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
719 : int8_t cos_bit) {
720 : (void)cos_bit;
721 8 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
722 8 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
723 :
724 : // stage 1
725 : __m128i x[32];
726 8 : x[0] = input[0];
727 8 : x[2] = input[8];
728 8 : x[4] = input[4];
729 8 : x[6] = input[12];
730 8 : x[8] = input[2];
731 8 : x[10] = input[10];
732 8 : x[12] = input[6];
733 8 : x[14] = input[14];
734 8 : x[16] = input[1];
735 8 : x[18] = input[9];
736 8 : x[20] = input[5];
737 8 : x[22] = input[13];
738 8 : x[24] = input[3];
739 8 : x[26] = input[11];
740 8 : x[28] = input[7];
741 8 : x[30] = input[15];
742 :
743 : // stage 2
744 32 : btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
745 32 : btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
746 32 : btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
747 32 : btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
748 32 : btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
749 32 : btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
750 32 : btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
751 32 : btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
752 :
753 : // stage 3
754 32 : btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
755 32 : btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
756 32 : btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
757 32 : btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
758 8 : idct32_high16_stage3_sse2(x);
759 :
760 : // stage 4
761 32 : btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
762 32 : btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
763 16 : btf_16_adds_subs_sse2(x[8], x[9]);
764 16 : btf_16_subs_adds_sse2(x[11], x[10]);
765 16 : btf_16_adds_subs_sse2(x[12], x[13]);
766 16 : btf_16_subs_adds_sse2(x[15], x[14]);
767 8 : idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
768 :
769 : // stage 5
770 32 : btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
771 32 : btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
772 16 : btf_16_adds_subs_sse2(x[4], x[5]);
773 16 : btf_16_subs_adds_sse2(x[7], x[6]);
774 8 : idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
775 :
776 16 : btf_16_adds_subs_sse2(x[0], x[3]);
777 16 : btf_16_adds_subs_sse2(x[1], x[2]);
778 8 : idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
779 :
780 8 : idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
781 8 : idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
782 8 : idct32_stage9_sse2(output, x);
783 8 : }
784 :
785 1278540 : static void idct32_new_sse2(const __m128i *input, __m128i *output,
786 : int8_t cos_bit) {
787 : (void)cos_bit;
788 1278540 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
789 1278540 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
790 :
791 1278540 : const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
792 1278540 : const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
793 1278540 : const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
794 1278540 : const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
795 1278540 : const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
796 1278540 : const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
797 1278540 : const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
798 1278540 : const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
799 1278540 : const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
800 1278540 : const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
801 1278540 : const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
802 1278540 : const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
803 1278540 : const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
804 1278540 : const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
805 1278540 : const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
806 1278540 : const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
807 1278540 : const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
808 1278540 : const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
809 1278540 : const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
810 1278540 : const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
811 1278540 : const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
812 1278540 : const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
813 1278540 : const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
814 1278540 : const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
815 1278540 : const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
816 1278540 : const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
817 1278540 : const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
818 1278540 : const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
819 1278540 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
820 1278540 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
821 1278540 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
822 1278540 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
823 :
824 : // stage 1
825 : __m128i x[32];
826 1278540 : x[0] = input[0];
827 1278540 : x[1] = input[16];
828 1278540 : x[2] = input[8];
829 1278540 : x[3] = input[24];
830 1278540 : x[4] = input[4];
831 1278540 : x[5] = input[20];
832 1278540 : x[6] = input[12];
833 1278540 : x[7] = input[28];
834 1278540 : x[8] = input[2];
835 1278540 : x[9] = input[18];
836 1278540 : x[10] = input[10];
837 1278540 : x[11] = input[26];
838 1278540 : x[12] = input[6];
839 1278540 : x[13] = input[22];
840 1278540 : x[14] = input[14];
841 1278540 : x[15] = input[30];
842 1278540 : x[16] = input[1];
843 1278540 : x[17] = input[17];
844 1278540 : x[18] = input[9];
845 1278540 : x[19] = input[25];
846 1278540 : x[20] = input[5];
847 1278540 : x[21] = input[21];
848 1278540 : x[22] = input[13];
849 1278540 : x[23] = input[29];
850 1278540 : x[24] = input[3];
851 1278540 : x[25] = input[19];
852 1278540 : x[26] = input[11];
853 1278540 : x[27] = input[27];
854 1278540 : x[28] = input[7];
855 1278540 : x[29] = input[23];
856 1278540 : x[30] = input[15];
857 1278540 : x[31] = input[31];
858 :
859 : // stage 2
860 20456700 : btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
861 20456700 : btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
862 20456700 : btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
863 20456700 : btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
864 20456700 : btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
865 20456700 : btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
866 20456700 : btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
867 20456700 : btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
868 :
869 : // stage 3
870 20456700 : btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
871 20456700 : btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
872 20456700 : btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
873 20456700 : btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
874 1278540 : idct32_high16_stage3_sse2(x);
875 :
876 : // stage 4
877 20456800 : btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
878 20456800 : btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
879 2557100 : btf_16_adds_subs_sse2(x[8], x[9]);
880 2557100 : btf_16_subs_adds_sse2(x[11], x[10]);
881 2557100 : btf_16_adds_subs_sse2(x[12], x[13]);
882 2557100 : btf_16_subs_adds_sse2(x[15], x[14]);
883 1278550 : idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
884 :
885 : // stage 5
886 20456800 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
887 20456800 : btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
888 2557100 : btf_16_adds_subs_sse2(x[4], x[5]);
889 2557100 : btf_16_adds_subs_sse2(x[7], x[6]);
890 1278550 : idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
891 :
892 : // stage 6
893 2557110 : btf_16_adds_subs_sse2(x[0], x[3]);
894 2557110 : btf_16_adds_subs_sse2(x[1], x[2]);
895 1278560 : idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
896 :
897 : // stage 7~8
898 1278560 : idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
899 1278550 : idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
900 1278550 : idct32_stage9_sse2(output, x);
901 1278560 : }
902 :
903 0 : static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
904 : const __m128i __rounding,
905 : int8_t cos_bit) {
906 0 : const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
907 0 : const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
908 0 : const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
909 0 : const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
910 0 : const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
911 0 : const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
912 0 : const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
913 0 : const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
914 0 : const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
915 0 : const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
916 0 : const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
917 0 : const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
918 0 : btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
919 0 : btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
920 0 : btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
921 0 : btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
922 0 : btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
923 0 : btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
924 0 : btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
925 0 : btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
926 0 : }
927 :
928 0 : static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
929 : const __m128i __rounding,
930 : int8_t cos_bit) {
931 0 : const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
932 0 : const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
933 0 : const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
934 0 : const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
935 0 : const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
936 0 : const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
937 0 : btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
938 0 : btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
939 0 : btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
940 0 : btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
941 0 : btf_16_adds_subs_sse2(x[32], x[35]);
942 0 : btf_16_adds_subs_sse2(x[33], x[34]);
943 0 : btf_16_subs_adds_sse2(x[39], x[36]);
944 0 : btf_16_subs_adds_sse2(x[38], x[37]);
945 0 : btf_16_adds_subs_sse2(x[40], x[43]);
946 0 : btf_16_adds_subs_sse2(x[41], x[42]);
947 0 : btf_16_subs_adds_sse2(x[47], x[44]);
948 0 : btf_16_subs_adds_sse2(x[46], x[45]);
949 0 : btf_16_adds_subs_sse2(x[48], x[51]);
950 0 : btf_16_adds_subs_sse2(x[49], x[50]);
951 0 : btf_16_subs_adds_sse2(x[55], x[52]);
952 0 : btf_16_subs_adds_sse2(x[54], x[53]);
953 0 : btf_16_adds_subs_sse2(x[56], x[59]);
954 0 : btf_16_adds_subs_sse2(x[57], x[58]);
955 0 : btf_16_subs_adds_sse2(x[63], x[60]);
956 0 : btf_16_subs_adds_sse2(x[62], x[61]);
957 0 : }
958 :
959 0 : static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
960 : const __m128i __rounding,
961 : int8_t cos_bit) {
962 0 : const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
963 0 : const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
964 0 : const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
965 0 : const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
966 0 : const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
967 0 : const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
968 0 : btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
969 0 : btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
970 0 : btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
971 0 : btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
972 0 : btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
973 0 : btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
974 0 : btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
975 0 : btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
976 0 : }
977 :
978 0 : static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
979 : const __m128i __rounding,
980 : int8_t cos_bit) {
981 0 : btf_16_adds_subs_sse2(x[16], x[19]);
982 0 : btf_16_adds_subs_sse2(x[17], x[18]);
983 0 : btf_16_subs_adds_sse2(x[23], x[20]);
984 0 : btf_16_subs_adds_sse2(x[22], x[21]);
985 0 : btf_16_adds_subs_sse2(x[24], x[27]);
986 0 : btf_16_adds_subs_sse2(x[25], x[26]);
987 0 : btf_16_subs_adds_sse2(x[31], x[28]);
988 0 : btf_16_subs_adds_sse2(x[30], x[29]);
989 0 : idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
990 0 : }
991 :
992 0 : static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
993 : const __m128i __rounding,
994 : int8_t cos_bit) {
995 0 : const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
996 0 : const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
997 0 : const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
998 0 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
999 0 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
1000 0 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
1001 0 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
1002 0 : btf_16_adds_subs_sse2(x[32], x[39]);
1003 0 : btf_16_adds_subs_sse2(x[33], x[38]);
1004 0 : btf_16_adds_subs_sse2(x[34], x[37]);
1005 0 : btf_16_adds_subs_sse2(x[35], x[36]);
1006 0 : btf_16_subs_adds_sse2(x[47], x[40]);
1007 0 : btf_16_subs_adds_sse2(x[46], x[41]);
1008 0 : btf_16_subs_adds_sse2(x[45], x[42]);
1009 0 : btf_16_subs_adds_sse2(x[44], x[43]);
1010 0 : btf_16_adds_subs_sse2(x[48], x[55]);
1011 0 : btf_16_adds_subs_sse2(x[49], x[54]);
1012 0 : btf_16_adds_subs_sse2(x[50], x[53]);
1013 0 : btf_16_adds_subs_sse2(x[51], x[52]);
1014 0 : btf_16_subs_adds_sse2(x[63], x[56]);
1015 0 : btf_16_subs_adds_sse2(x[62], x[57]);
1016 0 : btf_16_subs_adds_sse2(x[61], x[58]);
1017 0 : btf_16_subs_adds_sse2(x[60], x[59]);
1018 0 : }
1019 :
1020 0 : static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
1021 : const __m128i __rounding,
1022 : int8_t cos_bit) {
1023 0 : const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
1024 0 : const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
1025 0 : const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
1026 0 : btf_16_adds_subs_sse2(x[16], x[23]);
1027 0 : btf_16_adds_subs_sse2(x[17], x[22]);
1028 0 : btf_16_adds_subs_sse2(x[18], x[21]);
1029 0 : btf_16_adds_subs_sse2(x[19], x[20]);
1030 0 : btf_16_subs_adds_sse2(x[31], x[24]);
1031 0 : btf_16_subs_adds_sse2(x[30], x[25]);
1032 0 : btf_16_subs_adds_sse2(x[29], x[26]);
1033 0 : btf_16_subs_adds_sse2(x[28], x[27]);
1034 0 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
1035 0 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
1036 0 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
1037 0 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
1038 0 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
1039 0 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
1040 0 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
1041 0 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
1042 0 : }
1043 :
1044 0 : static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
1045 : const __m128i __rounding,
1046 : int8_t cos_bit) {
1047 0 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1048 0 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1049 0 : btf_16_adds_subs_sse2(x[0], x[15]);
1050 0 : btf_16_adds_subs_sse2(x[1], x[14]);
1051 0 : btf_16_adds_subs_sse2(x[2], x[13]);
1052 0 : btf_16_adds_subs_sse2(x[3], x[12]);
1053 0 : btf_16_adds_subs_sse2(x[4], x[11]);
1054 0 : btf_16_adds_subs_sse2(x[5], x[10]);
1055 0 : btf_16_adds_subs_sse2(x[6], x[9]);
1056 0 : btf_16_adds_subs_sse2(x[7], x[8]);
1057 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
1058 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
1059 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
1060 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
1061 0 : btf_16_adds_subs_sse2(x[32], x[47]);
1062 0 : btf_16_adds_subs_sse2(x[33], x[46]);
1063 0 : btf_16_adds_subs_sse2(x[34], x[45]);
1064 0 : btf_16_adds_subs_sse2(x[35], x[44]);
1065 0 : btf_16_adds_subs_sse2(x[36], x[43]);
1066 0 : btf_16_adds_subs_sse2(x[37], x[42]);
1067 0 : btf_16_adds_subs_sse2(x[38], x[41]);
1068 0 : btf_16_adds_subs_sse2(x[39], x[40]);
1069 0 : btf_16_subs_adds_sse2(x[63], x[48]);
1070 0 : btf_16_subs_adds_sse2(x[62], x[49]);
1071 0 : btf_16_subs_adds_sse2(x[61], x[50]);
1072 0 : btf_16_subs_adds_sse2(x[60], x[51]);
1073 0 : btf_16_subs_adds_sse2(x[59], x[52]);
1074 0 : btf_16_subs_adds_sse2(x[58], x[53]);
1075 0 : btf_16_subs_adds_sse2(x[57], x[54]);
1076 0 : btf_16_subs_adds_sse2(x[56], x[55]);
1077 0 : }
1078 :
1079 0 : static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
1080 : const __m128i __rounding,
1081 : int8_t cos_bit) {
1082 0 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1083 0 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1084 0 : btf_16_adds_subs_sse2(x[0], x[31]);
1085 0 : btf_16_adds_subs_sse2(x[1], x[30]);
1086 0 : btf_16_adds_subs_sse2(x[2], x[29]);
1087 0 : btf_16_adds_subs_sse2(x[3], x[28]);
1088 0 : btf_16_adds_subs_sse2(x[4], x[27]);
1089 0 : btf_16_adds_subs_sse2(x[5], x[26]);
1090 0 : btf_16_adds_subs_sse2(x[6], x[25]);
1091 0 : btf_16_adds_subs_sse2(x[7], x[24]);
1092 0 : btf_16_adds_subs_sse2(x[8], x[23]);
1093 0 : btf_16_adds_subs_sse2(x[9], x[22]);
1094 0 : btf_16_adds_subs_sse2(x[10], x[21]);
1095 0 : btf_16_adds_subs_sse2(x[11], x[20]);
1096 0 : btf_16_adds_subs_sse2(x[12], x[19]);
1097 0 : btf_16_adds_subs_sse2(x[13], x[18]);
1098 0 : btf_16_adds_subs_sse2(x[14], x[17]);
1099 0 : btf_16_adds_subs_sse2(x[15], x[16]);
1100 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
1101 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
1102 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
1103 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
1104 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
1105 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
1106 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
1107 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
1108 0 : }
1109 :
1110 0 : static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
1111 0 : btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
1112 0 : btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
1113 0 : btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
1114 0 : btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
1115 0 : btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
1116 0 : btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
1117 0 : btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
1118 0 : btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
1119 0 : btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
1120 0 : btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
1121 0 : btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
1122 0 : btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
1123 0 : btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
1124 0 : btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
1125 0 : btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
1126 0 : btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
1127 0 : btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
1128 0 : btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
1129 0 : btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
1130 0 : btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
1131 0 : btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
1132 0 : btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
1133 0 : btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
1134 0 : btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
1135 0 : btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
1136 0 : btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
1137 0 : btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
1138 0 : btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
1139 0 : btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
1140 0 : btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
1141 0 : btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
1142 0 : btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
1143 0 : }
1144 :
1145 0 : static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
1146 : int8_t cos_bit) {
1147 : (void)cos_bit;
1148 0 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
1149 :
1150 : // stage 1
1151 : __m128i x[32];
1152 0 : x[0] = input[0];
1153 :
1154 : // stage 2
1155 : // stage 3
1156 : // stage 4
1157 : // stage 5
1158 : // stage 6
1159 0 : btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
1160 :
1161 : // stage 7
1162 : // stage 8
1163 : // stage 9
1164 : // stage 10
1165 : // stage 11
1166 0 : output[0] = x[0];
1167 0 : output[63] = x[0];
1168 0 : output[1] = x[1];
1169 0 : output[62] = x[1];
1170 0 : output[2] = x[1];
1171 0 : output[61] = x[1];
1172 0 : output[3] = x[0];
1173 0 : output[60] = x[0];
1174 0 : output[4] = x[0];
1175 0 : output[59] = x[0];
1176 0 : output[5] = x[1];
1177 0 : output[58] = x[1];
1178 0 : output[6] = x[1];
1179 0 : output[57] = x[1];
1180 0 : output[7] = x[0];
1181 0 : output[56] = x[0];
1182 0 : output[8] = x[0];
1183 0 : output[55] = x[0];
1184 0 : output[9] = x[1];
1185 0 : output[54] = x[1];
1186 0 : output[10] = x[1];
1187 0 : output[53] = x[1];
1188 0 : output[11] = x[0];
1189 0 : output[52] = x[0];
1190 0 : output[12] = x[0];
1191 0 : output[51] = x[0];
1192 0 : output[13] = x[1];
1193 0 : output[50] = x[1];
1194 0 : output[14] = x[1];
1195 0 : output[49] = x[1];
1196 0 : output[15] = x[0];
1197 0 : output[48] = x[0];
1198 0 : output[16] = x[0];
1199 0 : output[47] = x[0];
1200 0 : output[17] = x[1];
1201 0 : output[46] = x[1];
1202 0 : output[18] = x[1];
1203 0 : output[45] = x[1];
1204 0 : output[19] = x[0];
1205 0 : output[44] = x[0];
1206 0 : output[20] = x[0];
1207 0 : output[43] = x[0];
1208 0 : output[21] = x[1];
1209 0 : output[42] = x[1];
1210 0 : output[22] = x[1];
1211 0 : output[41] = x[1];
1212 0 : output[23] = x[0];
1213 0 : output[40] = x[0];
1214 0 : output[24] = x[0];
1215 0 : output[39] = x[0];
1216 0 : output[25] = x[1];
1217 0 : output[38] = x[1];
1218 0 : output[26] = x[1];
1219 0 : output[37] = x[1];
1220 0 : output[27] = x[0];
1221 0 : output[36] = x[0];
1222 0 : output[28] = x[0];
1223 0 : output[35] = x[0];
1224 0 : output[29] = x[1];
1225 0 : output[34] = x[1];
1226 0 : output[30] = x[1];
1227 0 : output[33] = x[1];
1228 0 : output[31] = x[0];
1229 0 : output[32] = x[0];
1230 0 : }
1231 :
1232 0 : static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
1233 : int8_t cos_bit) {
1234 : (void)cos_bit;
1235 0 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
1236 0 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1237 0 : const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
1238 0 : const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
1239 0 : const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
1240 0 : const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
1241 0 : const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
1242 0 : const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
1243 0 : const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
1244 0 : const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
1245 0 : const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
1246 0 : const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
1247 0 : const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
1248 0 : const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
1249 0 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1250 0 : const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
1251 0 : const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
1252 0 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1253 :
1254 : // stage 1
1255 : __m128i x[64];
1256 0 : x[0] = input[0];
1257 0 : x[8] = input[4];
1258 0 : x[16] = input[2];
1259 0 : x[24] = input[6];
1260 0 : x[32] = input[1];
1261 0 : x[40] = input[5];
1262 0 : x[48] = input[3];
1263 0 : x[56] = input[7];
1264 :
1265 : // stage 2
1266 0 : btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
1267 0 : btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
1268 0 : btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
1269 0 : btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
1270 :
1271 : // stage 3
1272 0 : btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
1273 0 : btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
1274 0 : x[33] = x[32];
1275 0 : x[38] = x[39];
1276 0 : x[41] = x[40];
1277 0 : x[46] = x[47];
1278 0 : x[49] = x[48];
1279 0 : x[54] = x[55];
1280 0 : x[57] = x[56];
1281 0 : x[62] = x[63];
1282 :
1283 : // stage 4
1284 0 : btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
1285 0 : x[17] = x[16];
1286 0 : x[22] = x[23];
1287 0 : x[25] = x[24];
1288 0 : x[30] = x[31];
1289 0 : btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
1290 0 : btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
1291 0 : btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
1292 0 : btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
1293 :
1294 : // stage 5
1295 0 : x[9] = x[8];
1296 0 : x[14] = x[15];
1297 0 : btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
1298 0 : btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
1299 0 : x[35] = x[32];
1300 0 : x[34] = x[33];
1301 0 : x[36] = x[39];
1302 0 : x[37] = x[38];
1303 0 : x[43] = x[40];
1304 0 : x[42] = x[41];
1305 0 : x[44] = x[47];
1306 0 : x[45] = x[46];
1307 0 : x[51] = x[48];
1308 0 : x[50] = x[49];
1309 0 : x[52] = x[55];
1310 0 : x[53] = x[54];
1311 0 : x[59] = x[56];
1312 0 : x[58] = x[57];
1313 0 : x[60] = x[63];
1314 0 : x[61] = x[62];
1315 :
1316 : // stage 6
1317 0 : btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
1318 0 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
1319 0 : x[19] = x[16];
1320 0 : x[18] = x[17];
1321 0 : x[20] = x[23];
1322 0 : x[21] = x[22];
1323 0 : x[27] = x[24];
1324 0 : x[26] = x[25];
1325 0 : x[28] = x[31];
1326 0 : x[29] = x[30];
1327 0 : idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
1328 :
1329 : // stage 7
1330 0 : x[3] = x[0];
1331 0 : x[2] = x[1];
1332 0 : x[11] = x[8];
1333 0 : x[10] = x[9];
1334 0 : x[12] = x[15];
1335 0 : x[13] = x[14];
1336 0 : idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
1337 :
1338 : // stage 8
1339 0 : x[7] = x[0];
1340 0 : x[6] = x[1];
1341 0 : x[5] = x[2];
1342 0 : x[4] = x[3];
1343 0 : x[9] = x[9];
1344 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
1345 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
1346 0 : idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
1347 :
1348 0 : idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
1349 0 : idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
1350 0 : idct64_stage11_sse2(output, x);
1351 0 : }
1352 :
// idct64 kernel specialised for the case where only input[0]..input[15] are
// read (stage 1 touches no other entry). Presumably the caller guarantees the
// remaining coefficients are zero -- TODO confirm against the dispatch code.
// `output` is handed only to idct64_stage11_sse2() at the very end.
// NOTE: cos_bit is cast to void but is still forwarded to the stage helpers;
// the cosine table and the rounding constant are both derived from
// INV_COS_BIT, never from cos_bit.
1353 0 : static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
1354 : int8_t cos_bit) {
1355 : (void)cos_bit;
1356 0 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
1357 0 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1358 :
1359 0 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1360 0 : const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
1361 0 : const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
1362 0 : const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
1363 0 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1364 :
1365 : // stage 1: place the 16 inputs in every 4th slot of the work array
1366 : __m128i x[64];
1367 0 : x[0] = input[0];
1368 0 : x[4] = input[8];
1369 0 : x[8] = input[4];
1370 0 : x[12] = input[12];
1371 0 : x[16] = input[2];
1372 0 : x[20] = input[10];
1373 0 : x[24] = input[6];
1374 0 : x[28] = input[14];
1375 0 : x[32] = input[1];
1376 0 : x[36] = input[9];
1377 0 : x[40] = input[5];
1378 0 : x[44] = input[13];
1379 0 : x[48] = input[3];
1380 0 : x[52] = input[11];
1381 0 : x[56] = input[7];
1382 0 : x[60] = input[15];
1383 :
1384 : // stage 2: each call takes one loaded slot and assigns two slots of x[32..63]
1385 0 : btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
1386 0 : btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
1387 0 : btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
1388 0 : btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
1389 0 : btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
1390 0 : btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
1391 0 : btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
1392 0 : btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
1393 :
1394 : // stage 3
1395 0 : btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
1396 0 : btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
1397 0 : btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
1398 0 : btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
// The copies below occupy the positions where idct64_low32_new_ssse3 runs
// add/sub butterflies; presumably the partner operand is known to be zero
// in this reduced variant, so a copy is sufficient -- confirm.
1399 0 : x[33] = x[32];
1400 0 : x[34] = x[35];
1401 0 : x[37] = x[36];
1402 0 : x[38] = x[39];
1403 0 : x[41] = x[40];
1404 0 : x[42] = x[43];
1405 0 : x[45] = x[44];
1406 0 : x[46] = x[47];
1407 0 : x[49] = x[48];
1408 0 : x[50] = x[51];
1409 0 : x[53] = x[52];
1410 0 : x[54] = x[55];
1411 0 : x[57] = x[56];
1412 0 : x[58] = x[59];
1413 0 : x[61] = x[60];
1414 0 : x[62] = x[63];
1415 :
1416 : // stage 4 (same copy pattern, now filling the odd gaps of x[16..31])
1417 0 : btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
1418 0 : btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
1419 0 : x[17] = x[16];
1420 0 : x[18] = x[19];
1421 0 : x[21] = x[20];
1422 0 : x[22] = x[23];
1423 0 : x[25] = x[24];
1424 0 : x[26] = x[27];
1425 0 : x[29] = x[28];
1426 0 : x[30] = x[31];
1427 0 : idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
1428 :
1429 : // stage 5
1430 0 : btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
1431 0 : x[9] = x[8];
1432 0 : x[10] = x[11];
1433 0 : x[13] = x[12];
1434 0 : x[14] = x[15];
1435 0 : idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
1436 :
1437 : // stage 6
1438 0 : btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
1439 0 : x[5] = x[4];
1440 0 : x[6] = x[7];
1441 0 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
1442 0 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
1443 0 : idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
1444 :
1445 : // stage 7
1446 0 : x[3] = x[0];
1447 0 : x[2] = x[1];
1448 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
1449 0 : btf_16_adds_subs_sse2(x[8], x[11]);
1450 0 : btf_16_adds_subs_sse2(x[9], x[10]);
1451 0 : btf_16_subs_adds_sse2(x[15], x[12]);
1452 0 : btf_16_subs_adds_sse2(x[14], x[13]);
1453 0 : idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
1454 :
1455 : // stage 8: from here on the statements match idct64_low32_new_ssse3
1456 0 : btf_16_adds_subs_sse2(x[0], x[7]);
1457 0 : btf_16_adds_subs_sse2(x[1], x[6]);
1458 0 : btf_16_adds_subs_sse2(x[2], x[5]);
1459 0 : btf_16_adds_subs_sse2(x[3], x[4]);
1460 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
1461 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
1462 0 : idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
1463 : // stage 9~11
1464 0 : idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
1465 0 : idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
1466 0 : idct64_stage11_sse2(output, x);
1467 0 : }
1468 :
// idct64 kernel specialised for the case where only input[0]..input[31] are
// read (stage 1 touches no other entry). Presumably the caller guarantees the
// upper 32 coefficients are zero -- TODO confirm against the dispatch code.
// `output` is handed only to idct64_stage11_sse2() at the very end.
// NOTE: cos_bit is cast to void but is still forwarded to the stage helpers;
// the cosine table and the rounding constant are both derived from
// INV_COS_BIT, never from cos_bit.
1469 0 : static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
1470 : int8_t cos_bit) {
1471 : (void)cos_bit;
1472 0 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
1473 0 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1474 :
1475 0 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1476 0 : const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
1477 0 : const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
1478 0 : const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
1479 0 : const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1480 :
1481 : // stage 1: place the 32 inputs in the even slots of the work array
1482 : __m128i x[64];
1483 0 : x[0] = input[0];
1484 0 : x[2] = input[16];
1485 0 : x[4] = input[8];
1486 0 : x[6] = input[24];
1487 0 : x[8] = input[4];
1488 0 : x[10] = input[20];
1489 0 : x[12] = input[12];
1490 0 : x[14] = input[28];
1491 0 : x[16] = input[2];
1492 0 : x[18] = input[18];
1493 0 : x[20] = input[10];
1494 0 : x[22] = input[26];
1495 0 : x[24] = input[6];
1496 0 : x[26] = input[22];
1497 0 : x[28] = input[14];
1498 0 : x[30] = input[30];
1499 0 : x[32] = input[1];
1500 0 : x[34] = input[17];
1501 0 : x[36] = input[9];
1502 0 : x[38] = input[25];
1503 0 : x[40] = input[5];
1504 0 : x[42] = input[21];
1505 0 : x[44] = input[13];
1506 0 : x[46] = input[29];
1507 0 : x[48] = input[3];
1508 0 : x[50] = input[19];
1509 0 : x[52] = input[11];
1510 0 : x[54] = input[27];
1511 0 : x[56] = input[7];
1512 0 : x[58] = input[23];
1513 0 : x[60] = input[15];
1514 0 : x[62] = input[31];
1515 :
1516 : // stage 2: each call takes one loaded slot and assigns two slots of x[32..63]
1517 0 : btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
1518 0 : btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
1519 0 : btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
1520 0 : btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
1521 0 : btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
1522 0 : btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
1523 0 : btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
1524 0 : btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
1525 0 : btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
1526 0 : btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
1527 0 : btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
1528 0 : btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
1529 0 : btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
1530 0 : btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
1531 0 : btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
1532 0 : btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
1533 :
1534 : // stage 3: expand x[16..31], then pairwise add/sub butterflies on x[32..63]
1535 0 : btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
1536 0 : btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
1537 0 : btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
1538 0 : btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
1539 0 : btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
1540 0 : btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
1541 0 : btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
1542 0 : btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
1543 0 : btf_16_adds_subs_sse2(x[32], x[33]);
1544 0 : btf_16_subs_adds_sse2(x[35], x[34]);
1545 0 : btf_16_adds_subs_sse2(x[36], x[37]);
1546 0 : btf_16_subs_adds_sse2(x[39], x[38]);
1547 0 : btf_16_adds_subs_sse2(x[40], x[41]);
1548 0 : btf_16_subs_adds_sse2(x[43], x[42]);
1549 0 : btf_16_adds_subs_sse2(x[44], x[45]);
1550 0 : btf_16_subs_adds_sse2(x[47], x[46]);
1551 0 : btf_16_adds_subs_sse2(x[48], x[49]);
1552 0 : btf_16_subs_adds_sse2(x[51], x[50]);
1553 0 : btf_16_adds_subs_sse2(x[52], x[53]);
1554 0 : btf_16_subs_adds_sse2(x[55], x[54]);
1555 0 : btf_16_adds_subs_sse2(x[56], x[57]);
1556 0 : btf_16_subs_adds_sse2(x[59], x[58]);
1557 0 : btf_16_adds_subs_sse2(x[60], x[61]);
1558 0 : btf_16_subs_adds_sse2(x[63], x[62]);
1559 :
1560 : // stage 4: expand x[8..15], pairwise add/sub on x[16..31], helper for x[32..63]
1561 0 : btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
1562 0 : btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
1563 0 : btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
1564 0 : btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
1565 0 : btf_16_adds_subs_sse2(x[16], x[17]);
1566 0 : btf_16_subs_adds_sse2(x[19], x[18]);
1567 0 : btf_16_adds_subs_sse2(x[20], x[21]);
1568 0 : btf_16_subs_adds_sse2(x[23], x[22]);
1569 0 : btf_16_adds_subs_sse2(x[24], x[25]);
1570 0 : btf_16_subs_adds_sse2(x[27], x[26]);
1571 0 : btf_16_adds_subs_sse2(x[28], x[29]);
1572 0 : btf_16_subs_adds_sse2(x[31], x[30]);
1573 0 : idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
1574 :
1575 : // stage 5
1576 0 : btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
1577 0 : btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
1578 0 : btf_16_adds_subs_sse2(x[8], x[9]);
1579 0 : btf_16_subs_adds_sse2(x[11], x[10]);
1580 0 : btf_16_adds_subs_sse2(x[12], x[13]);
1581 0 : btf_16_subs_adds_sse2(x[15], x[14]);
1582 0 : idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
1583 :
1584 : // stage 6
1585 0 : btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
1586 0 : btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
1587 0 : btf_16_adds_subs_sse2(x[4], x[5]);
1588 0 : btf_16_subs_adds_sse2(x[7], x[6]);
1589 0 : btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
1590 0 : btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
1591 0 : idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
1592 :
1593 : // stage 7
1594 0 : btf_16_adds_subs_sse2(x[0], x[3]);
1595 0 : btf_16_adds_subs_sse2(x[1], x[2]);
1596 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
1597 0 : btf_16_adds_subs_sse2(x[8], x[11]);
1598 0 : btf_16_adds_subs_sse2(x[9], x[10]);
1599 0 : btf_16_subs_adds_sse2(x[15], x[12]);
1600 0 : btf_16_subs_adds_sse2(x[14], x[13]);
1601 0 : idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
1602 :
1603 : // stage 8
1604 0 : btf_16_adds_subs_sse2(x[0], x[7]);
1605 0 : btf_16_adds_subs_sse2(x[1], x[6]);
1606 0 : btf_16_adds_subs_sse2(x[2], x[5]);
1607 0 : btf_16_adds_subs_sse2(x[3], x[4]);
1608 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
1609 0 : btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
1610 0 : idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
1611 :
1612 : // stage 9~11: shared helpers finish the transform and fill output
1613 0 : idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
1614 0 : idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
1615 0 : idct64_stage11_sse2(output, x);
1616 0 : }
1617 :
// 4-point inverse ADST over all eight 16-bit lanes of input[0..3].
// Rows 0/2 and 1/3 are interleaved so each _mm_madd_epi16 yields a 32-bit
// two-term dot product per lane; the low four lanes (unpacklo) and high four
// lanes (unpackhi) are processed separately and re-joined when packing.
// Each result is rounded by adding 1 << (INV_COS_BIT - 1), shifted right
// arithmetically by INV_COS_BIT, and packed to int16 with signed saturation.
// cos_bit is ignored; INV_COS_BIT selects the sine table and the shift.
1618 2737620 : static void iadst4_new_sse2(const __m128i *input, __m128i *output,
1619 : int8_t cos_bit) {
1620 : (void)cos_bit;
1621 2737620 : const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
1622 2737600 : const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
1623 2737600 : const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
1624 2737600 : const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
1625 2737600 : const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
1626 2737600 : const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
1627 2737600 : const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
1628 2737600 : const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
1629 2737600 : const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
1630 : __m128i x0[4];
1631 2737600 : x0[0] = input[0];
1632 2737600 : x0[1] = input[1];
1633 2737600 : x0[2] = input[2];
1634 2737600 : x0[3] = input[3];
1635 :
// u[0]/u[1]: rows 0 and 2 interleaved (low / high lanes);
// u[2]/u[3]: rows 1 and 3 interleaved (low / high lanes).
1636 : __m128i u[4];
1637 2737600 : u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
1638 2737600 : u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
1639 2737600 : u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
1640 2737600 : u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
1641 :
// Even x1 indices hold the low-lane products, odd indices the high-lane ones.
1642 : __m128i x1[16];
1643 2737600 : x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
1644 2737600 : x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
1645 2737600 : x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
1646 2737600 : x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
1647 2737600 : x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2
1648 2737600 : x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
1649 2737600 : x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4
1650 2737600 : x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
1651 2737600 : x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
1652 2737600 : x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
1653 2737600 : x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x3*sin3
1654 2737600 : x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
1655 2737600 : x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2
1656 2737600 : x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
1657 2737600 : x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1
1658 2737600 : x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
1659 :
1660 : __m128i x2[8];
1661 2737600 : x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
1662 2737600 : x2[1] = _mm_add_epi32(x1[1], x1[5]);
1663 2737600 : x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
1664 2737600 : x2[3] = _mm_add_epi32(x1[3], x1[7]);
1665 2737600 : x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3
1666 2737600 : x2[5] = _mm_add_epi32(x1[9], x1[11]);
1667 2737600 : x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1
1668 5475200 : x2[7] = _mm_add_epi32(x1[13], x1[15]);
1669 :
// Round, shift and pack: x2[2*i] supplies the low four lanes of output[i],
// x2[2*i + 1] the high four lanes.
1670 2737600 : const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1671 13688000 : for (int32_t i = 0; i < 4; ++i) {
1672 10950400 : __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
1673 21900800 : __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
1674 10950400 : out0 = _mm_srai_epi32(out0, INV_COS_BIT);
1675 10950400 : out1 = _mm_srai_epi32(out1, INV_COS_BIT);
1676 21900800 : output[i] = _mm_packs_epi32(out0, out1);
1677 : }
1678 2737600 : }
1679 :
1680 : // TODO(binpengsmail@gmail.com):
1681 : // Explore reusing the VP9 versions of the corresponding SSE2 functions and
1682 : // evaluate whether that allows a further speedup.
// 4-point inverse ADST that uses only the low four 16-bit lanes of
// input[0..3] (only _mm_unpacklo_epi16 is applied). Each 32-bit result is
// rounded by adding 1 << (INV_COS_BIT - 1), shifted right arithmetically by
// INV_COS_BIT, and packed to int16 with signed saturation. Because out0 is
// packed with itself, both 64-bit halves of every output[i] carry the same
// four values. cos_bit is ignored; INV_COS_BIT is used throughout.
1683 7637690 : static void iadst4_w4_new_sse2(const __m128i *input, __m128i *output,
1684 : int8_t cos_bit) {
1685 : (void)cos_bit;
1686 7637690 : const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
1687 7637760 : const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
1688 7637760 : const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
1689 7637760 : const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
1690 7637760 : const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
1691 7637760 : const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
1692 7637760 : const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
1693 7637760 : const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
1694 7637760 : const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
1695 : __m128i x0[4];
1696 7637760 : x0[0] = input[0];
1697 7637760 : x0[1] = input[1];
1698 7637760 : x0[2] = input[2];
1699 7637760 : x0[3] = input[3];
1700 :
// u[0]: rows 0 and 2 interleaved; u[1]: rows 1 and 3 interleaved (low lanes).
1701 : __m128i u[2];
1702 7637760 : u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
1703 7637760 : u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
1704 :
1705 : __m128i x1[8];
1706 7637760 : x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
1707 7637760 : x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
1708 7637760 : x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2
1709 7637760 : x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4
1710 7637760 : x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
1711 7637760 : x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x3*sin3
1712 7637760 : x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2
1713 7637760 : x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1
1714 :
1715 : __m128i x2[4];
1716 7637760 : x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
1717 7637760 : x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
1718 7637760 : x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3
1719 15275500 : x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
1720 :
// Round, shift and pack; the same vector fills both halves of output[i].
1721 7637760 : const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1722 38189000 : for (int32_t i = 0; i < 4; ++i) {
1723 61102600 : __m128i out0 = _mm_add_epi32(x2[i], rounding);
1724 30551300 : out0 = _mm_srai_epi32(out0, INV_COS_BIT);
1725 61102600 : output[i] = _mm_packs_epi32(out0, out0);
1726 : }
1727 7637760 : }
1728 :
// 8-point inverse ADST variant that reads only input[0]; presumably the
// caller guarantees input[1..7] are zero -- TODO confirm against the
// dispatch code. Stages 3, 5 are plain copies where iadst8_new_sse2 runs
// add/sub butterflies. Odd-indexed outputs are negated with a saturating
// 16-bit subtract from zero. cos_bit is cast to void; the cosine table and
// rounding constant come from INV_COS_BIT.
1729 1342 : static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
1730 : int8_t cos_bit) {
1731 : (void)cos_bit;
1732 1342 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
1733 1342 : const __m128i __zero = _mm_setzero_si128();
1734 1342 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1735 :
1736 1342 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
1737 1342 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
1738 1342 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1739 1342 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
1740 :
1741 : // stage 1: x is left uninitialised; every slot is assigned before it is read
1742 : __m128i x[8];
1743 1342 : x[1] = input[0];
1744 :
1745 : // stage 2: derives x[0] and x[1] from the single loaded input
1746 5368 : btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);
1747 :
1748 : // stage 3
1749 1342 : x[4] = x[0];
1750 1342 : x[5] = x[1];
1751 :
1752 : // stage 4
1753 21472 : btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
1754 :
1755 : // stage 5
1756 1342 : x[2] = x[0];
1757 1342 : x[3] = x[1];
1758 1342 : x[6] = x[4];
1759 1342 : x[7] = x[5];
1760 :
1761 : // stage 6
1762 21472 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
1763 21472 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
1764 :
1765 : // stage 7: output permutation; odd outputs are negated (saturating)
1766 1342 : output[0] = x[0];
1767 1342 : output[1] = _mm_subs_epi16(__zero, x[4]);
1768 1342 : output[2] = x[6];
1769 1342 : output[3] = _mm_subs_epi16(__zero, x[2]);
1770 1342 : output[4] = x[3];
1771 1342 : output[5] = _mm_subs_epi16(__zero, x[7]);
1772 1342 : output[6] = x[5];
1773 1342 : output[7] = _mm_subs_epi16(__zero, x[1]);
1774 1342 : }
1775 :
// Full 8-point inverse ADST: reads input[0..7], writes output[0..7].
// Stage 1 permutes the inputs, stages 2/4/6 apply btf_16_sse2 rotations with
// the cosine pairs built below, stages 3/5 apply add/sub butterflies, and
// stage 7 permutes into output while negating the odd-indexed results with a
// saturating 16-bit subtract from zero.
// cos_bit is cast to void; the cosine table and rounding constant come from
// INV_COS_BIT.
1776 7163460 : static void iadst8_new_sse2(const __m128i *input, __m128i *output,
1777 : int8_t cos_bit) {
1778 : (void)cos_bit;
1779 7163460 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
1780 7163310 : const __m128i __zero = _mm_setzero_si128();
1781 7163310 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1782 :
1783 7163310 : const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
1784 7163310 : const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
1785 7163310 : const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
1786 7163310 : const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
1787 7163310 : const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
1788 7163310 : const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
1789 7163310 : const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
1790 7163310 : const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
1791 7163310 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
1792 7163310 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
1793 7163310 : const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
1794 7163310 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1795 7163310 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
1796 :
1797 : // stage 1: input permutation
1798 : __m128i x[8];
1799 7163310 : x[0] = input[7];
1800 7163310 : x[1] = input[0];
1801 7163310 : x[2] = input[5];
1802 7163310 : x[3] = input[2];
1803 7163310 : x[4] = input[3];
1804 7163310 : x[5] = input[4];
1805 7163310 : x[6] = input[1];
1806 7163310 : x[7] = input[6];
1807 :
1808 : // stage 2: rotations on adjacent pairs, results stored in place
1809 114613000 : btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
1810 114613000 : btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
1811 114613000 : btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
1812 114613000 : btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
1813 :
1814 : // stage 3: butterflies between slot i and slot i + 4
1815 14326600 : btf_16_adds_subs_sse2(x[0], x[4]);
1816 14326600 : btf_16_adds_subs_sse2(x[1], x[5]);
1817 14326600 : btf_16_adds_subs_sse2(x[2], x[6]);
1818 14326600 : btf_16_adds_subs_sse2(x[3], x[7]);
1819 :
1820 : // stage 4
1821 114613000 : btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
1822 114613000 : btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
1823 :
1824 : // stage 5: butterflies between slot i and slot i + 2
1825 14326600 : btf_16_adds_subs_sse2(x[0], x[2]);
1826 14326600 : btf_16_adds_subs_sse2(x[1], x[3]);
1827 14326600 : btf_16_adds_subs_sse2(x[4], x[6]);
1828 14326600 : btf_16_adds_subs_sse2(x[5], x[7]);
1829 :
1830 : // stage 6
1831 114613000 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
1832 114613000 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
1833 :
1834 : // stage 7: output permutation; odd outputs are negated (saturating)
1835 7163310 : output[0] = x[0];
1836 7163310 : output[1] = _mm_subs_epi16(__zero, x[4]);
1837 7163310 : output[2] = x[6];
1838 7163310 : output[3] = _mm_subs_epi16(__zero, x[2]);
1839 7163310 : output[4] = x[3];
1840 7163310 : output[5] = _mm_subs_epi16(__zero, x[7]);
1841 7163310 : output[6] = x[5];
1842 7163310 : output[7] = _mm_subs_epi16(__zero, x[1]);
1843 7163310 : }
1844 :
// 8-point inverse ADST with the same stage structure, cosine pairs and
// input/output permutations as iadst8_new_sse2; the only difference visible
// here is that the rotations use btf_16_4p_sse2 instead of btf_16_sse2.
// NOTE(review): "w4"/"4p" presumably mean only the low four 16-bit lanes are
// significant -- confirm against the macro definition.
// cos_bit is cast to void; the cosine table and rounding constant come from
// INV_COS_BIT.
1845 1321730 : static void iadst8_w4_new_sse2(const __m128i *input, __m128i *output,
1846 : int8_t cos_bit) {
1847 : (void)cos_bit;
1848 1321730 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
1849 1321720 : const __m128i __zero = _mm_setzero_si128();
1850 1321720 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1851 :
1852 1321720 : const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
1853 1321720 : const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
1854 1321720 : const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
1855 1321720 : const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
1856 1321720 : const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
1857 1321720 : const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
1858 1321720 : const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
1859 1321720 : const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
1860 1321720 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
1861 1321720 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
1862 1321720 : const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
1863 1321720 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1864 1321720 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
1865 :
1866 : // stage 1: input permutation
1867 : __m128i x[8];
1868 1321720 : x[0] = input[7];
1869 1321720 : x[1] = input[0];
1870 1321720 : x[2] = input[5];
1871 1321720 : x[3] = input[2];
1872 1321720 : x[4] = input[3];
1873 1321720 : x[5] = input[4];
1874 1321720 : x[6] = input[1];
1875 1321720 : x[7] = input[6];
1876 :
1877 : // stage 2: rotations on adjacent pairs, results stored in place
1878 11895500 : btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
1879 11895500 : btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
1880 11895500 : btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
1881 11895500 : btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
1882 :
1883 : // stage 3: butterflies between slot i and slot i + 4
1884 2643450 : btf_16_adds_subs_sse2(x[0], x[4]);
1885 2643450 : btf_16_adds_subs_sse2(x[1], x[5]);
1886 2643450 : btf_16_adds_subs_sse2(x[2], x[6]);
1887 2643450 : btf_16_adds_subs_sse2(x[3], x[7]);
1888 :
1889 : // stage 4
1890 11895500 : btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
1891 11895500 : btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
1892 :
1893 : // stage 5: butterflies between slot i and slot i + 2
1894 2643450 : btf_16_adds_subs_sse2(x[0], x[2]);
1895 2643450 : btf_16_adds_subs_sse2(x[1], x[3]);
1896 2643450 : btf_16_adds_subs_sse2(x[4], x[6]);
1897 2643450 : btf_16_adds_subs_sse2(x[5], x[7]);
1898 :
1899 : // stage 6
1900 11895500 : btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
1901 11895500 : btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
1902 :
1903 : // stage 7: output permutation; odd outputs are negated (saturating)
1904 1321720 : output[0] = x[0];
1905 1321720 : output[1] = _mm_subs_epi16(__zero, x[4]);
1906 1321720 : output[2] = x[6];
1907 1321720 : output[3] = _mm_subs_epi16(__zero, x[2]);
1908 1321720 : output[4] = x[3];
1909 1321720 : output[5] = _mm_subs_epi16(__zero, x[7]);
1910 1321720 : output[6] = x[5];
1911 1321720 : output[7] = _mm_subs_epi16(__zero, x[1]);
1912 1321720 : }
1913 :
// iadst16 stage 3: applies btf_16_adds_subs_sse2 in place to each pair
// (x[i], x[i + 8]) for i = 0..7. x must hold at least 16 entries.
1914 1575200 : static INLINE void iadst16_stage3_ssse3(__m128i *x) {
1915 3150400 : btf_16_adds_subs_sse2(x[0], x[8]);
1916 3150400 : btf_16_adds_subs_sse2(x[1], x[9]);
1917 3150400 : btf_16_adds_subs_sse2(x[2], x[10]);
1918 3150400 : btf_16_adds_subs_sse2(x[3], x[11]);
1919 3150400 : btf_16_adds_subs_sse2(x[4], x[12]);
1920 3150400 : btf_16_adds_subs_sse2(x[5], x[13]);
1921 3150400 : btf_16_adds_subs_sse2(x[6], x[14]);
1922 3150400 : btf_16_adds_subs_sse2(x[7], x[15]);
1923 1575200 : }
1924 :
// iadst16 stage 4: applies btf_16_sse2 in place to the adjacent pairs of
// x[8..15], using cosine pairs built from cospi[8]/[56] and cospi[24]/[40].
// x[0..7] are not touched.
// NOTE(review): __rounding and cos_bit are not referenced by name in this
// body; presumably the btf_16_sse2 macro expansion picks them up -- confirm.
1925 854235 : static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
1926 : const __m128i __rounding,
1927 : int8_t cos_bit) {
1928 854235 : const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
1929 854235 : const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
1930 854235 : const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
1931 854235 : const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
1932 854235 : const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
1933 854235 : const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
1934 13667800 : btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
1935 13667800 : btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
1936 13667800 : btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
1937 13667800 : btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
1938 854235 : }
1939 :
// iadst16 stage 5: applies btf_16_adds_subs_sse2 in place to each pair
// (x[i], x[i + 4]) within the two halves x[0..7] and x[8..15].
1940 1575190 : static INLINE void iadst16_stage5_ssse3(__m128i *x) {
1941 3150380 : btf_16_adds_subs_sse2(x[0], x[4]);
1942 3150380 : btf_16_adds_subs_sse2(x[1], x[5]);
1943 3150380 : btf_16_adds_subs_sse2(x[2], x[6]);
1944 3150380 : btf_16_adds_subs_sse2(x[3], x[7]);
1945 3150380 : btf_16_adds_subs_sse2(x[8], x[12]);
1946 3150380 : btf_16_adds_subs_sse2(x[9], x[13]);
1947 3150380 : btf_16_adds_subs_sse2(x[10], x[14]);
1948 3150380 : btf_16_adds_subs_sse2(x[11], x[15]);
1949 1575190 : }
1950 :
// iadst16 stage 6: applies btf_16_sse2 in place to the pairs (4,5), (6,7),
// (12,13) and (14,15) of x, using cosine pairs built from cospi[16]/[48].
// The other eight slots are not touched.
// NOTE(review): __rounding and cos_bit are not referenced by name in this
// body; presumably the btf_16_sse2 macro expansion picks them up -- confirm.
1951 854240 : static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
1952 : const __m128i __rounding,
1953 : int8_t cos_bit) {
1954 854240 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
1955 854240 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
1956 854240 : const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
1957 13667800 : btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
1958 13667800 : btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
1959 13667800 : btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
1960 13667800 : btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
1961 854240 : }
1962 :
// iadst16 stage 7: applies btf_16_adds_subs_sse2 in place to each pair
// (x[i], x[i + 2]) within every group of four consecutive slots of x[0..15].
1963 1575200 : static INLINE void iadst16_stage7_ssse3(__m128i *x) {
1964 3150410 : btf_16_adds_subs_sse2(x[0], x[2]);
1965 3150410 : btf_16_adds_subs_sse2(x[1], x[3]);
1966 3150410 : btf_16_adds_subs_sse2(x[4], x[6]);
1967 3150410 : btf_16_adds_subs_sse2(x[5], x[7]);
1968 3150410 : btf_16_adds_subs_sse2(x[8], x[10]);
1969 3150410 : btf_16_adds_subs_sse2(x[9], x[11]);
1970 3150410 : btf_16_adds_subs_sse2(x[12], x[14]);
1971 3150410 : btf_16_adds_subs_sse2(x[13], x[15]);
1972 1575200 : }
1973 :
// iadst16 stage 8: applies btf_16_sse2 in place to the pairs (2,3), (6,7),
// (10,11) and (14,15) of x, using the (cospi[32], +/-cospi[32]) pairs.
// The other eight slots are not touched.
// NOTE(review): __rounding and cos_bit are not referenced by name in this
// body; presumably the btf_16_sse2 macro expansion picks them up -- confirm.
1974 854278 : static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
1975 : const __m128i __rounding,
1976 : int8_t cos_bit) {
1977 854278 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1978 854278 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
1979 13668400 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
1980 13668400 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
1981 13668400 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
1982 13668400 : btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
1983 854278 : }
1984 :
// iadst16 stage 9 (final): copies the 16 work slots of x into output in
// permuted order. Even-indexed outputs are copied unchanged; odd-indexed
// outputs are negated per 16-bit lane with a saturating subtract from zero.
// x is only read.
1985 1575230 : static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
1986 1575230 : const __m128i __zero = _mm_setzero_si128();
1987 1575230 : output[0] = x[0];
1988 1575230 : output[1] = _mm_subs_epi16(__zero, x[8]);
1989 1575230 : output[2] = x[12];
1990 1575230 : output[3] = _mm_subs_epi16(__zero, x[4]);
1991 1575230 : output[4] = x[6];
1992 1575230 : output[5] = _mm_subs_epi16(__zero, x[14]);
1993 1575230 : output[6] = x[10];
1994 1575230 : output[7] = _mm_subs_epi16(__zero, x[2]);
1995 1575230 : output[8] = x[3];
1996 1575230 : output[9] = _mm_subs_epi16(__zero, x[11]);
1997 1575230 : output[10] = x[15];
1998 1575230 : output[11] = _mm_subs_epi16(__zero, x[7]);
1999 1575230 : output[12] = x[5];
2000 1575230 : output[13] = _mm_subs_epi16(__zero, x[13]);
2001 1575230 : output[14] = x[9];
2002 1575230 : output[15] = _mm_subs_epi16(__zero, x[1]);
2003 1575230 : }
2004 :
// 16-point inverse ADST variant that reads only input[0]; presumably the
// caller guarantees input[1..15] are zero -- TODO confirm against the
// dispatch code. Stages 3, 5 and 7 are plain copies at the positions where
// the shared iadst16_stage3/5/7_ssse3 helpers run add/sub butterflies;
// stages 8 and 9 reuse the shared helpers.
// cos_bit is cast to void but still forwarded to iadst16_stage8_ssse3; the
// cosine table and rounding constant come from INV_COS_BIT.
2005 42 : static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
2006 : int8_t cos_bit) {
2007 : (void)cos_bit;
2008 42 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
2009 42 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
2010 :
2011 42 : const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
2012 42 : const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
2013 42 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
2014 42 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
2015 :
2016 : // stage 1: x is left uninitialised; every slot is assigned before it is read
2017 : __m128i x[16];
2018 42 : x[1] = input[0];
2019 :
2020 : // stage 2: derives x[0] and x[1] from the single loaded input
2021 168 : btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
2022 :
2023 : // stage 3
2024 42 : x[8] = x[0];
2025 42 : x[9] = x[1];
2026 :
2027 : // stage 4
2028 672 : btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
2029 :
2030 : // stage 5
2031 42 : x[4] = x[0];
2032 42 : x[5] = x[1];
2033 42 : x[12] = x[8];
2034 42 : x[13] = x[9];
2035 :
2036 : // stage 6
2037 672 : btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
2038 672 : btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
2039 :
2040 : // stage 7
2041 42 : x[2] = x[0];
2042 42 : x[3] = x[1];
2043 42 : x[6] = x[4];
2044 42 : x[7] = x[5];
2045 42 : x[10] = x[8];
2046 42 : x[11] = x[9];
2047 42 : x[14] = x[12];
2048 42 : x[15] = x[13];
2049 : // stage 8~9
2050 42 : iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
2051 42 : iadst16_stage9_ssse3(output, x);
2052 42 : }
2053 :
// 16-point inverse ADST variant that reads only input[0]..input[7];
// presumably the caller guarantees input[8..15] are zero -- TODO confirm
// against the dispatch code. Stage 2 expands each loaded slot into a pair
// with btf_16_ssse3; stages 3-9 are the shared iadst16_stageN_ssse3 helpers.
// cos_bit is cast to void but still forwarded to the stage helpers; the
// cosine table and rounding constant come from INV_COS_BIT.
2054 252 : static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
2055 : int8_t cos_bit) {
2056 : (void)cos_bit;
2057 252 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
2058 252 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
2059 :
2060 : // stage 1: only 8 of the 16 slots are loaded; stage 2 assigns the rest
2061 : __m128i x[16];
2062 252 : x[1] = input[0];
2063 252 : x[3] = input[2];
2064 252 : x[5] = input[4];
2065 252 : x[7] = input[6];
2066 252 : x[8] = input[7];
2067 252 : x[10] = input[5];
2068 252 : x[12] = input[3];
2069 252 : x[14] = input[1];
2070 :
2071 : // stage 2: each call takes one loaded slot and assigns an adjacent pair
2072 1008 : btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
2073 1008 : btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
2074 1008 : btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
2075 1008 : btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
2076 1008 : btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
2077 1008 : btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
2078 1008 : btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
2079 1008 : btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
2080 :
2081 : // stage 3~9
2082 252 : iadst16_stage3_ssse3(x);
2083 252 : iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
2084 252 : iadst16_stage5_ssse3(x);
2085 252 : iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
2086 252 : iadst16_stage7_ssse3(x);
2087 252 : iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
2088 252 : iadst16_stage9_ssse3(output, x);
2089 252 : }
2090 :
2091 853967 : static void iadst16_new_sse2(const __m128i *input, __m128i *output,
2092 : int8_t cos_bit) {
2093 : (void)cos_bit;
2094 853967 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
2095 853967 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
2096 853967 : const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
2097 853967 : const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
2098 853967 : const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
2099 853967 : const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
2100 853967 : const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
2101 853967 : const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
2102 853967 : const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
2103 853967 : const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
2104 853967 : const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
2105 853967 : const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
2106 853967 : const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
2107 853967 : const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
2108 853967 : const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
2109 853967 : const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
2110 853967 : const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
2111 853967 : const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
2112 :
2113 : // stage 1
2114 : __m128i x[16];
2115 853967 : x[0] = input[15];
2116 853967 : x[1] = input[0];
2117 853967 : x[2] = input[13];
2118 853967 : x[3] = input[2];
2119 853967 : x[4] = input[11];
2120 853967 : x[5] = input[4];
2121 853967 : x[6] = input[9];
2122 853967 : x[7] = input[6];
2123 853967 : x[8] = input[7];
2124 853967 : x[9] = input[8];
2125 853967 : x[10] = input[5];
2126 853967 : x[11] = input[10];
2127 853967 : x[12] = input[3];
2128 853967 : x[13] = input[12];
2129 853967 : x[14] = input[1];
2130 853967 : x[15] = input[14];
2131 :
2132 : // stage 2
2133 13663500 : btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
2134 13663500 : btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
2135 13663500 : btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
2136 13663500 : btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
2137 13663500 : btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
2138 13663500 : btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
2139 13663500 : btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
2140 13663500 : btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
2141 :
2142 : // stage 3~9
2143 853967 : iadst16_stage3_ssse3(x);
2144 853985 : iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
2145 853983 : iadst16_stage5_ssse3(x);
2146 853989 : iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
2147 853985 : iadst16_stage7_ssse3(x);
2148 853986 : iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
2149 853984 : iadst16_stage9_ssse3(output, x);
2150 853983 : }
2151 :
2152 720996 : static void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
2153 : int8_t cos_bit) {
2154 : (void)cos_bit;
2155 720996 : const int32_t *cospi = cospi_arr(INV_COS_BIT);
2156 720993 : const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
2157 :
2158 720993 : const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
2159 720993 : const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
2160 720993 : const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
2161 720993 : const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
2162 720993 : const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
2163 720993 : const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
2164 720993 : const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
2165 720993 : const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
2166 720993 : const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
2167 720993 : const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
2168 720993 : const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
2169 720993 : const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
2170 720993 : const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
2171 720993 : const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
2172 720993 : const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
2173 720993 : const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
2174 720993 : const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
2175 720993 : const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
2176 720993 : const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
2177 720993 : const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
2178 720993 : const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
2179 720993 : const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
2180 720993 : const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
2181 720993 : const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
2182 720993 : const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
2183 720993 : const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
2184 720993 : const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
2185 :
2186 : // stage 1
2187 : __m128i x[16];
2188 720993 : x[0] = input[15];
2189 720993 : x[1] = input[0];
2190 720993 : x[2] = input[13];
2191 720993 : x[3] = input[2];
2192 720993 : x[4] = input[11];
2193 720993 : x[5] = input[4];
2194 720993 : x[6] = input[9];
2195 720993 : x[7] = input[6];
2196 720993 : x[8] = input[7];
2197 720993 : x[9] = input[8];
2198 720993 : x[10] = input[5];
2199 720993 : x[11] = input[10];
2200 720993 : x[12] = input[3];
2201 720993 : x[13] = input[12];
2202 720993 : x[14] = input[1];
2203 720993 : x[15] = input[14];
2204 :
2205 : // stage 2
2206 6488940 : btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
2207 6488940 : btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
2208 6488940 : btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
2209 6488940 : btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
2210 6488940 : btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
2211 6488940 : btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
2212 6488940 : btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
2213 6488940 : btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
2214 :
2215 : // stage 3
2216 720993 : iadst16_stage3_ssse3(x);
2217 :
2218 : // stage 4
2219 6489040 : btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
2220 6489040 : btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
2221 6489040 : btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
2222 6489040 : btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
2223 :
2224 : // stage 5
2225 721004 : iadst16_stage5_ssse3(x);
2226 :
2227 : // stage 6
2228 6489040 : btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
2229 6489040 : btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
2230 6489040 : btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
2231 6489040 : btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
2232 :
2233 : // stage 7
2234 721004 : iadst16_stage7_ssse3(x);
2235 :
2236 : // stage 8
2237 6489030 : btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
2238 6489030 : btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
2239 6489030 : btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
2240 6489030 : btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
2241 :
2242 : // stage 9
2243 721003 : iadst16_stage9_ssse3(output, x);
2244 721000 : }
2245 :
2246 5108550 : static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
2247 : int8_t cos_bit) {
2248 : (void)cos_bit;
2249 5108550 : const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
2250 5108550 : const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
2251 25542500 : for (int32_t i = 0; i < 4; ++i) {
2252 20434000 : __m128i x = _mm_mulhrs_epi16(input[i], scale);
2253 40868000 : output[i] = _mm_adds_epi16(x, input[i]);
2254 : }
2255 5108550 : }
2256 :
// 8-point inverse identity transform: every 16-bit lane of the eight input
// vectors is doubled with signed saturation. `cos_bit` is unused. `input`
// and `output` may be the same buffer, because each element is read before
// it is written.
static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
    int8_t cos_bit) {
    (void)cos_bit;
    int32_t row = 0;
    do {
        const __m128i src = input[row];
        output[row] = _mm_adds_epi16(src, src);
    } while (++row < 8);
}
2263 :
2264 359808 : static void iidentity16_new_ssse3(const __m128i *input, __m128i *output,
2265 : int8_t cos_bit) {
2266 : (void)cos_bit;
2267 359808 : const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
2268 359808 : const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
2269 6116510 : for (int32_t i = 0; i < 16; ++i) {
2270 5756700 : __m128i x = _mm_mulhrs_epi16(input[i], scale);
2271 5756700 : __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
2272 11513400 : output[i] = _mm_adds_epi16(x, srcx2);
2273 : }
2274 359808 : }
2275 :
2276 155549000 : static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
2277 : __m128i res) {
2278 155549000 : const __m128i zero = _mm_setzero_si128();
2279 311098000 : __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
2280 155549000 : return _mm_packus_epi16(x0, x0);
2281 : }
2282 :
2283 16968400 : static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in,
2284 : uint8_t *output_r, int32_t stride_r,
2285 : uint8_t *output_w, int32_t stride_w,
2286 : int32_t flipud, const int32_t height) {
2287 16968400 : int32_t j = flipud ? (height - 1) : 0;
2288 16968400 : const int32_t step = flipud ? -1 : 1;
2289 16968400 : const __m128i zero = _mm_setzero_si128();
2290 119118000 : for (int32_t i = 0; i < height; ++i, j += step) {
2291 204299000 : const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output_r + i * stride_r)));
2292 204299000 : __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
2293 102149000 : u = _mm_packus_epi16(u, zero);
2294 102149000 : *((uint32_t *)(output_w + i * stride_w)) = _mm_cvtsi128_si32(u);
2295 : }
2296 16968400 : }
2297 :
2298 17268100 : static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in,
2299 : uint8_t *output_r, int32_t stride_r,
2300 : uint8_t *output_w, int32_t stride_w,
2301 : int32_t flipud, const int32_t height) {
2302 17268100 : int32_t j = flipud ? (height - 1) : 0;
2303 17268100 : const int32_t step = flipud ? -1 : 1;
2304 162120000 : for (int32_t i = 0; i < height; ++i, j += step) {
2305 144860000 : const __m128i v = _mm_loadl_epi64((__m128i const *)(output_r + i * stride_r));
2306 144860000 : const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
2307 144852000 : _mm_storel_epi64((__m128i *)(output_w + i * stride_w), u);
2308 : }
2309 17260300 : }
2310 :
2311 : // 1D functions process process 8 pixels at one time.
2312 : static const transform_1d_ssse3
2313 : lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
2314 : { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 },
2315 : { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
2316 : { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 },
2317 : { idct32_new_sse2, NULL, NULL },
2318 : { idct64_low32_new_ssse3, NULL, NULL },
2319 : };
2320 :
2321 : // functions for blocks with eob at DC and within
2322 : // topleft 8x8, 16x16, 32x32 corner
2323 : static const transform_1d_ssse3
2324 : lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
2325 : {
2326 : { idct4_new_sse2, idct4_new_sse2, NULL, NULL },
2327 : { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL },
2328 : { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL },
2329 : },
2330 : {
2331 : { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL },
2332 : { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL },
2333 : { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } },
2334 : {
2335 : { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2,
2336 : NULL },
2337 : { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2,
2338 : NULL },
2339 : { NULL, NULL, NULL, NULL },
2340 : },
2341 : {
2342 : { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3,
2343 : idct32_new_sse2 },
2344 : { NULL, NULL, NULL, NULL },
2345 : { NULL, NULL, NULL, NULL } },
2346 : {
2347 : { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3,
2348 : idct64_low32_new_ssse3 },
2349 : { NULL, NULL, NULL, NULL },
2350 : { NULL, NULL, NULL, NULL } }
2351 : };
2352 :
2353 : // 1D functions process process 4 pixels at one time.
2354 : // used in 4x4, 4x8, 4x16, 8x4, 16x4
2355 : static const transform_1d_ssse3
2356 : lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
2357 : { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 },
2358 : { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 },
2359 : { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 },
2360 : { NULL, NULL, NULL },
2361 : { NULL, NULL, NULL },
2362 : };
2363 :
2364 2157420 : static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
2365 : int32_t stride, int32_t shift, int32_t height,
2366 : int32_t txw_idx, int32_t rect_type) {
2367 2157420 : const int32_t *input_row = input;
2368 2157420 : const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
2369 2157420 : const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
2370 2157420 : (1 << (NewSqrt2Bits - shift - 1)));
2371 2157420 : const __m128i one = _mm_set1_epi16(1);
2372 2157420 : const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
2373 2157420 : if (rect_type != 1 && rect_type != -1) {
2374 14593400 : for (int32_t i = 0; i < height; ++i) {
2375 13019600 : const __m128i src = load_32bit_to_16bit(input_row);
2376 13019200 : input_row += stride;
2377 13019200 : __m128i lo = _mm_unpacklo_epi16(src, one);
2378 13019200 : __m128i hi = _mm_unpackhi_epi16(src, one);
2379 13019200 : lo = _mm_madd_epi16(lo, scale_rounding);
2380 13019200 : hi = _mm_madd_epi16(hi, scale_rounding);
2381 13019200 : lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
2382 13019200 : hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
2383 26038400 : out[i] = _mm_packs_epi32(lo, hi);
2384 : }
2385 : }
2386 : else {
2387 : const __m128i rect_scale =
2388 583174 : _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
2389 6845620 : for (int32_t i = 0; i < height; ++i) {
2390 6262650 : __m128i src = load_32bit_to_16bit(input_row);
2391 6262450 : src = _mm_mulhrs_epi16(src, rect_scale);
2392 6262450 : input_row += stride;
2393 6262450 : __m128i lo = _mm_unpacklo_epi16(src, one);
2394 6262450 : __m128i hi = _mm_unpackhi_epi16(src, one);
2395 6262450 : lo = _mm_madd_epi16(lo, scale_rounding);
2396 6262450 : hi = _mm_madd_epi16(hi, scale_rounding);
2397 6262450 : lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
2398 6262450 : hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
2399 12524900 : out[i] = _mm_packs_epi32(lo, hi);
2400 : }
2401 : }
2402 2156820 : }
2403 :
2404 2436890 : static INLINE void iidentity_col_8xn_ssse3(
2405 : uint8_t *output_r, int32_t stride_r,
2406 : uint8_t *output_w, int32_t stride_w,
2407 : __m128i *buf, int32_t shift, int32_t height,
2408 : int32_t txh_idx) {
2409 2436890 : const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
2410 2436890 : const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
2411 4873790 : const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
2412 2436890 : const __m128i one = _mm_set1_epi16(1);
2413 2436890 : const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
2414 2436890 : const __m128i zero = _mm_setzero_si128();
2415 22922900 : for (int32_t h = 0; h < height; ++h) {
2416 20486100 : __m128i lo = _mm_unpacklo_epi16(buf[h], one);
2417 40972100 : __m128i hi = _mm_unpackhi_epi16(buf[h], one);
2418 20486100 : lo = _mm_madd_epi16(lo, scale_coeff);
2419 20486100 : hi = _mm_madd_epi16(hi, scale_coeff);
2420 20486100 : lo = _mm_srai_epi32(lo, NewSqrt2Bits);
2421 40972100 : hi = _mm_srai_epi32(hi, NewSqrt2Bits);
2422 20486100 : lo = _mm_add_epi32(lo, shift_rounding);
2423 20486100 : hi = _mm_add_epi32(hi, shift_rounding);
2424 20486100 : lo = _mm_srai_epi32(lo, -shift);
2425 40972100 : hi = _mm_srai_epi32(hi, -shift);
2426 20486100 : __m128i x = _mm_packs_epi32(lo, hi);
2427 :
2428 20486100 : const __m128i pred = _mm_loadl_epi64((__m128i const *)(output_r));
2429 40972100 : x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
2430 20486100 : const __m128i u = _mm_packus_epi16(x, x);
2431 20486100 : _mm_storel_epi64((__m128i *)(output_w), u);
2432 20486100 : output_r += stride_r;
2433 20486100 : output_w += stride_w;
2434 : }
2435 2436890 : }
2436 :
2437 819274 : static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
2438 : uint8_t *output_r, int32_t stride_r,
2439 : uint8_t *output_w, int32_t stride_w,
2440 : TxSize tx_size) {
2441 819274 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
2442 819274 : const int32_t txw_idx = get_txw_idx(tx_size);
2443 819271 : const int32_t txh_idx = get_txh_idx(tx_size);
2444 819274 : const int32_t txfm_size_col = tx_size_wide[tx_size];
2445 819274 : const int32_t txfm_size_row = tx_size_high[tx_size];
2446 819274 : const int32_t input_stride = AOMMIN(32, txfm_size_col);
2447 819274 : const int32_t row_max = AOMMIN(32, txfm_size_row);
2448 819274 : const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2449 : __m128i buf[32];
2450 :
2451 1761160 : for (int32_t i = 0; i < (input_stride >> 3); ++i) {
2452 941884 : iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max,
2453 : txw_idx, rect_type);
2454 941883 : iidentity_col_8xn_ssse3(
2455 941883 : output_r + 8 * i, stride_r,
2456 941883 : output_w + 8 * i, stride_w,
2457 941883 : buf, shift[1], row_max,
2458 : txh_idx);
2459 : }
2460 819272 : }
2461 :
2462 12118300 : static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
2463 : uint8_t *output_r, int32_t stride_r,
2464 : uint8_t *output_w, int32_t stride_w,
2465 : TxType tx_type, TxSize tx_size_, int32_t eob) {
2466 : (void)tx_size_;
2467 : (void)eob;
2468 : __m128i buf[4];
2469 12118300 : const TxSize tx_size = TX_4X4;
2470 12118300 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
2471 12118300 : const int32_t txw_idx = get_txw_idx(tx_size);
2472 12118100 : const int32_t txh_idx = get_txh_idx(tx_size);
2473 12118300 : const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
2474 12118300 : const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
2475 12118300 : const int32_t txfm_size_col = tx_size_wide[tx_size];
2476 12118300 : const int32_t txfm_size_row = tx_size_high[tx_size];
2477 :
2478 12118300 : const transform_1d_ssse3 row_txfm =
2479 12118300 : lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
2480 12118300 : const transform_1d_ssse3 col_txfm =
2481 12118300 : lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
2482 :
2483 : int32_t ud_flip, lr_flip;
2484 12118300 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2485 12117900 : load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
2486 12117600 : transpose_16bit_4x4(buf, buf);
2487 12118000 : row_txfm(buf, buf, cos_bit_row);
2488 12120400 : if (lr_flip) {
2489 : __m128i temp[4];
2490 690087 : flip_buf_sse2(buf, temp, txfm_size_col);
2491 690081 : transpose_16bit_4x4(temp, buf);
2492 : }
2493 : else
2494 11430300 : transpose_16bit_4x4(buf, buf);
2495 12119800 : col_txfm(buf, buf, cos_bit_col);
2496 12120500 : round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
2497 12119900 : lowbd_write_buffer_4xn_sse2(buf, output_r, stride_r, output_w, stride_w,
2498 : ud_flip, txfm_size_row);
2499 12120200 : }
2500 :
2501 26449600 : static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
2502 : __m128i res0, __m128i res1) {
2503 26449600 : const __m128i zero = _mm_setzero_si128();
2504 26449600 : __m128i x0 = _mm_unpacklo_epi8(pred, zero);
2505 26449600 : __m128i x1 = _mm_unpackhi_epi8(pred, zero);
2506 26449600 : x0 = _mm_adds_epi16(res0, x0);
2507 26449600 : x1 = _mm_adds_epi16(res1, x1);
2508 26449600 : return _mm_packus_epi16(x0, x1);
2509 : }
2510 :
2511 3307310 : static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in,
2512 : uint8_t *output_r, int32_t stride_r, uint8_t *output_w, int32_t stride_w,
2513 : int32_t flipud,
2514 : int32_t height) {
2515 3307310 : int32_t j = flipud ? (height - 1) : 0;
2516 3307310 : const int32_t step = flipud ? -1 : 1;
2517 29756600 : for (int32_t i = 0; i < height; ++i, j += step) {
2518 26449300 : __m128i v = _mm_loadu_si128((__m128i const *)(output_r + i * stride_r));
2519 26449300 : __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
2520 26449200 : _mm_storeu_si128((__m128i *)(output_w + i * stride_w), u);
2521 : }
2522 3307300 : }
2523 :
2524 12994000 : static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
2525 : int32_t size) {
2526 12994000 : const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
2527 122334000 : for (int32_t i = 0; i < size; ++i)
2528 218679000 : output[i] = _mm_mulhrs_epi16(input[i], scale);
2529 12994000 : }
2530 :
2531 13153500 : static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
2532 : const int32_t *input,
2533 : uint8_t *output_r, int32_t stride_r,
2534 : uint8_t *output_w, int32_t stride_w,
2535 : TxType tx_type,
2536 : TxSize tx_size, int32_t eob) {
2537 : __m128i buf1[64 * 8];
2538 : int32_t eobx, eoby;
2539 13153500 : get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
2540 13152900 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
2541 13152900 : const int32_t txw_idx = get_txw_idx(tx_size);
2542 13152300 : const int32_t txh_idx = get_txh_idx(tx_size);
2543 13151900 : const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
2544 13151900 : const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
2545 13151900 : const int32_t txfm_size_col = tx_size_wide[tx_size];
2546 13151900 : const int32_t txfm_size_row = tx_size_high[tx_size];
2547 13151900 : const int32_t buf_size_w_div8 = txfm_size_col >> 3;
2548 13151900 : const int32_t buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
2549 13151900 : const int32_t buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
2550 13151900 : const int32_t input_stride = AOMMIN(32, txfm_size_col);
2551 13151900 : const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2552 : ASSERT(eobx < 32);
2553 : ASSERT(eoby < 32);
2554 13151800 : const int32_t fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
2555 13151800 : const int32_t fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
2556 13151800 : const transform_1d_ssse3 row_txfm =
2557 13151800 : lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
2558 13151800 : const transform_1d_ssse3 col_txfm =
2559 13151800 : lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
2560 :
2561 13151800 : assert(col_txfm != NULL);
2562 13151800 : assert(row_txfm != NULL);
2563 : int32_t ud_flip, lr_flip;
2564 13151800 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2565 30578400 : for (int32_t i = 0; i < buf_size_nonzero_h_div8; i++) {
2566 : __m128i buf0[64];
2567 17418800 : const int32_t *input_row = input + i * input_stride * 8;
2568 38777600 : for (int32_t j = 0; j < buf_size_nonzero_w_div8; ++j) {
2569 21354200 : __m128i *buf0_cur = buf0 + j * 8;
2570 21354200 : load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
2571 21341700 : transpose_16bit_8x8(buf0_cur, buf0_cur);
2572 : }
2573 17423400 : if (rect_type == 1 || rect_type == -1)
2574 6701580 : round_shift_ssse3(buf0, buf0, input_stride); // rect special code
2575 17423700 : row_txfm(buf0, buf0, cos_bit_row);
2576 17422400 : round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
2577 17420600 : __m128i *_buf1 = buf1 + i * 8;
2578 17420600 : if (lr_flip) {
2579 1714570 : for (int32_t j = 0; j < buf_size_w_div8; ++j) {
2580 : __m128i temp[8];
2581 899330 : flip_buf_sse2(buf0 + 8 * j, temp, 8);
2582 899326 : transpose_16bit_8x8(temp,
2583 899326 : _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
2584 : }
2585 : }
2586 : else {
2587 37067300 : for (int32_t j = 0; j < buf_size_w_div8; ++j)
2588 20456700 : transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
2589 : }
2590 : }
2591 30245100 : for (int32_t i = 0; i < buf_size_w_div8; i++) {
2592 17092400 : col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
2593 17091600 : round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
2594 : }
2595 :
2596 13152700 : if (txfm_size_col >= 16) {
2597 5983240 : for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
2598 3307320 : lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
2599 3307320 : output_r + 16 * i, stride_r,
2600 3307320 : output_w + 16 * i, stride_w,
2601 : ud_flip, txfm_size_row);
2602 : }
2603 : }
2604 10476700 : else if (txfm_size_col == 8)
2605 10479100 : lowbd_write_buffer_8xn_sse2(buf1, output_r, stride_r, output_w, stride_w,
2606 : ud_flip, txfm_size_row);
2607 13152700 : }
2608 :
2609 1090210 : static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
2610 : const int32_t *input,
2611 : uint8_t *output_r, int32_t stride_r,
2612 : uint8_t *output_w, int32_t stride_w,
2613 : TxType tx_type,
2614 : TxSize tx_size, int32_t eob) {
2615 1090210 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
2616 : int32_t eobx, eoby;
2617 1090210 : get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
2618 1090210 : const int32_t txw_idx = get_txw_idx(tx_size);
2619 1090200 : const int32_t txh_idx = get_txh_idx(tx_size);
2620 1090200 : const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
2621 1090200 : const int32_t txfm_size_col = tx_size_wide[tx_size];
2622 1090200 : const int32_t txfm_size_row = tx_size_high[tx_size];
2623 1090200 : const int32_t buf_size_w_div8 = (eobx + 8) >> 3;
2624 1090200 : const int32_t input_stride = AOMMIN(32, txfm_size_col);
2625 1090200 : const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2626 :
2627 1090200 : const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
2628 : ASSERT(fun_idx < 4);
2629 1090200 : const transform_1d_ssse3 col_txfm =
2630 1090200 : lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
2631 :
2632 1090200 : assert(col_txfm != NULL);
2633 :
2634 : int32_t ud_flip, lr_flip;
2635 1090200 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2636 2305750 : for (int32_t i = 0; i < buf_size_w_div8; i++) {
2637 : __m128i buf0[64];
2638 1215610 : iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
2639 : eoby + 1, txw_idx, rect_type);
2640 1215620 : col_txfm(buf0, buf0, cos_bit_col);
2641 1215630 : __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
2642 1215630 : int32_t k = ud_flip ? (txfm_size_row - 1) : 0;
2643 1215630 : const int32_t step = ud_flip ? -1 : 1;
2644 1215630 : uint8_t *out_r = output_r + 8 * i;
2645 1215630 : uint8_t *out_w = output_w + 8 * i;
2646 11963700 : for (int32_t j = 0; j < txfm_size_row; ++j, k += step) {
2647 10748100 : const __m128i v = _mm_loadl_epi64((__m128i const *)(out_r));
2648 : ASSERT(k >= 0);
2649 10748100 : __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
2650 10748100 : const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
2651 10748100 : _mm_storel_epi64((__m128i *)(out_w), u);
2652 10748100 : out_r += stride_r;
2653 10748100 : out_w += stride_w;
2654 : }
2655 : }
2656 1090140 : }
2657 :
2658 1229030 : static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
2659 : const int32_t *input,
2660 : uint8_t *output_r, int32_t stride_r,
2661 : uint8_t *output_w, int32_t stride_w,
2662 : TxType tx_type, TxSize tx_size, int32_t eob) {
2663 : __m128i buf1[64];
2664 : int32_t eobx, eoby;
2665 1229030 : get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
2666 1229030 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
2667 1229030 : const int32_t txw_idx = get_txw_idx(tx_size);
2668 1229030 : const int32_t txh_idx = get_txh_idx(tx_size);
2669 1229030 : const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
2670 1229030 : const int32_t txfm_size_col = tx_size_wide[tx_size];
2671 1229030 : const int32_t txfm_size_row = tx_size_high[tx_size];
2672 1229030 : const int32_t buf_size_w_div8 = txfm_size_col >> 3;
2673 1229030 : const int32_t buf_size_h_div8 = (eoby + 8) >> 3;
2674 1229030 : const int32_t input_stride = AOMMIN(32, txfm_size_col);
2675 1229030 : const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2676 :
2677 1229030 : const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
2678 1229030 : const transform_1d_ssse3 row_txfm =
2679 1229030 : lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
2680 :
2681 1229030 : assert(row_txfm != NULL);
2682 : int32_t ud_flip, lr_flip;
2683 1229030 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2684 2603450 : for (int32_t i = 0; i < buf_size_h_div8; i++) {
2685 : __m128i buf0[64];
2686 1374380 : const int32_t *input_row = input + i * input_stride * 8;
2687 2869460 : for (int32_t j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
2688 1495070 : __m128i *buf0_cur = buf0 + j * 8;
2689 1495070 : load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
2690 1494970 : transpose_16bit_8x8(buf0_cur, buf0_cur);
2691 : }
2692 1374390 : if (rect_type == 1 || rect_type == -1)
2693 411458 : round_shift_ssse3(buf0, buf0, input_stride); // rect special code
2694 1374390 : row_txfm(buf0, buf0, cos_bit_row);
2695 1374400 : round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
2696 1374400 : __m128i *_buf1 = buf1;
2697 1374400 : if (lr_flip) {
2698 451847 : for (int32_t j = 0; j < buf_size_w_div8; ++j) {
2699 : __m128i temp[8];
2700 239061 : flip_buf_sse2(buf0 + 8 * j, temp, 8);
2701 239060 : transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
2702 : }
2703 : }
2704 : else {
2705 2417660 : for (int32_t j = 0; j < buf_size_w_div8; ++j)
2706 1256040 : transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
2707 : }
2708 :
2709 2869520 : for (int32_t j = 0; j < buf_size_w_div8; ++j) {
2710 1495100 : iidentity_col_8xn_ssse3(
2711 1495100 : output_r + i * 8 * stride_r + j * 8, stride_r,
2712 1495100 : output_w + i * 8 * stride_w + j * 8, stride_w,
2713 1495100 : buf1 + j * 8, shift[1], 8, txh_idx);
2714 : }
2715 : }
2716 1229070 : }
2717 :
2718 : // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
// Routes one inverse 2D transform request to a helper chosen by tx_type.
//
//   input               coefficient buffer, forwarded unchanged.
//   output_r, stride_r  first 8-bit buffer and its stride, forwarded unchanged.
//   output_w, stride_w  second 8-bit buffer and its stride, forwarded unchanged.
//                       (presumably "_r" is read and "_w" is written -- confirm
//                       in the helpers; this function only passes them on)
//   tx_type             selects the helper (see the switch below).
//   tx_size, eob        forwarded to every helper except the IDTX one, which
//                       receives tx_size only.
//
// DCT_DCT and the default label issue the identical call, so the explicit
// DCT_DCT case does not change which helper runs.
2719 16289700 : static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
2720 : const int32_t *input,
2721 : uint8_t *output_r, int32_t stride_r,
2722 : uint8_t *output_w, int32_t stride_w,
2723 : TxType tx_type,
2724 : TxSize tx_size, int32_t eob) {
2725 16289700 : switch (tx_type) {
2726 8470760 : case DCT_DCT:
2727 8470760 : lowbd_inv_txfm2d_add_no_identity_ssse3(input,
2728 : output_r, stride_r, output_w, stride_w,
2729 : tx_type, tx_size, eob);
2730 8470410 : break;
// IDTX: the idtx helper takes neither tx_type nor eob.
2731 819278 : case IDTX:
2732 819278 : lowbd_inv_txfm2d_add_idtx_ssse3(input,
2733 : output_r, stride_r, output_w, stride_w, tx_size);
2734 819270 : break;
// The three V_* types share the h_identity helper.
2735 1090220 : case V_DCT:
2736 : case V_ADST:
2737 : case V_FLIPADST:
2738 1090220 : lowbd_inv_txfm2d_add_h_identity_ssse3(input,
2739 : output_r, stride_r, output_w, stride_w,
2740 : tx_type, tx_size, eob);
2741 1090200 : break;
// The three H_* types share the v_identity helper.
2742 1229040 : case H_DCT:
2743 : case H_ADST:
2744 : case H_FLIPADST:
2745 1229040 : lowbd_inv_txfm2d_add_v_identity_ssse3(input,
2746 : output_r, stride_r, output_w, stride_w,
2747 : tx_type, tx_size, eob);
2748 1229040 : break;
// Every tx_type not named above takes the same path as DCT_DCT.
2749 4680440 : default:
2750 4680440 : lowbd_inv_txfm2d_add_no_identity_ssse3(input,
2751 : output_r, stride_r, output_w, stride_w,
2752 : tx_type, tx_size, eob);
2753 4685240 : break;
2754 : }
2755 16294200 : }
2756 :
// Inverse 2D transform specialised for TX_4X8.
//
//   input               32-bit coefficients, read via
//                       load_buffer_32bit_to_16bit_w4.
//   output_r, stride_r  passed to lowbd_write_buffer_4xn_sse2.
//   output_w, stride_w  passed to lowbd_write_buffer_4xn_sse2.
//   tx_type             selects the row/column 1D kernels (through
//                       hitx_1d_tab / vitx_1d_tab) and the flip configuration.
//   tx_size_, eob       ignored (cast to void); the size is fixed to TX_4X8.
//
// Sequence: load -> transpose -> round_shift_ssse3 -> row kernel ->
// (optional left/right flip) -> transpose -> column kernel ->
// round shift by shift[1] -> write.
2757 2978360 : static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
2758 : uint8_t *output_r, int32_t stride_r,
2759 : uint8_t *output_w, int32_t stride_w,
2760 : TxType tx_type, TxSize tx_size_,
2761 : int32_t eob) {
2762 : (void)tx_size_;
2763 : (void)eob;
2764 : __m128i buf[8];
2765 2978360 : const TxSize tx_size = TX_4X8;
2766 2978360 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
2767 2978360 : const int32_t txw_idx = get_txw_idx(tx_size);
2768 2978330 : const int32_t txh_idx = get_txh_idx(tx_size);
2769 2978350 : const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
2770 2978350 : const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
2771 2978350 : const int32_t txfm_size_col = tx_size_wide[tx_size];
2772 2978350 : const int32_t txfm_size_row = tx_size_high[tx_size];
2773 :
// The row kernel comes from the "w8" table and the column kernel from the
// "w4" table.
2774 2978350 : const transform_1d_ssse3 row_txfm =
2775 2978350 : lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
2776 2978350 : const transform_1d_ssse3 col_txfm =
2777 2978350 : lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
2778 :
2779 : int32_t ud_flip, lr_flip;
2780 2978350 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2781 2978340 : load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
2782 2978290 : transpose_16bit_4x8(buf, buf);
2783 2978430 : round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
2784 2978400 : row_txfm(buf, buf, cos_bit_row);
// shift[0] is not applied for this size; the disabled call below records why.
2785 : // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
// With lr_flip set, the rows are reversed into a scratch array first and the
// transpose reads from that scratch copy; otherwise it runs in place.
2786 2978540 : if (lr_flip) {
2787 : __m128i temp[4];
2788 59145 : flip_buf_sse2(buf, temp, txfm_size_col);
2789 59145 : transpose_16bit_8x4(temp, buf);
2790 : }
2791 : else
2792 2919390 : transpose_16bit_8x4(buf, buf);
2793 2978500 : col_txfm(buf, buf, cos_bit_col);
2794 2978480 : round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
// ud_flip is not handled here; it is handed to the write helper.
2795 2978350 : lowbd_write_buffer_4xn_sse2(buf, output_r, stride_r, output_w, stride_w,
2796 : ud_flip, txfm_size_row);
2797 2978430 : }
2798 :
// Inverse 2D transform specialised for TX_8X4.
//
//   input               32-bit coefficients, read via load_buffer_32bit_to_16bit.
//   output_r, stride_r  passed to lowbd_write_buffer_8xn_sse2.
//   output_w, stride_w  passed to lowbd_write_buffer_8xn_sse2.
//   tx_type             selects the row/column 1D kernels (through
//                       hitx_1d_tab / vitx_1d_tab) and the flip configuration.
//   tx_size_, eob       ignored (cast to void); the size is fixed to TX_8X4.
//
// Same sequence as the 4x8 variant with the transposes and kernel tables
// swapped: load -> transpose 8x4 -> round_shift_ssse3 -> row kernel ->
// (optional left/right flip) -> transpose 4x8 -> column kernel ->
// round shift by shift[1] -> write.
2799 2903780 : static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
2800 : uint8_t *output_r, int32_t stride_r,
2801 : uint8_t *output_w, int32_t stride_w,
2802 : TxType tx_type, TxSize tx_size_,
2803 : int32_t eob) {
2804 : (void)tx_size_;
2805 : (void)eob;
2806 : __m128i buf[8];
2807 2903780 : const TxSize tx_size = TX_8X4;
2808 2903780 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
2809 2903780 : const int32_t txw_idx = get_txw_idx(tx_size);
2810 2903750 : const int32_t txh_idx = get_txh_idx(tx_size);
2811 2903760 : const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
2812 2903760 : const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
2813 2903760 : const int32_t txfm_size_col = tx_size_wide[tx_size];
2814 2903760 : const int32_t txfm_size_row = tx_size_high[tx_size];
2815 :
// The row kernel comes from the "w4" table and the column kernel from the
// "w8" table (the reverse of the 4x8 variant).
2816 2903760 : const transform_1d_ssse3 row_txfm =
2817 2903760 : lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
2818 2903760 : const transform_1d_ssse3 col_txfm =
2819 2903760 : lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
2820 :
2821 : int32_t ud_flip, lr_flip;
2822 2903760 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2823 2903750 : load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
2824 2903530 : transpose_16bit_8x4(buf, buf);
2825 2903750 : round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
2826 2903700 : row_txfm(buf, buf, cos_bit_row);
// shift[0] is not applied for this size; the disabled call below records why.
2827 : // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
// With lr_flip set, txfm_size_col entries are reversed into a scratch array
// and the transpose reads from it; otherwise the transpose runs in place.
2828 2903920 : if (lr_flip) {
2829 : __m128i temp[8];
2830 60587 : flip_buf_sse2(buf, temp, txfm_size_col);
2831 60587 : transpose_16bit_4x8(temp, buf);
2832 : }
2833 : else
2834 2843330 : transpose_16bit_4x8(buf, buf);
2835 2903910 : col_txfm(buf, buf, cos_bit_col);
2836 2903880 : round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
// ud_flip is not handled here; it is handed to the write helper.
2837 2903830 : lowbd_write_buffer_8xn_sse2(buf, output_r, stride_r, output_w, stride_w,
2838 : ud_flip, txfm_size_row);
2839 2903770 : }
2840 :
// Inverse 2D transform specialised for TX_4X16.
//
//   input               32-bit coefficients, read via
//                       load_buffer_32bit_to_16bit_w4 in two 8-row chunks.
//   output_r, stride_r  passed to lowbd_write_buffer_4xn_sse2.
//   output_w, stride_w  passed to lowbd_write_buffer_4xn_sse2.
//   tx_type             selects the row/column 1D kernels (through
//                       hitx_1d_tab / vitx_1d_tab) and the flip configuration.
//   tx_size_, eob       ignored (cast to void); the size is fixed to TX_4X16.
//
// The row pass is done per 8-row chunk inside the loop; the column pass then
// runs once over the whole 16-entry buffer. Unlike the 4x8/8x4 variants there
// is no round_shift_ssse3 call, and shift[0] is applied after the row kernel.
2841 1876100 : static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
2842 : uint8_t *output_r, int32_t stride_r,
2843 : uint8_t *output_w, int32_t stride_w,
2844 : TxType tx_type, TxSize tx_size_,
2845 : int32_t eob) {
2846 : (void)tx_size_;
2847 : (void)eob;
2848 : __m128i buf[16];
2849 1876100 : const TxSize tx_size = TX_4X16;
2850 1876100 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
2851 1876100 : const int32_t txw_idx = get_txw_idx(tx_size);
2852 1876100 : const int32_t txh_idx = get_txh_idx(tx_size);
2853 1876110 : const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
2854 1876110 : const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
2855 1876110 : const int32_t txfm_size_col = tx_size_wide[tx_size];
2856 1876110 : const int32_t txfm_size_row = tx_size_high[tx_size];
2857 :
// The row kernel comes from the "w8" table and the column kernel from the
// "w4" table.
2858 1876110 : const transform_1d_ssse3 row_txfm =
2859 1876110 : lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
2860 1876110 : const transform_1d_ssse3 col_txfm =
2861 1876110 : lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
2862 :
2863 : int32_t ud_flip, lr_flip;
2864 1876110 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2865 :
// Two passes of row_one_loop (8) rows each. Chunk i starts at input offset
// i * txfm_size_col * row_one_loop and lands in buf[i * row_one_loop ...].
// NOTE(review): the loop bound 2 is presumably txfm_size_row / row_one_loop
// -- confirm against tx_size_high[TX_4X16].
2866 1875890 : const int32_t row_one_loop = 8;
2867 5628060 : for (int32_t i = 0; i < 2; ++i) {
2868 3751900 : const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
2869 3751900 : __m128i *buf_cur = buf + i * row_one_loop;
2870 3751900 : load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
2871 : row_one_loop);
2872 3751670 : transpose_16bit_4x8(buf_cur, buf_cur);
2873 3752050 : row_txfm(buf_cur, buf_cur, cos_bit_row);
2874 3752160 : round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
// With lr_flip set, txfm_size_col entries of this chunk are reversed into a
// scratch array and transposed back into the chunk; otherwise in place.
2875 3751960 : if (lr_flip) {
2876 : __m128i temp[8];
2877 110766 : flip_buf_sse2(buf_cur, temp, txfm_size_col);
2878 110766 : transpose_16bit_8x4(temp, buf_cur);
2879 : }
2880 : else
2881 3641200 : transpose_16bit_8x4(buf_cur, buf_cur);
2882 : }
// Column pass over the whole buffer, then the final shift and the write.
2883 1876160 : col_txfm(buf, buf, cos_bit_col);
2884 1876140 : round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
// ud_flip is not handled here; it is handed to the write helper.
2885 1876060 : lowbd_write_buffer_4xn_sse2(buf, output_r, stride_r, output_w, stride_w,
2886 : ud_flip, txfm_size_row);
2887 1876120 : }
2888 :
// Inverse 2D transform specialised for TX_16X4.
//
//   input               32-bit coefficients, read via load_buffer_32bit_to_16bit
//                       in column groups of 8.
//   output_r, stride_r  passed to lowbd_write_buffer_8xn_sse2 (twice, the
//                       second time advanced by 8 bytes).
//   output_w, stride_w  passed to lowbd_write_buffer_8xn_sse2 (twice, the
//                       second time advanced by 8 bytes).
//   tx_type             selects the row/column 1D kernels (through
//                       hitx_1d_tab / vitx_1d_tab) and the flip configuration.
//   tx_size_, eob       ignored (cast to void); the size is fixed to TX_16X4.
//
// Sequence: load + transpose each column group -> one row kernel call over
// the whole buffer -> shift[0] -> (optional left/right flip) -> transpose
// each half -> column kernel + shift[1] per group -> write each half.
2889 1945300 : static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
2890 : uint8_t *output_r, int32_t stride_r,
2891 : uint8_t *output_w, int32_t stride_w,
2892 : TxType tx_type, TxSize tx_size_, int32_t eob) {
2893 : (void)tx_size_;
2894 : (void)eob;
2895 : __m128i buf[16];
2896 1945300 : const TxSize tx_size = TX_16X4;
2897 1945300 : const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
2898 1945300 : const int32_t txw_idx = get_txw_idx(tx_size);
2899 1945270 : const int32_t txh_idx = get_txh_idx(tx_size);
2900 1945280 : const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
2901 1945280 : const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
2902 1945280 : const int32_t txfm_size_col = tx_size_wide[tx_size];
2903 1945280 : const int32_t txfm_size_row = tx_size_high[tx_size];
// Number of 8-wide column groups.
2904 1945280 : const int32_t buf_size_w_div8 = txfm_size_col >> 3;
2905 :
// The row kernel comes from the "w4" table and the column kernel from the
// "w8" table.
2906 1945280 : const transform_1d_ssse3 row_txfm =
2907 1945280 : lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
2908 1945280 : const transform_1d_ssse3 col_txfm =
2909 1945280 : lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
2910 :
2911 : int32_t ud_flip, lr_flip;
2912 1945280 : get_flip_cfg(tx_type, &ud_flip, &lr_flip);
// Group i reads from input + i * row_one_loop with stride txfm_size_col and
// fills buf[i * row_one_loop ...].
2913 1945260 : const int32_t row_one_loop = 8;
2914 5835590 : for (int32_t i = 0; i < buf_size_w_div8; ++i) {
2915 3890330 : const int32_t *input_cur = input + i * row_one_loop;
2916 3890330 : __m128i *buf_cur = buf + i * row_one_loop;
2917 3890330 : load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
2918 : txfm_size_row);
2919 3890050 : transpose_16bit_8x4(buf_cur, buf_cur);
2920 : }
2921 1945260 : row_txfm(buf, buf, cos_bit_row);
2922 1945320 : round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
// With lr_flip set, all 16 entries are reversed into a scratch array and each
// half is transposed back into buf; otherwise both halves transpose in place.
// NOTE(review): the literals 16 and 8 here presumably equal txfm_size_col and
// row_one_loop -- confirm; the else branch uses row_one_loop for the same offset.
2923 1945260 : if (lr_flip) {
2924 : __m128i temp[16];
2925 58850 : flip_buf_sse2(buf, temp, 16);
2926 58850 : transpose_16bit_4x8(temp, buf);
2927 58850 : transpose_16bit_4x8(temp + 8, buf + 8);
2928 : }
2929 : else {
2930 1886410 : transpose_16bit_4x8(buf, buf);
2931 1886470 : transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
2932 : }
// Column kernel and final shift, one 8-entry group at a time.
2933 5835710 : for (int32_t i = 0; i < buf_size_w_div8; i++) {
2934 3890410 : col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
2935 3890500 : round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
2936 : }
// Two writes of 4 rows each: the first from buf at the base pointers, the
// second from buf + 8 at both output pointers advanced by 8.
// NOTE(review): the row count 4 is a literal, presumably txfm_size_row -- confirm.
2937 1945300 : lowbd_write_buffer_8xn_sse2(buf, output_r, stride_r, output_w, stride_w, ud_flip, 4);
2938 1945260 : lowbd_write_buffer_8xn_sse2(buf + 8, output_r + 8, stride_r, output_w + 8, stride_w, ud_flip, 4);
2939 1945270 : }
2940 :
// Entry point that selects an inverse 2D transform routine by tx_size.
//
//   input               coefficient buffer, forwarded unchanged.
//   output_r, stride_r  first 8-bit buffer and its stride, forwarded unchanged.
//   output_w, stride_w  second 8-bit buffer and its stride, forwarded unchanged.
//   tx_type, tx_size,
//   eob                 forwarded unchanged to the selected routine.
//
// TX_4X4, TX_4X8, TX_8X4, TX_4X16 and TX_16X4 each have a dedicated routine;
// every other tx_size goes through lowbd_inv_txfm2d_add_universe_ssse3.
2941 38080000 : void eb_av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input,
2942 : uint8_t *output_r, int32_t stride_r,
2943 : uint8_t *output_w, int32_t stride_w,
2944 : TxType tx_type,
2945 : TxSize tx_size, int32_t eob) {
2946 38080000 : switch (tx_size) {
2947 12118700 : case TX_4X4:
2948 12118700 : lowbd_inv_txfm2d_add_4x4_ssse3(input,
2949 : output_r, stride_r, output_w, stride_w,
2950 : tx_type, tx_size, eob);
2951 12120200 : break;
2952 2978380 : case TX_4X8:
2953 2978380 : lowbd_inv_txfm2d_add_4x8_ssse3(input,
2954 : output_r, stride_r, output_w, stride_w,
2955 : tx_type, tx_size, eob);
2956 2978430 : break;
2957 2903810 : case TX_8X4:
2958 2903810 : lowbd_inv_txfm2d_add_8x4_ssse3(input,
2959 : output_r, stride_r, output_w, stride_w,
2960 : tx_type, tx_size, eob);
2961 2903770 : break;
2962 1876120 : case TX_4X16:
2963 1876120 : lowbd_inv_txfm2d_add_4x16_ssse3(input,
2964 : output_r, stride_r, output_w, stride_w,
2965 : tx_type, tx_size, eob);
2966 1876120 : break;
2967 1945300 : case TX_16X4:
2968 1945300 : lowbd_inv_txfm2d_add_16x4_ssse3(input,
2969 : output_r, stride_r, output_w, stride_w,
2970 : tx_type, tx_size, eob);
2971 1945260 : break;
// All remaining sizes share the generic path, which dispatches on tx_type.
2972 16257800 : default:
2973 16257800 : lowbd_inv_txfm2d_add_universe_ssse3(input,
2974 : output_r, stride_r, output_w, stride_w,
2975 : tx_type, tx_size, eob);
2976 16288400 : break;
2977 : }
2978 38112200 : }
2979 :
// Inverse transform wrapper driven by a TxfmParam.
//
//   dqcoeff             coefficient buffer, forwarded unchanged.
//   dst_r, stride_r     first 8-bit buffer and its stride, forwarded unchanged.
//   dst_w, stride_w     second 8-bit buffer and its stride, forwarded unchanged.
//   txfm_param          supplies tx_type, tx_size, eob and the lossless flag.
//
// When txfm_param->lossless is zero the SSSE3 low-bitdepth routine is used
// with tx_type, tx_size and eob taken from txfm_param. Otherwise the call is
// delegated to eb_av1_inv_txfm_add_c with txfm_param passed through.
2980 0 : void eb_av1_inv_txfm_add_ssse3(const TranLow *dqcoeff,
2981 : uint8_t *dst_r, int32_t stride_r,
2982 : uint8_t *dst_w, int32_t stride_w,
2983 : const TxfmParam *txfm_param) {
2984 0 : const TxType tx_type = txfm_param->tx_type;
2985 0 : if (!txfm_param->lossless) {
2986 0 : eb_av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff,
2987 : dst_r, stride_r, dst_w, stride_w,
2988 0 : tx_type, txfm_param->tx_size, txfm_param->eob);
2989 : }
2990 : else {
// Lossless blocks are not handled by the SSSE3 path.
2991 0 : eb_av1_inv_txfm_add_c(dqcoeff, dst_r, stride_r, dst_w,
2992 : stride_w, txfm_param);
2993 : }
2994 0 : }
|