LCOV - code coverage report
Current view: top level - ASM_SSSE3 - av1_inv_txfm_ssse3.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 1568 2181 71.9 %
Date: 2019-11-25 17:38:06 Functions: 61 75 81.3 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include "EbDefinitions.h"
      13             : #include "aom_dsp_rtcd.h"
      14             : #include <tmmintrin.h>
      15             : #include "EbTransforms.h"
      16             : #include "av1_inv_txfm_ssse3.h"
      17             : #include "av1_txfm_sse2.h"
      18             : #include "transpose_sse2.h"
      19             : 
      20             : // TODO(binpengsmail@gmail.com): replace some for loop with do {} while
      21             : 
      22             :     // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
      23             : static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
      24             :                                           4 * 5793 };
      25             : 
      26     9740620 : static void idct4_new_sse2(const __m128i *input, __m128i *output,
      27             :     int8_t cos_bit) {
      28             :     (void)cos_bit;
      29     9740620 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
      30     9740140 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
      31             : 
      32     9740140 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
      33     9740140 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
      34     9740140 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
      35     9740140 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
      36             : 
      37             :     // stage 1
      38             :     __m128i x[4];
      39     9740140 :     x[0] = input[0];
      40     9740140 :     x[1] = input[2];
      41     9740140 :     x[2] = input[1];
      42     9740140 :     x[3] = input[3];
      43             : 
      44             :     // stage 2
      45   155842000 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
      46   155842000 :     btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
      47             : 
      48             :     // stage 3
      49    19480300 :     btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
      50    19480300 :     btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
      51     9740140 : }
      52             : 
      53    12536600 : static void idct4_w4_new_sse2(const __m128i *input, __m128i *output,
      54             :     int8_t cos_bit) {
      55             :     (void)cos_bit;
      56    12536600 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
      57    12536600 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
      58             : 
      59    12536600 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
      60    12536600 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
      61    12536600 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
      62    12536600 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
      63             : 
      64             :     // stage 1
      65             :     __m128i x[4];
      66    12536600 :     x[0] = input[0];
      67    12536600 :     x[1] = input[2];
      68    12536600 :     x[2] = input[1];
      69    12536600 :     x[3] = input[3];
      70             : 
      71             :     // stage 2
      72   112829000 :     btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
      73   112829000 :     btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
      74             : 
      75             :     // stage 3
      76    25073200 :     btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
      77    25073200 :     btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
      78    12536600 : }
      79             : 
      80        2550 : static void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
      81             :     int8_t cos_bit) {
      82             :     (void)cos_bit;
      83        2550 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
      84             : 
      85             :     // stage 1
      86             :     __m128i x[2];
      87        2550 :     x[0] = input[0];
      88             : 
      89             :     // stage 2
      90             :     // stage 3
      91       10200 :     btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
      92             : 
      93             :     // stage 4
      94             :     // stage 5
      95        2550 :     output[0] = x[0];
      96        2550 :     output[7] = x[0];
      97        2550 :     output[1] = x[1];
      98        2550 :     output[6] = x[1];
      99        2550 :     output[2] = x[1];
     100        2550 :     output[5] = x[1];
     101        2550 :     output[3] = x[0];
     102        2550 :     output[4] = x[0];
     103        2550 : }
     104             : 
     105    24031100 : static void idct8_new_sse2(const __m128i *input, __m128i *output,
     106             :     int8_t cos_bit) {
     107             :     (void)cos_bit;
     108    24031100 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
     109    24029500 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
     110             : 
     111    24029500 :     const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
     112    24029500 :     const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
     113    24029500 :     const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
     114    24029500 :     const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
     115    24029500 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     116    24029500 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
     117    24029500 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
     118    24029500 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
     119    24029500 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
     120             : 
     121             :     // stage 1
     122             :     __m128i x[8];
     123    24029500 :     x[0] = input[0];
     124    24029500 :     x[1] = input[4];
     125    24029500 :     x[2] = input[2];
     126    24029500 :     x[3] = input[6];
     127    24029500 :     x[4] = input[1];
     128    24029500 :     x[5] = input[5];
     129    24029500 :     x[6] = input[3];
     130    24029500 :     x[7] = input[7];
     131             : 
     132             :     // stage 2
     133   384472000 :     btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
     134   384472000 :     btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
     135             : 
     136             :     // stage 3
     137   384472000 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
     138   384472000 :     btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
     139    48059000 :     btf_16_adds_subs_sse2(x[4], x[5]);
     140    48059000 :     btf_16_subs_adds_sse2(x[7], x[6]);
     141             : 
     142             :     // stage 4
     143    48059000 :     btf_16_adds_subs_sse2(x[0], x[3]);
     144    48059000 :     btf_16_adds_subs_sse2(x[1], x[2]);
     145   384472000 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
     146             : 
     147             :     // stage 5
     148    48059000 :     btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
     149    48059000 :     btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
     150    48059000 :     btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
     151    48059000 :     btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
     152    24029500 : }
     153             : 
     154     4168520 : static void idct8_w4_new_sse2(const __m128i *input, __m128i *output,
     155             :     int8_t cos_bit) {
     156             :     (void)cos_bit;
     157     4168520 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
     158     4168440 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
     159             : 
     160     4168440 :     const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
     161     4168440 :     const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
     162     4168440 :     const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
     163     4168440 :     const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
     164     4168440 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     165     4168440 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
     166     4168440 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
     167     4168440 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
     168     4168440 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
     169             : 
     170             :     // stage 1
     171             :     __m128i x[8];
     172     4168440 :     x[0] = input[0];
     173     4168440 :     x[1] = input[4];
     174     4168440 :     x[2] = input[2];
     175     4168440 :     x[3] = input[6];
     176     4168440 :     x[4] = input[1];
     177     4168440 :     x[5] = input[5];
     178     4168440 :     x[6] = input[3];
     179     4168440 :     x[7] = input[7];
     180             : 
     181             :     // stage 2
     182    37515900 :     btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
     183    37515900 :     btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
     184             : 
     185             :     // stage 3
     186    37515900 :     btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
     187    37515900 :     btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
     188     8336880 :     btf_16_adds_subs_sse2(x[4], x[5]);
     189     8336880 :     btf_16_subs_adds_sse2(x[7], x[6]);
     190             : 
     191             :     // stage 4
     192     8336880 :     btf_16_adds_subs_sse2(x[0], x[3]);
     193     8336880 :     btf_16_adds_subs_sse2(x[1], x[2]);
     194    37515900 :     btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
     195             : 
     196             :     // stage 5
     197     8336880 :     btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
     198     8336880 :     btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
     199     8336880 :     btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
     200     8336880 :     btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
     201     4168440 : }
     202             : 
     203     3768230 : static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
     204             :     const __m128i __rounding,
     205             :     int8_t cos_bit) {
     206     3768230 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
     207     3768230 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     208     7536470 :     btf_16_adds_subs_sse2(x[0], x[3]);
     209     7536470 :     btf_16_adds_subs_sse2(x[1], x[2]);
     210    60291700 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
     211     7536470 :     btf_16_adds_subs_sse2(x[8], x[11]);
     212     7536470 :     btf_16_adds_subs_sse2(x[9], x[10]);
     213     7536470 :     btf_16_subs_adds_sse2(x[15], x[12]);
     214     7536470 :     btf_16_subs_adds_sse2(x[14], x[13]);
     215     3768230 : }
     216             : 
     217     3768130 : static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
     218             :     const __m128i __rounding,
     219             :     int8_t cos_bit) {
     220     3768130 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
     221     3768130 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     222     7536260 :     btf_16_adds_subs_sse2(x[0], x[7]);
     223     7536260 :     btf_16_adds_subs_sse2(x[1], x[6]);
     224     7536260 :     btf_16_adds_subs_sse2(x[2], x[5]);
     225     7536260 :     btf_16_adds_subs_sse2(x[3], x[4]);
     226    60290100 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
     227    60290100 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
     228     3768130 : }
     229             : 
     230     6508300 : static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
     231    13016600 :     btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
     232    13016600 :     btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
     233    13016600 :     btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
     234    13016600 :     btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
     235    13016600 :     btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
     236    13016600 :     btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
     237    13016600 :     btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
     238    13016600 :     btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
     239     6508300 : }
     240             : 
     241          62 : static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
     242             :     int8_t cos_bit) {
     243             :     (void)cos_bit;
     244          62 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
     245             : 
     246             :     // stage 1
     247             :     __m128i x[2];
     248          62 :     x[0] = input[0];
     249             : 
     250             :     // stage 2
     251             :     // stage 3
     252             :     // stage 4
     253         248 :     btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
     254             : 
     255             :     // stage 5
     256             :     // stage 6
     257             :     // stage 7
     258          62 :     output[0] = x[0];
     259          62 :     output[15] = x[0];
     260          62 :     output[1] = x[1];
     261          62 :     output[14] = x[1];
     262          62 :     output[2] = x[1];
     263          62 :     output[13] = x[1];
     264          62 :     output[3] = x[0];
     265          62 :     output[12] = x[0];
     266          62 :     output[4] = x[0];
     267          62 :     output[11] = x[0];
     268          62 :     output[5] = x[1];
     269          62 :     output[10] = x[1];
     270          62 :     output[6] = x[1];
     271          62 :     output[9] = x[1];
     272          62 :     output[7] = x[0];
     273          62 :     output[8] = x[0];
     274          62 : }
     275             : 
     276         152 : static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
     277             :     int8_t cos_bit) {
     278             :     (void)cos_bit;
     279         152 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
     280         152 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
     281         152 :     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
     282         152 :     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
     283         152 :     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
     284             : 
     285             :     // stage 1
     286             :     __m128i x[16];
     287         152 :     x[0] = input[0];
     288         152 :     x[2] = input[4];
     289         152 :     x[4] = input[2];
     290         152 :     x[6] = input[6];
     291         152 :     x[8] = input[1];
     292         152 :     x[10] = input[5];
     293         152 :     x[12] = input[3];
     294         152 :     x[14] = input[7];
     295             : 
     296             :     // stage 2
     297         608 :     btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
     298         608 :     btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
     299         608 :     btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
     300         608 :     btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
     301             : 
     302             :     // stage 3
     303         608 :     btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
     304         608 :     btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
     305         304 :     btf_16_adds_subs_sse2(x[8], x[9]);
     306         304 :     btf_16_subs_adds_sse2(x[11], x[10]);
     307         304 :     btf_16_adds_subs_sse2(x[12], x[13]);
     308         304 :     btf_16_subs_adds_sse2(x[15], x[14]);
     309             : 
     310             :     // stage 4
     311         608 :     btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
     312         608 :     btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
     313         304 :     btf_16_adds_subs_sse2(x[4], x[5]);
     314         304 :     btf_16_subs_adds_sse2(x[7], x[6]);
     315        2432 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
     316        2432 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
     317             : 
     318         152 :     idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
     319         152 :     idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
     320         152 :     idct16_stage7_sse2(output, x);
     321         152 : }
     322             : 
     323     3767880 : static void idct16_new_sse2(const __m128i *input, __m128i *output,
     324             :     int8_t cos_bit) {
     325             :     (void)cos_bit;
     326     3767880 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
     327     3767860 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
     328             : 
     329     3767860 :     const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
     330     3767860 :     const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
     331     3767860 :     const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
     332     3767860 :     const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
     333     3767860 :     const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
     334     3767860 :     const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
     335     3767860 :     const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
     336     3767860 :     const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
     337     3767860 :     const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
     338     3767860 :     const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
     339     3767860 :     const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
     340     3767860 :     const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
     341     3767860 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     342     3767860 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
     343     3767860 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
     344     3767860 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
     345     3767860 :     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
     346     3767860 :     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
     347     3767860 :     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
     348             : 
     349             :     // stage 1
     350             :     __m128i x[16];
     351     3767860 :     x[0] = input[0];
     352     3767860 :     x[1] = input[8];
     353     3767860 :     x[2] = input[4];
     354     3767860 :     x[3] = input[12];
     355     3767860 :     x[4] = input[2];
     356     3767860 :     x[5] = input[10];
     357     3767860 :     x[6] = input[6];
     358     3767860 :     x[7] = input[14];
     359     3767860 :     x[8] = input[1];
     360     3767860 :     x[9] = input[9];
     361     3767860 :     x[10] = input[5];
     362     3767860 :     x[11] = input[13];
     363     3767860 :     x[12] = input[3];
     364     3767860 :     x[13] = input[11];
     365     3767860 :     x[14] = input[7];
     366     3767860 :     x[15] = input[15];
     367             : 
     368             :     // stage 2
     369    60285700 :     btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
     370    60285700 :     btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
     371    60285700 :     btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
     372    60285700 :     btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
     373             : 
     374             :     // stage 3
     375    60285700 :     btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
     376    60285700 :     btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
     377     7535720 :     btf_16_adds_subs_sse2(x[8], x[9]);
     378     7535720 :     btf_16_subs_adds_sse2(x[11], x[10]);
     379     7535720 :     btf_16_adds_subs_sse2(x[12], x[13]);
     380     7535720 :     btf_16_subs_adds_sse2(x[15], x[14]);
     381             : 
     382             :     // stage 4
     383    60285700 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
     384    60285700 :     btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
     385     7535720 :     btf_16_adds_subs_sse2(x[4], x[5]);
     386     7535720 :     btf_16_subs_adds_sse2(x[7], x[6]);
     387    60285700 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
     388    60285700 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
     389             : 
     390             :     // stage 5~7
     391     3767860 :     idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
     392     3768020 :     idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
     393     3768020 :     idct16_stage7_sse2(output, x);
     394     3768110 : }
     395             : 
     396     2740660 : static void idct16_w4_new_sse2(const __m128i *input, __m128i *output,
     397             :     int8_t cos_bit) {
     398             :     (void)cos_bit;
     399     2740660 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
     400     2740640 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
     401             : 
     402     2740640 :     const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
     403     2740640 :     const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
     404     2740640 :     const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
     405     2740640 :     const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
     406     2740640 :     const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
     407     2740640 :     const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
     408     2740640 :     const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
     409     2740640 :     const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
     410     2740640 :     const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
     411     2740640 :     const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
     412     2740640 :     const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
     413     2740640 :     const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
     414     2740640 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     415     2740640 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
     416     2740640 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
     417     2740640 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
     418     2740640 :     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
     419     2740640 :     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
     420     2740640 :     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
     421     2740640 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
     422             : 
     423             :     // stage 1
     424             :     __m128i x[16];
     425     2740640 :     x[0] = input[0];
     426     2740640 :     x[1] = input[8];
     427     2740640 :     x[2] = input[4];
     428     2740640 :     x[3] = input[12];
     429     2740640 :     x[4] = input[2];
     430     2740640 :     x[5] = input[10];
     431     2740640 :     x[6] = input[6];
     432     2740640 :     x[7] = input[14];
     433     2740640 :     x[8] = input[1];
     434     2740640 :     x[9] = input[9];
     435     2740640 :     x[10] = input[5];
     436     2740640 :     x[11] = input[13];
     437     2740640 :     x[12] = input[3];
     438     2740640 :     x[13] = input[11];
     439     2740640 :     x[14] = input[7];
     440     2740640 :     x[15] = input[15];
     441             : 
     442             :     // stage 2
     443    24665800 :     btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
     444    24665800 :     btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
     445    24665800 :     btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
     446    24665800 :     btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
     447             : 
     448             :     // stage 3
     449    24665800 :     btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
     450    24665800 :     btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
     451     5481280 :     btf_16_adds_subs_sse2(x[8], x[9]);
     452     5481280 :     btf_16_subs_adds_sse2(x[11], x[10]);
     453     5481280 :     btf_16_adds_subs_sse2(x[12], x[13]);
     454     5481280 :     btf_16_subs_adds_sse2(x[15], x[14]);
     455             : 
     456             :     // stage 4
     457    24665800 :     btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
     458    24665800 :     btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
     459     5481280 :     btf_16_adds_subs_sse2(x[4], x[5]);
     460     5481280 :     btf_16_subs_adds_sse2(x[7], x[6]);
     461    24665800 :     btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
     462    24665800 :     btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
     463             : 
     464             :     // stage 5
     465     5481280 :     btf_16_adds_subs_sse2(x[0], x[3]);
     466     5481280 :     btf_16_adds_subs_sse2(x[1], x[2]);
     467    24665800 :     btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
     468     5481280 :     btf_16_adds_subs_sse2(x[8], x[11]);
     469     5481280 :     btf_16_adds_subs_sse2(x[9], x[10]);
     470     5481280 :     btf_16_subs_adds_sse2(x[15], x[12]);
     471     5481280 :     btf_16_subs_adds_sse2(x[14], x[13]);
     472             : 
     473             :     // stage 6
     474     5481280 :     btf_16_adds_subs_sse2(x[0], x[7]);
     475     5481280 :     btf_16_adds_subs_sse2(x[1], x[6]);
     476     5481280 :     btf_16_adds_subs_sse2(x[2], x[5]);
     477     5481280 :     btf_16_adds_subs_sse2(x[3], x[4]);
     478    24665800 :     btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
     479    24665800 :     btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
     480             : 
     481             :     // stage 7
     482     2740640 :     idct16_stage7_sse2(output, x);
     483     2740760 : }
     484             : 
     485     1278550 : static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
     486     2557100 :     btf_16_adds_subs_sse2(x[16], x[17]);
     487     2557100 :     btf_16_subs_adds_sse2(x[19], x[18]);
     488     2557100 :     btf_16_adds_subs_sse2(x[20], x[21]);
     489     2557100 :     btf_16_subs_adds_sse2(x[23], x[22]);
     490     2557100 :     btf_16_adds_subs_sse2(x[24], x[25]);
     491     2557100 :     btf_16_subs_adds_sse2(x[27], x[26]);
     492     2557100 :     btf_16_adds_subs_sse2(x[28], x[29]);
     493     2557100 :     btf_16_subs_adds_sse2(x[31], x[30]);
     494     1278550 : }
     495             : 
     496     1278580 : static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
     497             :     const __m128i __rounding,
     498             :     int8_t cos_bit) {
     499     1278580 :     const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
     500     1278580 :     const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
     501     1278580 :     const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
     502     1278580 :     const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
     503     1278580 :     const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
     504     1278580 :     const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
     505    20457300 :     btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
     506    20457300 :     btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
     507    20457300 :     btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
     508    20457300 :     btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
     509     1278580 : }
     510             : // idct32 stage 5 for x[8..31]: btf_16_sse2 on (9,14) and (10,13), then add/sub pairs inside x[16..31].
     511     1278590 : static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
     512             :     const __m128i __rounding,  // not named below; presumably read by btf_16_sse2 -- TODO confirm
     513             :     int8_t cos_bit) {  // likewise not named below -- TODO confirm the macro uses it
     514     1278590 :     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
     515     1278590 :     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
     516     1278590 :     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
     517    20457400 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
     518    20457400 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
     519     2557180 :     btf_16_adds_subs_sse2(x[16], x[19]);  // x[16..19]
     520     2557180 :     btf_16_adds_subs_sse2(x[17], x[18]);
     521     2557180 :     btf_16_subs_adds_sse2(x[23], x[20]);  // x[20..23], higher index passed first
     522     2557180 :     btf_16_subs_adds_sse2(x[22], x[21]);
     523     2557180 :     btf_16_adds_subs_sse2(x[24], x[27]);  // x[24..27]
     524     2557180 :     btf_16_adds_subs_sse2(x[25], x[26]);
     525     2557180 :     btf_16_subs_adds_sse2(x[31], x[28]);  // x[28..31], higher index passed first
     526     2557180 :     btf_16_subs_adds_sse2(x[30], x[29]);
     527     1278590 : }
     528             : // idct32 stage 6 for x[4..31]: btf_16_sse2 on (5,6), add/sub pairs in x[8..15], btf_16_sse2 on x[18..21] vs x[26..29].
     529     1278590 : static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
     530             :     const __m128i __rounding,  // not named below; presumably read by btf_16_sse2 -- TODO confirm
     531             :     int8_t cos_bit) {  // likewise not named below -- TODO confirm the macro uses it
     532     1278590 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
     533     1278590 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     534     1278590 :     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
     535     1278590 :     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
     536     1278590 :     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
     537    20457400 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
     538     2557180 :     btf_16_adds_subs_sse2(x[8], x[11]);  // x[8..11]
     539     2557180 :     btf_16_adds_subs_sse2(x[9], x[10]);
     540     2557180 :     btf_16_subs_adds_sse2(x[15], x[12]);  // x[12..15], higher index passed first
     541     2557180 :     btf_16_subs_adds_sse2(x[14], x[13]);
     542    20457400 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);  // same weights for (18,29) and (19,28)
     543    20457400 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
     544    20457400 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);  // same weights for (20,27) and (21,26)
     545    20457400 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
     546     1278590 : }
     547             : // idct32 stage 7 over all of x: add/sub pairs in x[0..7] and x[16..31], btf_16_sse2 on (10,13) and (11,12).
     548     1278590 : static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
     549             :     const __m128i __rounding,  // not named below; presumably read by btf_16_sse2 -- TODO confirm
     550             :     int8_t cos_bit) {  // likewise not named below -- TODO confirm the macro uses it
     551     1278590 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
     552     1278590 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     553     2557180 :     btf_16_adds_subs_sse2(x[0], x[7]);  // x[i] paired with x[7 - i], i = 0..3
     554     2557180 :     btf_16_adds_subs_sse2(x[1], x[6]);
     555     2557180 :     btf_16_adds_subs_sse2(x[2], x[5]);
     556     2557180 :     btf_16_adds_subs_sse2(x[3], x[4]);
     557    20457500 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
     558    20457500 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
     559     2557180 :     btf_16_adds_subs_sse2(x[16], x[23]);  // x[16 + i] paired with x[23 - i], i = 0..3
     560     2557180 :     btf_16_adds_subs_sse2(x[17], x[22]);
     561     2557180 :     btf_16_adds_subs_sse2(x[18], x[21]);
     562     2557180 :     btf_16_adds_subs_sse2(x[19], x[20]);
     563     2557180 :     btf_16_subs_adds_sse2(x[31], x[24]);  // x[31 - i] paired with x[24 + i], higher index passed first
     564     2557180 :     btf_16_subs_adds_sse2(x[30], x[25]);
     565     2557180 :     btf_16_subs_adds_sse2(x[29], x[26]);
     566     2557180 :     btf_16_subs_adds_sse2(x[28], x[27]);
     567     1278590 : }
     568             : // idct32 stage 8: add/sub of x[i] with x[15 - i] for i = 0..7, then btf_16_sse2 on (20,27), (21,26), (22,25), (23,24).
     569     1278590 : static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
     570             :     const __m128i __rounding,  // not named below; presumably read by btf_16_sse2 -- TODO confirm
     571             :     int8_t cos_bit) {  // likewise not named below -- TODO confirm the macro uses it
     572     1278590 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
     573     1278590 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     574     2557180 :     btf_16_adds_subs_sse2(x[0], x[15]);
     575     2557180 :     btf_16_adds_subs_sse2(x[1], x[14]);
     576     2557180 :     btf_16_adds_subs_sse2(x[2], x[13]);
     577     2557180 :     btf_16_adds_subs_sse2(x[3], x[12]);
     578     2557180 :     btf_16_adds_subs_sse2(x[4], x[11]);
     579     2557180 :     btf_16_adds_subs_sse2(x[5], x[10]);
     580     2557180 :     btf_16_adds_subs_sse2(x[6], x[9]);
     581     2557180 :     btf_16_adds_subs_sse2(x[7], x[8]);
     582    20457500 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);  // all four calls share the cospi[32] weights
     583    20457500 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
     584    20457500 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
     585    20457500 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
     586     1278590 : }
     587             : // idct32 stage 9: for i = 0..15, btf_16_adds_subs_out_sse2 fills output[i] and output[31 - i] from x[i] and x[31 - i].
     588     1278590 : static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
     589     2557180 :     btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);  // presumably sum -> first, difference -> second; TODO confirm in macro
     590     2557180 :     btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
     591     2557180 :     btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
     592     2557180 :     btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
     593     2557180 :     btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
     594     2557180 :     btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
     595     2557180 :     btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
     596     2557180 :     btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
     597     2557180 :     btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
     598     2557180 :     btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
     599     2557180 :     btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
     600     2557180 :     btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
     601     2557180 :     btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
     602     2557180 :     btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
     603     2557180 :     btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
     604     2557180 :     btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
     605     1278590 : }
     606             : // idct32 variant that reads only input[0]: one btf_16_ssse3 call, then x[0]/x[1] copied into all 32 outputs.
     607          50 : static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
     608             :     int8_t cos_bit) {
     609             :     (void)cos_bit;  // parameter is unused here; the table is selected with INV_COS_BIT
     610          50 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
     611             : 
     612             :     // stage 1
     613             :     __m128i x[2];
     614          50 :     x[0] = input[0];
     615             : 
     616             :     // stage 2
     617             :     // stage 3
     618             :     // stage 4
     619             :     // stage 5
     620         200 :     btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);  // presumably writes x[0] and x[1] from input x[0] -- TODO confirm in macro
     621             : 
     622             :     // stage 6
     623             :     // stage 7
     624             :     // stage 8
     625             :     // stage 9 -- output[k] takes x[0] when k % 4 is 0 or 3, and x[1] when k % 4 is 1 or 2
     626          50 :     output[0] = x[0];
     627          50 :     output[31] = x[0];
     628          50 :     output[1] = x[1];
     629          50 :     output[30] = x[1];
     630          50 :     output[2] = x[1];
     631          50 :     output[29] = x[1];
     632          50 :     output[3] = x[0];
     633          50 :     output[28] = x[0];
     634          50 :     output[4] = x[0];
     635          50 :     output[27] = x[0];
     636          50 :     output[5] = x[1];
     637          50 :     output[26] = x[1];
     638          50 :     output[6] = x[1];
     639          50 :     output[25] = x[1];
     640          50 :     output[7] = x[0];
     641          50 :     output[24] = x[0];
     642          50 :     output[8] = x[0];
     643          50 :     output[23] = x[0];
     644          50 :     output[9] = x[1];
     645          50 :     output[22] = x[1];
     646          50 :     output[10] = x[1];
     647          50 :     output[21] = x[1];
     648          50 :     output[11] = x[0];
     649          50 :     output[20] = x[0];
     650          50 :     output[12] = x[0];
     651          50 :     output[19] = x[0];
     652          50 :     output[13] = x[1];
     653          50 :     output[18] = x[1];
     654          50 :     output[14] = x[1];
     655          50 :     output[17] = x[1];
     656          50 :     output[15] = x[0];
     657          50 :     output[16] = x[0];
     658          50 : }
     659             : // idct32 variant that reads only input[0..7]; writes output[0..31] through the shared stage 4..9 helpers.
     660          30 : static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
     661             :     int8_t cos_bit) {
     662             :     (void)cos_bit;
     663          30 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
     664          30 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // handed to the stage helpers below
     665             : 
     666             :     // stage 1 -- only every fourth x slot is loaded
     667             :     __m128i x[32];
     668          30 :     x[0] = input[0];
     669          30 :     x[4] = input[4];
     670          30 :     x[8] = input[2];
     671          30 :     x[12] = input[6];
     672          30 :     x[16] = input[1];
     673          30 :     x[20] = input[5];
     674          30 :     x[24] = input[3];
     675          30 :     x[28] = input[7];
     676             : 
     677             :     // stage 2 -- each call takes one loaded slot and writes two slots
     678         120 :     btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
     679         120 :     btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
     680         120 :     btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
     681         120 :     btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
     682             : 
     683             :     // stage 3
     684         120 :     btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
     685         120 :     btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
     686          30 :     x[17] = x[16];  // x[17], x[18], ... are still unwritten: filled by copying a neighbour
     687          30 :     x[18] = x[19];  // (idct32_low16_new_ssse3 calls idct32_high16_stage3_sse2 at this point instead)
     688          30 :     x[21] = x[20];
     689          30 :     x[22] = x[23];
     690          30 :     x[25] = x[24];
     691          30 :     x[26] = x[27];
     692          30 :     x[29] = x[28];
     693          30 :     x[30] = x[31];
     694             : 
     695             :     // stage 4
     696         120 :     btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
     697          30 :     x[9] = x[8];  // same copy pattern for the still-unwritten x[9], x[10], x[13], x[14]
     698          30 :     x[10] = x[11];
     699          30 :     x[13] = x[12];
     700          30 :     x[14] = x[15];
     701          30 :     idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
     702             : 
     703             :     // stage 5
     704         120 :     btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
     705          30 :     x[5] = x[4];
     706          30 :     x[6] = x[7];
     707          30 :     idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
     708             :     // stage 6
     709          30 :     x[3] = x[0];
     710          30 :     x[2] = x[1];
     711          30 :     idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
     712             :     // stage 7~9 -- all 32 slots of x are populated from here on
     713          30 :     idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
     714          30 :     idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
     715          30 :     idct32_stage9_sse2(output, x);
     716          30 : }
     717             : // idct32 variant that reads only input[0..15]; writes output[0..31] through the shared stage 3..9 helpers.
     718           8 : static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
     719             :     int8_t cos_bit) {
     720             :     (void)cos_bit;
     721           8 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
     722           8 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // handed to the stage helpers below
     723             : 
     724             :     // stage 1 -- only the even x slots are loaded
     725             :     __m128i x[32];
     726           8 :     x[0] = input[0];
     727           8 :     x[2] = input[8];
     728           8 :     x[4] = input[4];
     729           8 :     x[6] = input[12];
     730           8 :     x[8] = input[2];
     731           8 :     x[10] = input[10];
     732           8 :     x[12] = input[6];
     733           8 :     x[14] = input[14];
     734           8 :     x[16] = input[1];
     735           8 :     x[18] = input[9];
     736           8 :     x[20] = input[5];
     737           8 :     x[22] = input[13];
     738           8 :     x[24] = input[3];
     739           8 :     x[26] = input[11];
     740           8 :     x[28] = input[7];
     741           8 :     x[30] = input[15];
     742             : 
     743             :     // stage 2 -- each call takes one loaded slot and writes two slots of x[16..31]
     744          32 :     btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
     745          32 :     btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
     746          32 :     btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
     747          32 :     btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
     748          32 :     btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
     749          32 :     btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
     750          32 :     btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
     751          32 :     btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
     752             : 
     753             :     // stage 3
     754          32 :     btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
     755          32 :     btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
     756          32 :     btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
     757          32 :     btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
     758           8 :     idct32_high16_stage3_sse2(x);
     759             : 
     760             :     // stage 4
     761          32 :     btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
     762          32 :     btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
     763          16 :     btf_16_adds_subs_sse2(x[8], x[9]);
     764          16 :     btf_16_subs_adds_sse2(x[11], x[10]);  // higher index passed first => subs_adds form
     765          16 :     btf_16_adds_subs_sse2(x[12], x[13]);
     766          16 :     btf_16_subs_adds_sse2(x[15], x[14]);
     767           8 :     idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
     768             : 
     769             :     // stage 5
     770          32 :     btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
     771          32 :     btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
     772          16 :     btf_16_adds_subs_sse2(x[4], x[5]);
     773          16 :     btf_16_subs_adds_sse2(x[7], x[6]);
     774           8 :     idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
     775             :     // stage 6
     776          16 :     btf_16_adds_subs_sse2(x[0], x[3]);
     777          16 :     btf_16_adds_subs_sse2(x[1], x[2]);
     778           8 :     idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
     779             :     // stage 7~9
     780           8 :     idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
     781           8 :     idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
     782           8 :     idct32_stage9_sse2(output, x);
     783           8 : }
     784             : // Full idct32: reads input[0..31], writes output[0..31]; stages 4..9 reuse the shared idct32_* stage helpers.
     785     1278540 : static void idct32_new_sse2(const __m128i *input, __m128i *output,
     786             :     int8_t cos_bit) {
     787             :     (void)cos_bit;
     788     1278540 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
     789     1278540 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // handed to the stage helpers below
     790             :     // Packed weight pairs; each name spells the sign and cospi index of the two values.
     791     1278540 :     const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
     792     1278540 :     const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
     793     1278540 :     const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
     794     1278540 :     const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
     795     1278540 :     const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
     796     1278540 :     const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
     797     1278540 :     const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
     798     1278540 :     const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
     799     1278540 :     const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
     800     1278540 :     const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
     801     1278540 :     const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
     802     1278540 :     const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
     803     1278540 :     const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
     804     1278540 :     const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
     805     1278540 :     const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
     806     1278540 :     const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
     807     1278540 :     const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
     808     1278540 :     const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
     809     1278540 :     const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
     810     1278540 :     const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
     811     1278540 :     const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
     812     1278540 :     const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
     813     1278540 :     const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
     814     1278540 :     const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
     815     1278540 :     const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
     816     1278540 :     const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
     817     1278540 :     const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
     818     1278540 :     const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
     819     1278540 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
     820     1278540 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
     821     1278540 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
     822     1278540 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
     823             : 
     824             :     // stage 1 -- permuted load of all 32 inputs into x
     825             :     __m128i x[32];
     826     1278540 :     x[0] = input[0];
     827     1278540 :     x[1] = input[16];
     828     1278540 :     x[2] = input[8];
     829     1278540 :     x[3] = input[24];
     830     1278540 :     x[4] = input[4];
     831     1278540 :     x[5] = input[20];
     832     1278540 :     x[6] = input[12];
     833     1278540 :     x[7] = input[28];
     834     1278540 :     x[8] = input[2];
     835     1278540 :     x[9] = input[18];
     836     1278540 :     x[10] = input[10];
     837     1278540 :     x[11] = input[26];
     838     1278540 :     x[12] = input[6];
     839     1278540 :     x[13] = input[22];
     840     1278540 :     x[14] = input[14];
     841     1278540 :     x[15] = input[30];
     842     1278540 :     x[16] = input[1];
     843     1278540 :     x[17] = input[17];
     844     1278540 :     x[18] = input[9];
     845     1278540 :     x[19] = input[25];
     846     1278540 :     x[20] = input[5];
     847     1278540 :     x[21] = input[21];
     848     1278540 :     x[22] = input[13];
     849     1278540 :     x[23] = input[29];
     850     1278540 :     x[24] = input[3];
     851     1278540 :     x[25] = input[19];
     852     1278540 :     x[26] = input[11];
     853     1278540 :     x[27] = input[27];
     854     1278540 :     x[28] = input[7];
     855     1278540 :     x[29] = input[23];
     856     1278540 :     x[30] = input[15];
     857     1278540 :     x[31] = input[31];
     858             : 
     859             :     // stage 2 -- in-place btf_16_sse2 on x[16 + i] with x[31 - i], i = 0..7
     860    20456700 :     btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
     861    20456700 :     btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
     862    20456700 :     btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
     863    20456700 :     btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
     864    20456700 :     btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
     865    20456700 :     btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
     866    20456700 :     btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
     867    20456700 :     btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
     868             : 
     869             :     // stage 3 -- in-place btf_16_sse2 on x[8 + i] with x[15 - i], i = 0..3
     870    20456700 :     btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
     871    20456700 :     btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
     872    20456700 :     btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
     873    20456700 :     btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
     874     1278540 :     idct32_high16_stage3_sse2(x);
     875             : 
     876             :     // stage 4
     877    20456800 :     btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
     878    20456800 :     btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
     879     2557100 :     btf_16_adds_subs_sse2(x[8], x[9]);
     880     2557100 :     btf_16_subs_adds_sse2(x[11], x[10]);
     881     2557100 :     btf_16_adds_subs_sse2(x[12], x[13]);
     882     2557100 :     btf_16_subs_adds_sse2(x[15], x[14]);
     883     1278550 :     idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
     884             : 
     885             :     // stage 5
     886    20456800 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
     887    20456800 :     btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
     888     2557100 :     btf_16_adds_subs_sse2(x[4], x[5]);
     889     2557100 :     btf_16_subs_adds_sse2(x[7], x[6]);  // higher index first => subs_adds form, as in idct32_low16_new_ssse3
     890     1278550 :     idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
     891             : 
     892             :     // stage 6
     893     2557110 :     btf_16_adds_subs_sse2(x[0], x[3]);
     894     2557110 :     btf_16_adds_subs_sse2(x[1], x[2]);
     895     1278560 :     idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
     896             : 
     897             :     // stage 7~9
     898     1278560 :     idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
     899     1278550 :     idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
     900     1278550 :     idct32_stage9_sse2(output, x);
     901     1278560 : }
     902             : // idct64 stage 4 helper: eight in-place btf_16_sse2 calls on pairs inside x[33..62]. Never hit in this coverage run.
     903           0 : static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
     904             :     const __m128i __rounding,  // not named below; presumably read by btf_16_sse2 -- TODO confirm
     905             :     int8_t cos_bit) {  // likewise not named below -- TODO confirm the macro uses it
     906           0 :     const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
     907           0 :     const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
     908           0 :     const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
     909           0 :     const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
     910           0 :     const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
     911           0 :     const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
     912           0 :     const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
     913           0 :     const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
     914           0 :     const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
     915           0 :     const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
     916           0 :     const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
     917           0 :     const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
     918           0 :     btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);  // cospi[4]/cospi[60] weights
     919           0 :     btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
     920           0 :     btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);  // cospi[28]/cospi[36] weights
     921           0 :     btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
     922           0 :     btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);  // cospi[20]/cospi[44] weights
     923           0 :     btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
     924           0 :     btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);  // cospi[12]/cospi[52] weights
     925           0 :     btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
     926           0 : }
     927             : // idct64 stage 5 helper: btf_16_sse2 on four pairs inside x[17..30], then add/sub pairs across x[32..63]. Never hit in this run.
     928           0 : static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
     929             :     const __m128i __rounding,  // not named below; presumably read by btf_16_sse2 -- TODO confirm
     930             :     int8_t cos_bit) {  // likewise not named below -- TODO confirm the macro uses it
     931           0 :     const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
     932           0 :     const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
     933           0 :     const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
     934           0 :     const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
     935           0 :     const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
     936           0 :     const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
     937           0 :     btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);  // same four calls as idct32_high16_stage4_sse2
     938           0 :     btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
     939           0 :     btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
     940           0 :     btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
     941           0 :     btf_16_adds_subs_sse2(x[32], x[35]);  // groups of four slots; adds_subs and subs_adds groups alternate
     942           0 :     btf_16_adds_subs_sse2(x[33], x[34]);
     943           0 :     btf_16_subs_adds_sse2(x[39], x[36]);
     944           0 :     btf_16_subs_adds_sse2(x[38], x[37]);
     945           0 :     btf_16_adds_subs_sse2(x[40], x[43]);
     946           0 :     btf_16_adds_subs_sse2(x[41], x[42]);
     947           0 :     btf_16_subs_adds_sse2(x[47], x[44]);
     948           0 :     btf_16_subs_adds_sse2(x[46], x[45]);
     949           0 :     btf_16_adds_subs_sse2(x[48], x[51]);
     950           0 :     btf_16_adds_subs_sse2(x[49], x[50]);
     951           0 :     btf_16_subs_adds_sse2(x[55], x[52]);
     952           0 :     btf_16_subs_adds_sse2(x[54], x[53]);
     953           0 :     btf_16_adds_subs_sse2(x[56], x[59]);
     954           0 :     btf_16_adds_subs_sse2(x[57], x[58]);
     955           0 :     btf_16_subs_adds_sse2(x[63], x[60]);
     956           0 :     btf_16_subs_adds_sse2(x[62], x[61]);
     957           0 : }
     958             : // idct64 stage 6 helper: eight in-place btf_16_sse2 calls on pairs inside x[34..61]. Never hit in this coverage run.
     959           0 : static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
     960             :     const __m128i __rounding,  // not named below; presumably read by btf_16_sse2 -- TODO confirm
     961             :     int8_t cos_bit) {  // likewise not named below -- TODO confirm the macro uses it
     962           0 :     const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
     963           0 :     const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
     964           0 :     const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
     965           0 :     const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
     966           0 :     const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
     967           0 :     const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
     968           0 :     btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);  // each weight pair is applied to two adjacent slot pairs
     969           0 :     btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
     970           0 :     btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
     971           0 :     btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
     972           0 :     btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
     973           0 :     btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
     974           0 :     btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
     975           0 :     btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
     976           0 : }
     977             : // idct64 stage 6 helper: add/sub pairs inside x[16..31], then delegates to idct64_stage6_high32_sse2. Never hit in this run.
     978           0 : static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
     979             :     const __m128i __rounding,
     980             :     int8_t cos_bit) {
     981           0 :     btf_16_adds_subs_sse2(x[16], x[19]);  // same eight add/sub calls as the tail of idct32_high24_stage5_sse2
     982           0 :     btf_16_adds_subs_sse2(x[17], x[18]);
     983           0 :     btf_16_subs_adds_sse2(x[23], x[20]);
     984           0 :     btf_16_subs_adds_sse2(x[22], x[21]);
     985           0 :     btf_16_adds_subs_sse2(x[24], x[27]);
     986           0 :     btf_16_adds_subs_sse2(x[25], x[26]);
     987           0 :     btf_16_subs_adds_sse2(x[31], x[28]);
     988           0 :     btf_16_subs_adds_sse2(x[30], x[29]);
     989           0 :     idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);  // cospi, __rounding and cos_bit are only forwarded
     990           0 : }
     991             : // idct64 stage 7 helper: btf_16_sse2 on x[18..21] vs x[26..29], then add/sub pairs across x[32..63]. Never hit in this run.
     992           0 : static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
     993             :     const __m128i __rounding,  // not named below; presumably read by btf_16_sse2 -- TODO confirm
     994             :     int8_t cos_bit) {  // likewise not named below -- TODO confirm the macro uses it
     995           0 :     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
     996           0 :     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
     997           0 :     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
     998           0 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);  // same four calls as the tail of idct32_high28_stage6_sse2
     999           0 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
    1000           0 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
    1001           0 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
    1002           0 :     btf_16_adds_subs_sse2(x[32], x[39]);  // x[32 + i] paired with x[39 - i], i = 0..3
    1003           0 :     btf_16_adds_subs_sse2(x[33], x[38]);
    1004           0 :     btf_16_adds_subs_sse2(x[34], x[37]);
    1005           0 :     btf_16_adds_subs_sse2(x[35], x[36]);
    1006           0 :     btf_16_subs_adds_sse2(x[47], x[40]);  // x[47 - i] paired with x[40 + i], higher index passed first
    1007           0 :     btf_16_subs_adds_sse2(x[46], x[41]);
    1008           0 :     btf_16_subs_adds_sse2(x[45], x[42]);
    1009           0 :     btf_16_subs_adds_sse2(x[44], x[43]);
    1010           0 :     btf_16_adds_subs_sse2(x[48], x[55]);  // x[48 + i] paired with x[55 - i], i = 0..3
    1011           0 :     btf_16_adds_subs_sse2(x[49], x[54]);
    1012           0 :     btf_16_adds_subs_sse2(x[50], x[53]);
    1013           0 :     btf_16_adds_subs_sse2(x[51], x[52]);
    1014           0 :     btf_16_subs_adds_sse2(x[63], x[56]);  // x[63 - i] paired with x[56 + i], higher index passed first
    1015           0 :     btf_16_subs_adds_sse2(x[62], x[57]);
    1016           0 :     btf_16_subs_adds_sse2(x[61], x[58]);
    1017           0 :     btf_16_subs_adds_sse2(x[60], x[59]);
    1018           0 : }
    1019             : 
    1020           0 : static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
    1021             :     const __m128i __rounding,
    1022             :     int8_t cos_bit) {
                           // Stage 8 helper of the 64-point inverse DCT. Updates x in place and
                           // only touches x[16..31], x[36..43] and x[52..59]; x[0..15] is handled
                           // by the caller.
                           // NOTE(review): __rounding and cos_bit are not referenced by name in
                           // this body; presumably the btf_16_sse2 macro (defined elsewhere)
                           // picks them up implicitly -- confirm before renaming either one.
    1023           0 :     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    1024           0 :     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    1025           0 :     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
                           // Add/sub butterflies over x[16..31]: (16+i, 23-i) and (31-i, 24+i)
                           // for i = 0..3.
    1026           0 :     btf_16_adds_subs_sse2(x[16], x[23]);
    1027           0 :     btf_16_adds_subs_sse2(x[17], x[22]);
    1028           0 :     btf_16_adds_subs_sse2(x[18], x[21]);
    1029           0 :     btf_16_adds_subs_sse2(x[19], x[20]);
    1030           0 :     btf_16_subs_adds_sse2(x[31], x[24]);
    1031           0 :     btf_16_subs_adds_sse2(x[30], x[25]);
    1032           0 :     btf_16_subs_adds_sse2(x[29], x[26]);
    1033           0 :     btf_16_subs_adds_sse2(x[28], x[27]);
                           // Weighted butterflies with the cospi[16]/cospi[48] constants on the
                           // pairs (36+i, 59-i) and then (40+i, 55-i) for i = 0..3.
    1034           0 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
    1035           0 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
    1036           0 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
    1037           0 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
    1038           0 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
    1039           0 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
    1040           0 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
    1041           0 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
    1042           0 : }
    1043             : 
    1044           0 : static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
    1045             :     const __m128i __rounding,
    1046             :     int8_t cos_bit) {
                           // Stage 9 of the 64-point inverse DCT, applied in place. Touches
                           // x[0..15], x[20..27] and x[32..63].
                           // NOTE(review): __rounding and cos_bit are not referenced by name in
                           // this body; presumably the btf_16_sse2 macro (defined elsewhere)
                           // picks them up implicitly -- confirm before renaming either one.
    1047           0 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
    1048           0 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
                           // Add/sub butterflies pairing x[i] with x[15-i] for i = 0..7.
    1049           0 :     btf_16_adds_subs_sse2(x[0], x[15]);
    1050           0 :     btf_16_adds_subs_sse2(x[1], x[14]);
    1051           0 :     btf_16_adds_subs_sse2(x[2], x[13]);
    1052           0 :     btf_16_adds_subs_sse2(x[3], x[12]);
    1053           0 :     btf_16_adds_subs_sse2(x[4], x[11]);
    1054           0 :     btf_16_adds_subs_sse2(x[5], x[10]);
    1055           0 :     btf_16_adds_subs_sse2(x[6], x[9]);
    1056           0 :     btf_16_adds_subs_sse2(x[7], x[8]);
                           // Weighted butterflies with +/-cospi[32] on (20+i, 27-i), i = 0..3.
    1057           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
    1058           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
    1059           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
    1060           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
                           // Add/sub butterflies over x[32..63]: (32+i, 47-i) and (63-i, 48+i)
                           // for i = 0..7.
    1061           0 :     btf_16_adds_subs_sse2(x[32], x[47]);
    1062           0 :     btf_16_adds_subs_sse2(x[33], x[46]);
    1063           0 :     btf_16_adds_subs_sse2(x[34], x[45]);
    1064           0 :     btf_16_adds_subs_sse2(x[35], x[44]);
    1065           0 :     btf_16_adds_subs_sse2(x[36], x[43]);
    1066           0 :     btf_16_adds_subs_sse2(x[37], x[42]);
    1067           0 :     btf_16_adds_subs_sse2(x[38], x[41]);
    1068           0 :     btf_16_adds_subs_sse2(x[39], x[40]);
    1069           0 :     btf_16_subs_adds_sse2(x[63], x[48]);
    1070           0 :     btf_16_subs_adds_sse2(x[62], x[49]);
    1071           0 :     btf_16_subs_adds_sse2(x[61], x[50]);
    1072           0 :     btf_16_subs_adds_sse2(x[60], x[51]);
    1073           0 :     btf_16_subs_adds_sse2(x[59], x[52]);
    1074           0 :     btf_16_subs_adds_sse2(x[58], x[53]);
    1075           0 :     btf_16_subs_adds_sse2(x[57], x[54]);
    1076           0 :     btf_16_subs_adds_sse2(x[56], x[55]);
    1077           0 : }
    1078             : 
    1079           0 : static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
    1080             :     const __m128i __rounding,
    1081             :     int8_t cos_bit) {
                           // Stage 10 of the 64-point inverse DCT, applied in place. Touches
                           // x[0..31] and x[40..55]; x[32..39] and x[56..63] pass through.
                           // NOTE(review): __rounding and cos_bit are not referenced by name in
                           // this body; presumably the btf_16_sse2 macro (defined elsewhere)
                           // picks them up implicitly -- confirm before renaming either one.
    1082           0 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
    1083           0 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
                           // Add/sub butterflies pairing x[i] with x[31-i] for i = 0..15.
    1084           0 :     btf_16_adds_subs_sse2(x[0], x[31]);
    1085           0 :     btf_16_adds_subs_sse2(x[1], x[30]);
    1086           0 :     btf_16_adds_subs_sse2(x[2], x[29]);
    1087           0 :     btf_16_adds_subs_sse2(x[3], x[28]);
    1088           0 :     btf_16_adds_subs_sse2(x[4], x[27]);
    1089           0 :     btf_16_adds_subs_sse2(x[5], x[26]);
    1090           0 :     btf_16_adds_subs_sse2(x[6], x[25]);
    1091           0 :     btf_16_adds_subs_sse2(x[7], x[24]);
    1092           0 :     btf_16_adds_subs_sse2(x[8], x[23]);
    1093           0 :     btf_16_adds_subs_sse2(x[9], x[22]);
    1094           0 :     btf_16_adds_subs_sse2(x[10], x[21]);
    1095           0 :     btf_16_adds_subs_sse2(x[11], x[20]);
    1096           0 :     btf_16_adds_subs_sse2(x[12], x[19]);
    1097           0 :     btf_16_adds_subs_sse2(x[13], x[18]);
    1098           0 :     btf_16_adds_subs_sse2(x[14], x[17]);
    1099           0 :     btf_16_adds_subs_sse2(x[15], x[16]);
                           // Weighted butterflies with +/-cospi[32] on (40+i, 55-i), i = 0..7.
    1100           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
    1101           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
    1102           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
    1103           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
    1104           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
    1105           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
    1106           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
    1107           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
    1108           0 : }
    1109             : 
    1110           0 : static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
                           // Final (stage 11) step of the 64-point inverse DCT: for i = 0..31,
                           // combines x[i] and x[63-i] into output[i] and output[63-i], so all 64
                           // output vectors are written.
                           // NOTE(review): btf_16_adds_subs_out_sse2 is defined elsewhere;
                           // presumably the first output gets the sum and the second the
                           // difference -- confirm against its definition.
    1111           0 :     btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
    1112           0 :     btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
    1113           0 :     btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
    1114           0 :     btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
    1115           0 :     btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
    1116           0 :     btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
    1117           0 :     btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
    1118           0 :     btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
    1119           0 :     btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
    1120           0 :     btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
    1121           0 :     btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
    1122           0 :     btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
    1123           0 :     btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
    1124           0 :     btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
    1125           0 :     btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
    1126           0 :     btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
    1127           0 :     btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
    1128           0 :     btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
    1129           0 :     btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
    1130           0 :     btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
    1131           0 :     btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
    1132           0 :     btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
    1133           0 :     btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
    1134           0 :     btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
    1135           0 :     btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
    1136           0 :     btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
    1137           0 :     btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
    1138           0 :     btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
    1139           0 :     btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
    1140           0 :     btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
    1141           0 :     btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
    1142           0 :     btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
    1143           0 : }
    1144             : 
    1145           0 : static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
    1146             :     int8_t cos_bit) {
                           // 64-point inverse DCT variant that reads only input[0] and writes all
                           // of output[0..63]. Presumably used when only the first coefficient
                           // is non-zero -- TODO confirm against the caller.
                           // cos_bit is deliberately unused; the table comes from INV_COS_BIT.
    1147             :     (void)cos_bit;
    1148           0 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    1149             : 
    1150             :     // stage 1
                           // Only x[0] and x[1] are ever read or written in this function, even
                           // though the scratch array is declared with 32 entries.
    1151             :     __m128i x[32];
    1152           0 :     x[0] = input[0];
    1153             : 
    1154             :     // stage 2
    1155             :     // stage 3
    1156             :     // stage 4
    1157             :     // stage 5
    1158             :     // stage 6
                           // The only arithmetic step: derives x[0] and x[1] from x[0] using
                           // cospi[32] for both weights (btf_16_ssse3 is defined elsewhere).
    1159           0 :     btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    1160             : 
    1161             :     // stage 7
    1162             :     // stage 8
    1163             :     // stage 9
    1164             :     // stage 10
    1165             :     // stage 11
                           // Replicate the two values across the output: output[k] and
                           // output[63-k] both receive x[0] when k % 4 is 0 or 3, and x[1] when
                           // k % 4 is 1 or 2 (k = 0..31).
    1166           0 :     output[0] = x[0];
    1167           0 :     output[63] = x[0];
    1168           0 :     output[1] = x[1];
    1169           0 :     output[62] = x[1];
    1170           0 :     output[2] = x[1];
    1171           0 :     output[61] = x[1];
    1172           0 :     output[3] = x[0];
    1173           0 :     output[60] = x[0];
    1174           0 :     output[4] = x[0];
    1175           0 :     output[59] = x[0];
    1176           0 :     output[5] = x[1];
    1177           0 :     output[58] = x[1];
    1178           0 :     output[6] = x[1];
    1179           0 :     output[57] = x[1];
    1180           0 :     output[7] = x[0];
    1181           0 :     output[56] = x[0];
    1182           0 :     output[8] = x[0];
    1183           0 :     output[55] = x[0];
    1184           0 :     output[9] = x[1];
    1185           0 :     output[54] = x[1];
    1186           0 :     output[10] = x[1];
    1187           0 :     output[53] = x[1];
    1188           0 :     output[11] = x[0];
    1189           0 :     output[52] = x[0];
    1190           0 :     output[12] = x[0];
    1191           0 :     output[51] = x[0];
    1192           0 :     output[13] = x[1];
    1193           0 :     output[50] = x[1];
    1194           0 :     output[14] = x[1];
    1195           0 :     output[49] = x[1];
    1196           0 :     output[15] = x[0];
    1197           0 :     output[48] = x[0];
    1198           0 :     output[16] = x[0];
    1199           0 :     output[47] = x[0];
    1200           0 :     output[17] = x[1];
    1201           0 :     output[46] = x[1];
    1202           0 :     output[18] = x[1];
    1203           0 :     output[45] = x[1];
    1204           0 :     output[19] = x[0];
    1205           0 :     output[44] = x[0];
    1206           0 :     output[20] = x[0];
    1207           0 :     output[43] = x[0];
    1208           0 :     output[21] = x[1];
    1209           0 :     output[42] = x[1];
    1210           0 :     output[22] = x[1];
    1211           0 :     output[41] = x[1];
    1212           0 :     output[23] = x[0];
    1213           0 :     output[40] = x[0];
    1214           0 :     output[24] = x[0];
    1215           0 :     output[39] = x[0];
    1216           0 :     output[25] = x[1];
    1217           0 :     output[38] = x[1];
    1218           0 :     output[26] = x[1];
    1219           0 :     output[37] = x[1];
    1220           0 :     output[27] = x[0];
    1221           0 :     output[36] = x[0];
    1222           0 :     output[28] = x[0];
    1223           0 :     output[35] = x[0];
    1224           0 :     output[29] = x[1];
    1225           0 :     output[34] = x[1];
    1226           0 :     output[30] = x[1];
    1227           0 :     output[33] = x[1];
    1228           0 :     output[31] = x[0];
    1229           0 :     output[32] = x[0];
    1230           0 : }
    1231             : 
    1232           0 : static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
    1233             :     int8_t cos_bit) {
                           // 64-point inverse DCT variant that reads only input[0..7] and writes
                           // output[0..63] through idct64_stage11_sse2. Presumably used when only
                           // the first 8 coefficients are non-zero -- TODO confirm with caller.
                           // cos_bit is cast to void because the table and rounding constant are
                           // derived from INV_COS_BIT, yet it is still forwarded unchanged to the
                           // idct64_stage*_sse2 helpers below.
    1234             :     (void)cos_bit;
    1235           0 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    1236           0 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    1237           0 :     const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
    1238           0 :     const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
    1239           0 :     const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
    1240           0 :     const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
    1241           0 :     const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
    1242           0 :     const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
    1243           0 :     const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
    1244           0 :     const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
    1245           0 :     const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
    1246           0 :     const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
    1247           0 :     const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
    1248           0 :     const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
    1249           0 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    1250           0 :     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    1251           0 :     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    1252           0 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
    1253             : 
    1254             :     // stage 1
                           // Scatter the 8 inputs to every 8th slot of the 64-entry work array;
                           // the remaining slots are filled in by the later stages.
    1255             :     __m128i x[64];
    1256           0 :     x[0] = input[0];
    1257           0 :     x[8] = input[4];
    1258           0 :     x[16] = input[2];
    1259           0 :     x[24] = input[6];
    1260           0 :     x[32] = input[1];
    1261           0 :     x[40] = input[5];
    1262           0 :     x[48] = input[3];
    1263           0 :     x[56] = input[7];
    1264             : 
    1265             :     // stage 2
                           // Each call expands one populated slot into two slots (for example
                           // x[32] -> x[32], x[63]) using a pair of scalar cospi weights.
    1266           0 :     btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
    1267           0 :     btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
    1268           0 :     btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
    1269           0 :     btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
    1270             : 
    1271             :     // stage 3
                           // Plain copies take the place of add/sub butterflies here; presumably
                           // the partner operand is known to be zero for this variant.
    1272           0 :     btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    1273           0 :     btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
    1274           0 :     x[33] = x[32];
    1275           0 :     x[38] = x[39];
    1276           0 :     x[41] = x[40];
    1277           0 :     x[46] = x[47];
    1278           0 :     x[49] = x[48];
    1279           0 :     x[54] = x[55];
    1280           0 :     x[57] = x[56];
    1281           0 :     x[62] = x[63];
    1282             : 
    1283             :     // stage 4
    1284           0 :     btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    1285           0 :     x[17] = x[16];
    1286           0 :     x[22] = x[23];
    1287           0 :     x[25] = x[24];
    1288           0 :     x[30] = x[31];
    1289           0 :     btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
    1290           0 :     btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
    1291           0 :     btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
    1292           0 :     btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
    1293             : 
    1294             :     // stage 5
    1295           0 :     x[9] = x[8];
    1296           0 :     x[14] = x[15];
    1297           0 :     btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
    1298           0 :     btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
    1299           0 :     x[35] = x[32];
    1300           0 :     x[34] = x[33];
    1301           0 :     x[36] = x[39];
    1302           0 :     x[37] = x[38];
    1303           0 :     x[43] = x[40];
    1304           0 :     x[42] = x[41];
    1305           0 :     x[44] = x[47];
    1306           0 :     x[45] = x[46];
    1307           0 :     x[51] = x[48];
    1308           0 :     x[50] = x[49];
    1309           0 :     x[52] = x[55];
    1310           0 :     x[53] = x[54];
    1311           0 :     x[59] = x[56];
    1312           0 :     x[58] = x[57];
    1313           0 :     x[60] = x[63];
    1314           0 :     x[61] = x[62];
    1315             : 
    1316             :     // stage 6
    1317           0 :     btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    1318           0 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
    1319           0 :     x[19] = x[16];
    1320           0 :     x[18] = x[17];
    1321           0 :     x[20] = x[23];
    1322           0 :     x[21] = x[22];
    1323           0 :     x[27] = x[24];
    1324           0 :     x[26] = x[25];
    1325           0 :     x[28] = x[31];
    1326           0 :     x[29] = x[30];
                           // The x[32..63] portion of stage 6 is delegated to the shared helper.
    1327           0 :     idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
    1328             : 
    1329             :     // stage 7
    1330           0 :     x[3] = x[0];
    1331           0 :     x[2] = x[1];
    1332           0 :     x[11] = x[8];
    1333           0 :     x[10] = x[9];
    1334           0 :     x[12] = x[15];
    1335           0 :     x[13] = x[14];
    1336           0 :     idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
    1337             : 
    1338             :     // stage 8
    1339           0 :     x[7] = x[0];
    1340           0 :     x[6] = x[1];
    1341           0 :     x[5] = x[2];
    1342           0 :     x[4] = x[3];
                           // NOTE(review): the next statement is a self-assignment and has no
                           // effect; x[9] simply carries over from stage 6.
    1343           0 :     x[9] = x[9];
    1344           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
    1345           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
    1346           0 :     idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
    1347             : 
                           // stages 9-11: shared full-width helpers; stage 11 writes output[0..63].
    1348           0 :     idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
    1349           0 :     idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
    1350           0 :     idct64_stage11_sse2(output, x);
    1351           0 : }
    1352             : 
    1353           0 : static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
    1354             :     int8_t cos_bit) {
                           // 64-point inverse DCT variant that reads only input[0..15] and writes
                           // output[0..63] through idct64_stage11_sse2. Presumably used when only
                           // the first 16 coefficients are non-zero -- TODO confirm with caller.
                           // cos_bit is cast to void because the table and rounding constant are
                           // derived from INV_COS_BIT, yet it is still forwarded unchanged to the
                           // idct64_stage*_sse2 helpers below.
    1355             :     (void)cos_bit;
    1356           0 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    1357           0 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    1358             : 
    1359           0 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    1360           0 :     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    1361           0 :     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    1362           0 :     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    1363           0 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
    1364             : 
    1365             :     // stage 1
                           // Scatter the 16 inputs to every 4th slot of the 64-entry work array;
                           // the remaining slots are filled in by the later stages.
    1366             :     __m128i x[64];
    1367           0 :     x[0] = input[0];
    1368           0 :     x[4] = input[8];
    1369           0 :     x[8] = input[4];
    1370           0 :     x[12] = input[12];
    1371           0 :     x[16] = input[2];
    1372           0 :     x[20] = input[10];
    1373           0 :     x[24] = input[6];
    1374           0 :     x[28] = input[14];
    1375           0 :     x[32] = input[1];
    1376           0 :     x[36] = input[9];
    1377           0 :     x[40] = input[5];
    1378           0 :     x[44] = input[13];
    1379           0 :     x[48] = input[3];
    1380           0 :     x[52] = input[11];
    1381           0 :     x[56] = input[7];
    1382           0 :     x[60] = input[15];
    1383             : 
    1384             :     // stage 2
                           // Each call expands one populated slot into two slots (for example
                           // x[32] -> x[32], x[63]) using a pair of scalar cospi weights.
    1385           0 :     btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
    1386           0 :     btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
    1387           0 :     btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
    1388           0 :     btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
    1389           0 :     btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
    1390           0 :     btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
    1391           0 :     btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
    1392           0 :     btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
    1393             : 
    1394             :     // stage 3
                           // Plain copies take the place of add/sub butterflies here; presumably
                           // the partner operand is known to be zero for this variant.
    1395           0 :     btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    1396           0 :     btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
    1397           0 :     btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
    1398           0 :     btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
    1399           0 :     x[33] = x[32];
    1400           0 :     x[34] = x[35];
    1401           0 :     x[37] = x[36];
    1402           0 :     x[38] = x[39];
    1403           0 :     x[41] = x[40];
    1404           0 :     x[42] = x[43];
    1405           0 :     x[45] = x[44];
    1406           0 :     x[46] = x[47];
    1407           0 :     x[49] = x[48];
    1408           0 :     x[50] = x[51];
    1409           0 :     x[53] = x[52];
    1410           0 :     x[54] = x[55];
    1411           0 :     x[57] = x[56];
    1412           0 :     x[58] = x[59];
    1413           0 :     x[61] = x[60];
    1414           0 :     x[62] = x[63];
    1415             : 
    1416             :     // stage 4
    1417           0 :     btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    1418           0 :     btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
    1419           0 :     x[17] = x[16];
    1420           0 :     x[18] = x[19];
    1421           0 :     x[21] = x[20];
    1422           0 :     x[22] = x[23];
    1423           0 :     x[25] = x[24];
    1424           0 :     x[26] = x[27];
    1425           0 :     x[29] = x[28];
    1426           0 :     x[30] = x[31];
                           // The upper part of each of stages 4-8 is delegated to a helper
                           // shared with the other idct64 variants in this file.
    1427           0 :     idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
    1428             : 
    1429             :     // stage 5
    1430           0 :     btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    1431           0 :     x[9] = x[8];
    1432           0 :     x[10] = x[11];
    1433           0 :     x[13] = x[12];
    1434           0 :     x[14] = x[15];
    1435           0 :     idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
    1436             : 
    1437             :     // stage 6
    1438           0 :     btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    1439           0 :     x[5] = x[4];
    1440           0 :     x[6] = x[7];
    1441           0 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
    1442           0 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
    1443           0 :     idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
    1444             : 
    1445             :     // stage 7
    1446           0 :     x[3] = x[0];
    1447           0 :     x[2] = x[1];
    1448           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
    1449           0 :     btf_16_adds_subs_sse2(x[8], x[11]);
    1450           0 :     btf_16_adds_subs_sse2(x[9], x[10]);
    1451           0 :     btf_16_subs_adds_sse2(x[15], x[12]);
    1452           0 :     btf_16_subs_adds_sse2(x[14], x[13]);
    1453           0 :     idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
    1454             : 
    1455             :     // stage 8
    1456           0 :     btf_16_adds_subs_sse2(x[0], x[7]);
    1457           0 :     btf_16_adds_subs_sse2(x[1], x[6]);
    1458           0 :     btf_16_adds_subs_sse2(x[2], x[5]);
    1459           0 :     btf_16_adds_subs_sse2(x[3], x[4]);
    1460           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
    1461           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
    1462           0 :     idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
    1463             : 
                           // stages 9-11: shared full-width helpers; stage 11 writes output[0..63].
    1464           0 :     idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
    1465           0 :     idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
    1466           0 :     idct64_stage11_sse2(output, x);
    1467           0 : }
    1468             : 
    1469           0 : static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
    1470             :     int8_t cos_bit) {
    1471             :     (void)cos_bit;
    1472           0 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    1473           0 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    1474             : 
    1475           0 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    1476           0 :     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    1477           0 :     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    1478           0 :     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    1479           0 :     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
    1480             : 
    1481             :     // stage 1
    1482             :     __m128i x[64];
    1483           0 :     x[0] = input[0];
    1484           0 :     x[2] = input[16];
    1485           0 :     x[4] = input[8];
    1486           0 :     x[6] = input[24];
    1487           0 :     x[8] = input[4];
    1488           0 :     x[10] = input[20];
    1489           0 :     x[12] = input[12];
    1490           0 :     x[14] = input[28];
    1491           0 :     x[16] = input[2];
    1492           0 :     x[18] = input[18];
    1493           0 :     x[20] = input[10];
    1494           0 :     x[22] = input[26];
    1495           0 :     x[24] = input[6];
    1496           0 :     x[26] = input[22];
    1497           0 :     x[28] = input[14];
    1498           0 :     x[30] = input[30];
    1499           0 :     x[32] = input[1];
    1500           0 :     x[34] = input[17];
    1501           0 :     x[36] = input[9];
    1502           0 :     x[38] = input[25];
    1503           0 :     x[40] = input[5];
    1504           0 :     x[42] = input[21];
    1505           0 :     x[44] = input[13];
    1506           0 :     x[46] = input[29];
    1507           0 :     x[48] = input[3];
    1508           0 :     x[50] = input[19];
    1509           0 :     x[52] = input[11];
    1510           0 :     x[54] = input[27];
    1511           0 :     x[56] = input[7];
    1512           0 :     x[58] = input[23];
    1513           0 :     x[60] = input[15];
    1514           0 :     x[62] = input[31];
    1515             : 
    1516             :     // stage 2
    1517           0 :     btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
    1518           0 :     btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
    1519           0 :     btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
    1520           0 :     btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
    1521           0 :     btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
    1522           0 :     btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
    1523           0 :     btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
    1524           0 :     btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
    1525           0 :     btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
    1526           0 :     btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
    1527           0 :     btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
    1528           0 :     btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
    1529           0 :     btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
    1530           0 :     btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
    1531           0 :     btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
    1532           0 :     btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
    1533             : 
    1534             :     // stage 3
    1535           0 :     btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    1536           0 :     btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
    1537           0 :     btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
    1538           0 :     btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
    1539           0 :     btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
    1540           0 :     btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
    1541           0 :     btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
    1542           0 :     btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
    1543           0 :     btf_16_adds_subs_sse2(x[32], x[33]);
    1544           0 :     btf_16_subs_adds_sse2(x[35], x[34]);
    1545           0 :     btf_16_adds_subs_sse2(x[36], x[37]);
    1546           0 :     btf_16_subs_adds_sse2(x[39], x[38]);
    1547           0 :     btf_16_adds_subs_sse2(x[40], x[41]);
    1548           0 :     btf_16_subs_adds_sse2(x[43], x[42]);
    1549           0 :     btf_16_adds_subs_sse2(x[44], x[45]);
    1550           0 :     btf_16_subs_adds_sse2(x[47], x[46]);
    1551           0 :     btf_16_adds_subs_sse2(x[48], x[49]);
    1552           0 :     btf_16_subs_adds_sse2(x[51], x[50]);
    1553           0 :     btf_16_adds_subs_sse2(x[52], x[53]);
    1554           0 :     btf_16_subs_adds_sse2(x[55], x[54]);
    1555           0 :     btf_16_adds_subs_sse2(x[56], x[57]);
    1556           0 :     btf_16_subs_adds_sse2(x[59], x[58]);
    1557           0 :     btf_16_adds_subs_sse2(x[60], x[61]);
    1558           0 :     btf_16_subs_adds_sse2(x[63], x[62]);
    1559             : 
    1560             :     // stage 4
    1561           0 :     btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    1562           0 :     btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
    1563           0 :     btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
    1564           0 :     btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
    1565           0 :     btf_16_adds_subs_sse2(x[16], x[17]);
    1566           0 :     btf_16_subs_adds_sse2(x[19], x[18]);
    1567           0 :     btf_16_adds_subs_sse2(x[20], x[21]);
    1568           0 :     btf_16_subs_adds_sse2(x[23], x[22]);
    1569           0 :     btf_16_adds_subs_sse2(x[24], x[25]);
    1570           0 :     btf_16_subs_adds_sse2(x[27], x[26]);
    1571           0 :     btf_16_adds_subs_sse2(x[28], x[29]);
    1572           0 :     btf_16_subs_adds_sse2(x[31], x[30]);
    1573           0 :     idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
    1574             : 
    1575             :     // stage 5
    1576           0 :     btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    1577           0 :     btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
    1578           0 :     btf_16_adds_subs_sse2(x[8], x[9]);
    1579           0 :     btf_16_subs_adds_sse2(x[11], x[10]);
    1580           0 :     btf_16_adds_subs_sse2(x[12], x[13]);
    1581           0 :     btf_16_subs_adds_sse2(x[15], x[14]);
    1582           0 :     idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
    1583             : 
    1584             :     // stage 6
    1585           0 :     btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    1586           0 :     btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
    1587           0 :     btf_16_adds_subs_sse2(x[4], x[5]);
    1588           0 :     btf_16_subs_adds_sse2(x[7], x[6]);
    1589           0 :     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
    1590           0 :     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
    1591           0 :     idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
    1592             : 
    1593             :     // stage 7
    1594           0 :     btf_16_adds_subs_sse2(x[0], x[3]);
    1595           0 :     btf_16_adds_subs_sse2(x[1], x[2]);
    1596           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
    1597           0 :     btf_16_adds_subs_sse2(x[8], x[11]);
    1598           0 :     btf_16_adds_subs_sse2(x[9], x[10]);
    1599           0 :     btf_16_subs_adds_sse2(x[15], x[12]);
    1600           0 :     btf_16_subs_adds_sse2(x[14], x[13]);
    1601           0 :     idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
    1602             : 
    1603             :     // stage 8
    1604           0 :     btf_16_adds_subs_sse2(x[0], x[7]);
    1605           0 :     btf_16_adds_subs_sse2(x[1], x[6]);
    1606           0 :     btf_16_adds_subs_sse2(x[2], x[5]);
    1607           0 :     btf_16_adds_subs_sse2(x[3], x[4]);
    1608           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
    1609           0 :     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
    1610           0 :     idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
    1611             : 
    1612             :     // stage 9~11
    1613           0 :     idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
    1614           0 :     idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
    1615           0 :     idct64_stage11_sse2(output, x);
    1616           0 : }
    1617             : // 4-point inverse ADST (per the function name) on all eight 16-bit lanes of input[0..3]; writes output[0..3].
    1618     2737620 : static void iadst4_new_sse2(const __m128i *input, __m128i *output,
    1619             :     int8_t cos_bit) {
    1620             :     (void)cos_bit;  // unused here: the table lookup, rounding and shift all use INV_COS_BIT
    1621     2737620 :     const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
    1622     2737600 :     const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
    1623     2737600 :     const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
    1624     2737600 :     const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
    1625     2737600 :     const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
    1626     2737600 :     const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
    1627     2737600 :     const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
    1628     2737600 :     const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
    1629     2737600 :     const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
    1630             :     __m128i x0[4];
    1631     2737600 :     x0[0] = input[0];
    1632     2737600 :     x0[1] = input[1];
    1633     2737600 :     x0[2] = input[2];
    1634     2737600 :     x0[3] = input[3];
    1635             : 
    1636             :     __m128i u[4];  // interleaved 16-bit pairs: u[0]/u[1] = (x0, x2) low/high lanes, u[2]/u[3] = (x1, x3) low/high lanes
    1637     2737600 :     u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
    1638     2737600 :     u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
    1639     2737600 :     u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
    1640     2737600 :     u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
    1641             : 
    1642             :     __m128i x1[16];  // 32-bit partial sums; even index = low four lanes, odd index = high four lanes
    1643     2737600 :     x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
    1644     2737600 :     x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
    1645     2737600 :     x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
    1646     2737600 :     x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
    1647     2737600 :     x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
    1648     2737600 :     x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
    1649     2737600 :     x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
    1650     2737600 :     x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
    1651     2737600 :     x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
    1652     2737600 :     x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
    1653     2737600 :     x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x3*sin3 (the x1 term is multiplied by 0)
    1654     2737600 :     x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
    1655     2737600 :     x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
    1656     2737600 :     x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
    1657     2737600 :     x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
    1658     2737600 :     x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
    1659             : 
    1660             :     __m128i x2[8];  // the four output rows as 32-bit sums, again split into low/high lane halves
    1661     2737600 :     x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
    1662     2737600 :     x2[1] = _mm_add_epi32(x1[1], x1[5]);
    1663     2737600 :     x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
    1664     2737600 :     x2[3] = _mm_add_epi32(x1[3], x1[7]);
    1665     2737600 :     x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
    1666     2737600 :     x2[5] = _mm_add_epi32(x1[9], x1[11]);
    1667     2737600 :     x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1
    1668     5475200 :     x2[7] = _mm_add_epi32(x1[13], x1[15]);
    1669             : 
    1670     2737600 :     const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of the divisor, for round-to-nearest
    1671    13688000 :     for (int32_t i = 0; i < 4; ++i) {
    1672    10950400 :         __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);  // low four lanes of row i
    1673    21900800 :         __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);  // high four lanes of row i
    1674    10950400 :         out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    1675    10950400 :         out1 = _mm_srai_epi32(out1, INV_COS_BIT);
    1676    21900800 :         output[i] = _mm_packs_epi32(out0, out1);  // back to eight int16 lanes with signed saturation
    1677             :     }
    1678     2737600 : }
    1679             : // 4-point inverse ADST (per the function name) that uses only the low four 16-bit lanes of input[0..3].
    1680             : // TODO(binpengsmail@gmail.com):
    1681             : // To explore the reuse of VP9 versions of corresponding SSE2 functions and
    1682             : // evaluate whether there is a possibility for further speedup.
    1683     7637690 : static void iadst4_w4_new_sse2(const __m128i *input, __m128i *output,
    1684             :     int8_t cos_bit) {
    1685             :     (void)cos_bit;  // unused here: the table lookup, rounding and shift all use INV_COS_BIT
    1686     7637690 :     const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
    1687     7637760 :     const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
    1688     7637760 :     const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
    1689     7637760 :     const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
    1690     7637760 :     const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
    1691     7637760 :     const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
    1692     7637760 :     const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
    1693     7637760 :     const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
    1694     7637760 :     const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
    1695             :     __m128i x0[4];
    1696     7637760 :     x0[0] = input[0];
    1697     7637760 :     x0[1] = input[1];
    1698     7637760 :     x0[2] = input[2];
    1699     7637760 :     x0[3] = input[3];
    1700             : 
    1701             :     __m128i u[2];  // interleaved low-lane pairs: u[0] = (x0, x2), u[1] = (x1, x3); high lanes are never read
    1702     7637760 :     u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
    1703     7637760 :     u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
    1704             : 
    1705             :     __m128i x1[8];  // 32-bit partial sums, four lanes each
    1706     7637760 :     x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
    1707     7637760 :     x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
    1708     7637760 :     x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02);  // x1*sin3 + x3*sin2
    1709     7637760 :     x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04);  // x1*sin3 - x3*sin4
    1710     7637760 :     x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
    1711     7637760 :     x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);    // x3*sin3 (the x1 term is multiplied by 0)
    1712     7637760 :     x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
    1713     7637760 :     x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
    1714             : 
    1715             :     __m128i x2[4];  // the four output rows as 32-bit sums
    1716     7637760 :     x2[0] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
    1717     7637760 :     x2[1] = _mm_add_epi32(x1[1], x1[3]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
    1718     7637760 :     x2[2] = _mm_add_epi32(x1[4], x1[5]);  // x0*sin3 - x2*sin3 + x3*sin3
    1719    15275500 :     x2[3] = _mm_add_epi32(x1[6], x1[7]);  // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
    1720             : 
    1721     7637760 :     const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of the divisor, for round-to-nearest
    1722    38189000 :     for (int32_t i = 0; i < 4; ++i) {
    1723    61102600 :         __m128i out0 = _mm_add_epi32(x2[i], rounding);
    1724    30551300 :         out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    1725    61102600 :         output[i] = _mm_packs_epi32(out0, out0);  // saturating pack; the four results land in both halves of output[i]
    1726             :     }
    1727     7637760 : }
    1728             : // 8-point inverse ADST variant that reads only input[0] and writes output[0..7].
    1729        1342 : static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
    1730             :     int8_t cos_bit) {
    1731             :     (void)cos_bit;  // NOTE(review): the btf_16_* macros (defined outside this chunk) presumably still read cos_bit/__rounding -- confirm
    1732        1342 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    1733        1342 :     const __m128i __zero = _mm_setzero_si128();
    1734        1342 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    1735             : 
    1736        1342 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    1737        1342 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    1738        1342 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    1739        1342 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    1740             : 
    1741             :     // stage 1
    1742             :     __m128i x[8];
    1743        1342 :     x[1] = input[0];  // only input[0] is loaded; NOTE(review): presumably the caller guarantees the other coefficients are zero -- confirm
    1744             : 
    1745             :     // stage 2
    1746        5368 :     btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);  // x[0] is first assigned here; NOTE(review): assumes the macro's last two arguments are outputs -- confirm
    1747             : 
    1748             :     // stage 3
    1749        1342 :     x[4] = x[0];  // plain copies in this variant
    1750        1342 :     x[5] = x[1];
    1751             : 
    1752             :     // stage 4
    1753       21472 :     btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
    1754             : 
    1755             :     // stage 5
    1756        1342 :     x[2] = x[0];  // plain copies in this variant
    1757        1342 :     x[3] = x[1];
    1758        1342 :     x[6] = x[4];
    1759        1342 :     x[7] = x[5];
    1760             : 
    1761             :     // stage 6
    1762       21472 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
    1763       21472 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
    1764             : 
    1765             :     // stage 7
    1766        1342 :     output[0] = x[0];  // output permutation; odd-indexed outputs are negated with a saturating 0 - x
    1767        1342 :     output[1] = _mm_subs_epi16(__zero, x[4]);
    1768        1342 :     output[2] = x[6];
    1769        1342 :     output[3] = _mm_subs_epi16(__zero, x[2]);
    1770        1342 :     output[4] = x[3];
    1771        1342 :     output[5] = _mm_subs_epi16(__zero, x[7]);
    1772        1342 :     output[6] = x[5];
    1773        1342 :     output[7] = _mm_subs_epi16(__zero, x[1]);
    1774        1342 : }
    1775             : // 8-point inverse ADST (per the function name) on 16-bit lanes: reads input[0..7], writes output[0..7].
    1776     7163460 : static void iadst8_new_sse2(const __m128i *input, __m128i *output,
    1777             :     int8_t cos_bit) {
    1778             :     (void)cos_bit;  // NOTE(review): the btf_16_sse2 macro (defined outside this chunk) presumably still reads cos_bit/__rounding -- confirm
    1779     7163460 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    1780     7163310 :     const __m128i __zero = _mm_setzero_si128();
    1781     7163310 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // not named again below; see note on cos_bit
    1782             : 
    1783     7163310 :     const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    1784     7163310 :     const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    1785     7163310 :     const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    1786     7163310 :     const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    1787     7163310 :     const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    1788     7163310 :     const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    1789     7163310 :     const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    1790     7163310 :     const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    1791     7163310 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    1792     7163310 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    1793     7163310 :     const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    1794     7163310 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    1795     7163310 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    1796             : 
    1797             :     // stage 1
    1798             :     __m128i x[8];  // working set, loaded in permuted input order
    1799     7163310 :     x[0] = input[7];
    1800     7163310 :     x[1] = input[0];
    1801     7163310 :     x[2] = input[5];
    1802     7163310 :     x[3] = input[2];
    1803     7163310 :     x[4] = input[3];
    1804     7163310 :     x[5] = input[4];
    1805     7163310 :     x[6] = input[1];
    1806     7163310 :     x[7] = input[6];
    1807             : 
    1808             :     // stage 2
    1809   114613000 :     btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
    1810   114613000 :     btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
    1811   114613000 :     btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
    1812   114613000 :     btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
    1813             : 
    1814             :     // stage 3
    1815    14326600 :     btf_16_adds_subs_sse2(x[0], x[4]);  // pairs are four apart
    1816    14326600 :     btf_16_adds_subs_sse2(x[1], x[5]);
    1817    14326600 :     btf_16_adds_subs_sse2(x[2], x[6]);
    1818    14326600 :     btf_16_adds_subs_sse2(x[3], x[7]);
    1819             : 
    1820             :     // stage 4
    1821   114613000 :     btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
    1822   114613000 :     btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
    1823             : 
    1824             :     // stage 5
    1825    14326600 :     btf_16_adds_subs_sse2(x[0], x[2]);  // pairs are two apart
    1826    14326600 :     btf_16_adds_subs_sse2(x[1], x[3]);
    1827    14326600 :     btf_16_adds_subs_sse2(x[4], x[6]);
    1828    14326600 :     btf_16_adds_subs_sse2(x[5], x[7]);
    1829             : 
    1830             :     // stage 6
    1831   114613000 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
    1832   114613000 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
    1833             : 
    1834             :     // stage 7
    1835     7163310 :     output[0] = x[0];  // output permutation; odd-indexed outputs are negated with a saturating 0 - x
    1836     7163310 :     output[1] = _mm_subs_epi16(__zero, x[4]);
    1837     7163310 :     output[2] = x[6];
    1838     7163310 :     output[3] = _mm_subs_epi16(__zero, x[2]);
    1839     7163310 :     output[4] = x[3];
    1840     7163310 :     output[5] = _mm_subs_epi16(__zero, x[7]);
    1841     7163310 :     output[6] = x[5];
    1842     7163310 :     output[7] = _mm_subs_epi16(__zero, x[1]);
    1843     7163310 : }
    1844             : // Same stage sequence as iadst8_new_sse2, but every rotation uses btf_16_4p_sse2 instead of btf_16_sse2.
    1845     1321730 : static void iadst8_w4_new_sse2(const __m128i *input, __m128i *output,
    1846             :     int8_t cos_bit) {
    1847             :     (void)cos_bit;  // NOTE(review): the btf_16_4p_sse2 macro (defined outside this chunk) presumably still reads cos_bit/__rounding -- confirm
    1848     1321730 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    1849     1321720 :     const __m128i __zero = _mm_setzero_si128();
    1850     1321720 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // not named again below; see note on cos_bit
    1851             : 
    1852     1321720 :     const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    1853     1321720 :     const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    1854     1321720 :     const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    1855     1321720 :     const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    1856     1321720 :     const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    1857     1321720 :     const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    1858     1321720 :     const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    1859     1321720 :     const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    1860     1321720 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    1861     1321720 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    1862     1321720 :     const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    1863     1321720 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    1864     1321720 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    1865             : 
    1866             :     // stage 1
    1867             :     __m128i x[8];  // working set, loaded in permuted input order
    1868     1321720 :     x[0] = input[7];
    1869     1321720 :     x[1] = input[0];
    1870     1321720 :     x[2] = input[5];
    1871     1321720 :     x[3] = input[2];
    1872     1321720 :     x[4] = input[3];
    1873     1321720 :     x[5] = input[4];
    1874     1321720 :     x[6] = input[1];
    1875     1321720 :     x[7] = input[6];
    1876             : 
    1877             :     // stage 2
    1878    11895500 :     btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);  // NOTE(review): "_4p" presumably means only four lanes are processed -- confirm in the macro definition
    1879    11895500 :     btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
    1880    11895500 :     btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
    1881    11895500 :     btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
    1882             : 
    1883             :     // stage 3
    1884     2643450 :     btf_16_adds_subs_sse2(x[0], x[4]);  // pairs are four apart
    1885     2643450 :     btf_16_adds_subs_sse2(x[1], x[5]);
    1886     2643450 :     btf_16_adds_subs_sse2(x[2], x[6]);
    1887     2643450 :     btf_16_adds_subs_sse2(x[3], x[7]);
    1888             : 
    1889             :     // stage 4
    1890    11895500 :     btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
    1891    11895500 :     btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
    1892             : 
    1893             :     // stage 5
    1894     2643450 :     btf_16_adds_subs_sse2(x[0], x[2]);  // pairs are two apart
    1895     2643450 :     btf_16_adds_subs_sse2(x[1], x[3]);
    1896     2643450 :     btf_16_adds_subs_sse2(x[4], x[6]);
    1897     2643450 :     btf_16_adds_subs_sse2(x[5], x[7]);
    1898             : 
    1899             :     // stage 6
    1900    11895500 :     btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
    1901    11895500 :     btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
    1902             : 
    1903             :     // stage 7
    1904     1321720 :     output[0] = x[0];  // output permutation; odd-indexed outputs are negated with a saturating 0 - x
    1905     1321720 :     output[1] = _mm_subs_epi16(__zero, x[4]);
    1906     1321720 :     output[2] = x[6];
    1907     1321720 :     output[3] = _mm_subs_epi16(__zero, x[2]);
    1908     1321720 :     output[4] = x[3];
    1909     1321720 :     output[5] = _mm_subs_epi16(__zero, x[7]);
    1910     1321720 :     output[6] = x[5];
    1911     1321720 :     output[7] = _mm_subs_epi16(__zero, x[1]);
    1912     1321720 : }
    1913             : // iadst16 stage 3: applies btf_16_adds_subs_sse2 in place to each pair (x[i], x[i + 8]) for i = 0..7.
    1914     1575200 : static INLINE void iadst16_stage3_ssse3(__m128i *x) {
    1915     3150400 :     btf_16_adds_subs_sse2(x[0], x[8]);  // NOTE(review): macro defined outside this chunk; presumably writes a saturating sum and difference back to its two arguments -- confirm
    1916     3150400 :     btf_16_adds_subs_sse2(x[1], x[9]);
    1917     3150400 :     btf_16_adds_subs_sse2(x[2], x[10]);
    1918     3150400 :     btf_16_adds_subs_sse2(x[3], x[11]);
    1919     3150400 :     btf_16_adds_subs_sse2(x[4], x[12]);
    1920     3150400 :     btf_16_adds_subs_sse2(x[5], x[13]);
    1921     3150400 :     btf_16_adds_subs_sse2(x[6], x[14]);
    1922     3150400 :     btf_16_adds_subs_sse2(x[7], x[15]);
    1923     1575200 : }
    1924             : // iadst16 stage 4: applies btf_16_sse2 in place to the pairs x[8..9], x[10..11], x[12..13], x[14..15]; x[0..7] are untouched.
    1925      854235 : static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
    1926             :     const __m128i __rounding,
    1927             :     int8_t cos_bit) {  // NOTE(review): __rounding/cos_bit are not named in the body; presumably btf_16_sse2 (macro defined outside this chunk) reads them -- confirm
    1928      854235 :     const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    1929      854235 :     const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    1930      854235 :     const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    1931      854235 :     const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    1932      854235 :     const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
    1933      854235 :     const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
    1934    13667800 :     btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
    1935    13667800 :     btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
    1936    13667800 :     btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
    1937    13667800 :     btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
    1938      854235 : }
    1939             : // iadst16 stage 5: applies btf_16_adds_subs_sse2 in place to pairs four apart within x[0..7] and within x[8..15].
    1940     1575190 : static INLINE void iadst16_stage5_ssse3(__m128i *x) {
    1941     3150380 :     btf_16_adds_subs_sse2(x[0], x[4]);  // first half: (0,4) (1,5) (2,6) (3,7)
    1942     3150380 :     btf_16_adds_subs_sse2(x[1], x[5]);
    1943     3150380 :     btf_16_adds_subs_sse2(x[2], x[6]);
    1944     3150380 :     btf_16_adds_subs_sse2(x[3], x[7]);
    1945     3150380 :     btf_16_adds_subs_sse2(x[8], x[12]);  // second half: (8,12) (9,13) (10,14) (11,15)
    1946     3150380 :     btf_16_adds_subs_sse2(x[9], x[13]);
    1947     3150380 :     btf_16_adds_subs_sse2(x[10], x[14]);
    1948     3150380 :     btf_16_adds_subs_sse2(x[11], x[15]);
    1949     1575190 : }
    1950             : // iadst16 stage 6: applies btf_16_sse2 in place to the pairs x[4..5], x[6..7], x[12..13], x[14..15].
    1951      854240 : static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
    1952             :     const __m128i __rounding,
    1953             :     int8_t cos_bit) {  // NOTE(review): __rounding/cos_bit are not named in the body; presumably btf_16_sse2 (macro defined outside this chunk) reads them -- confirm
    1954      854240 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    1955      854240 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    1956      854240 :     const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    1957    13667800 :     btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
    1958    13667800 :     btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
    1959    13667800 :     btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);  // same two constant pairs reused for the upper half
    1960    13667800 :     btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
    1961      854240 : }
    1962             : // iadst16 stage 7: applies btf_16_adds_subs_sse2 in place to pairs two apart within each group of four entries of x[0..15].
    1963     1575200 : static INLINE void iadst16_stage7_ssse3(__m128i *x) {
    1964     3150410 :     btf_16_adds_subs_sse2(x[0], x[2]);  // group x[0..3]
    1965     3150410 :     btf_16_adds_subs_sse2(x[1], x[3]);
    1966     3150410 :     btf_16_adds_subs_sse2(x[4], x[6]);  // group x[4..7]
    1967     3150410 :     btf_16_adds_subs_sse2(x[5], x[7]);
    1968     3150410 :     btf_16_adds_subs_sse2(x[8], x[10]);  // group x[8..11]
    1969     3150410 :     btf_16_adds_subs_sse2(x[9], x[11]);
    1970     3150410 :     btf_16_adds_subs_sse2(x[12], x[14]);  // group x[12..15]
    1971     3150410 :     btf_16_adds_subs_sse2(x[13], x[15]);
    1972     1575200 : }
    1973             : // iadst16 stage 8: applies btf_16_sse2 with the cospi[32] constant pairs, in place, to x[2..3], x[6..7], x[10..11], x[14..15].
    1974      854278 : static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
    1975             :     const __m128i __rounding,
    1976             :     int8_t cos_bit) {  // NOTE(review): __rounding/cos_bit are not named in the body; presumably btf_16_sse2 (macro defined outside this chunk) reads them -- confirm
    1977      854278 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    1978      854278 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    1979    13668400 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
    1980    13668400 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
    1981    13668400 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
    1982    13668400 :     btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
    1983      854278 : }
    1984             : // iadst16 stage 9: copies x[0..15] into output[0..15] in permuted order, negating every odd-indexed output.
    1985     1575230 : static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
    1986     1575230 :     const __m128i __zero = _mm_setzero_si128();  // minuend for the saturating negations below
    1987     1575230 :     output[0] = x[0];
    1988     1575230 :     output[1] = _mm_subs_epi16(__zero, x[8]);  // saturating 0 - x on 16-bit lanes
    1989     1575230 :     output[2] = x[12];
    1990     1575230 :     output[3] = _mm_subs_epi16(__zero, x[4]);
    1991     1575230 :     output[4] = x[6];
    1992     1575230 :     output[5] = _mm_subs_epi16(__zero, x[14]);
    1993     1575230 :     output[6] = x[10];
    1994     1575230 :     output[7] = _mm_subs_epi16(__zero, x[2]);
    1995     1575230 :     output[8] = x[3];
    1996     1575230 :     output[9] = _mm_subs_epi16(__zero, x[11]);
    1997     1575230 :     output[10] = x[15];
    1998     1575230 :     output[11] = _mm_subs_epi16(__zero, x[7]);
    1999     1575230 :     output[12] = x[5];
    2000     1575230 :     output[13] = _mm_subs_epi16(__zero, x[13]);
    2001     1575230 :     output[14] = x[9];
    2002     1575230 :     output[15] = _mm_subs_epi16(__zero, x[1]);
    2003     1575230 : }
    2004             : // 16-point inverse ADST variant that reads only input[0] and writes output[0..15].
    2005          42 : static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
    2006             :     int8_t cos_bit) {
    2007             :     (void)cos_bit;  // cos_bit is still forwarded to iadst16_stage8_ssse3 at the end of this function
    2008          42 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    2009          42 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    2010             : 
    2011          42 :     const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    2012          42 :     const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    2013          42 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    2014          42 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    2015             : 
    2016             :     // stage 1
    2017             :     __m128i x[16];
    2018          42 :     x[1] = input[0];  // only input[0] is loaded; NOTE(review): presumably the caller guarantees the other coefficients are zero -- confirm
    2019             : 
    2020             :     // stage 2
    2021         168 :     btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);  // x[0] is first assigned here; NOTE(review): assumes the macro's last two arguments are outputs -- confirm
    2022             : 
    2023             :     // stage 3
    2024          42 :     x[8] = x[0];  // plain copies in this variant
    2025          42 :     x[9] = x[1];
    2026             : 
    2027             :     // stage 4
    2028         672 :     btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
    2029             : 
    2030             :     // stage 5
    2031          42 :     x[4] = x[0];  // plain copies in this variant
    2032          42 :     x[5] = x[1];
    2033          42 :     x[12] = x[8];
    2034          42 :     x[13] = x[9];
    2035             : 
    2036             :     // stage 6
    2037         672 :     btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
    2038         672 :     btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
    2039             : 
    2040             :     // stage 7
    2041          42 :     x[2] = x[0];  // plain copies; after this every entry of x[0..15] has been assigned
    2042          42 :     x[3] = x[1];
    2043          42 :     x[6] = x[4];
    2044          42 :     x[7] = x[5];
    2045          42 :     x[10] = x[8];
    2046          42 :     x[11] = x[9];
    2047          42 :     x[14] = x[12];
    2048          42 :     x[15] = x[13];
    2049             : 
    2050          42 :     iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);  // stages 8 and 9 are shared with the other iadst16 variants
    2051          42 :     iadst16_stage9_ssse3(output, x);
    2052          42 : }
    2053             : // 16-point inverse ADST variant that reads only input[0..7] and writes output[0..15].
    2054         252 : static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
    2055             :     int8_t cos_bit) {
    2056             :     (void)cos_bit;  // cos_bit is still forwarded to the stage 4/6/8 helpers below
    2057         252 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    2058         252 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    2059             : 
    2060             :     // stage 1
    2061             :     __m128i x[16];  // only eight entries are loaded here; the other eight are first assigned in stage 2
    2062         252 :     x[1] = input[0];
    2063         252 :     x[3] = input[2];
    2064         252 :     x[5] = input[4];
    2065         252 :     x[7] = input[6];
    2066         252 :     x[8] = input[7];
    2067         252 :     x[10] = input[5];
    2068         252 :     x[12] = input[3];
    2069         252 :     x[14] = input[1];
    2070             : 
    2071             :     // stage 2
    2072        1008 :     btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);  // NOTE(review): assumes the macro's third argument is the input and its last two are outputs -- confirm
    2073        1008 :     btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
    2074        1008 :     btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
    2075        1008 :     btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
    2076        1008 :     btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
    2077        1008 :     btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
    2078        1008 :     btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
    2079        1008 :     btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
    2080             : 
    2081             :     // stage 3~9
    2082         252 :     iadst16_stage3_ssse3(x);
    2083         252 :     iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
    2084         252 :     iadst16_stage5_ssse3(x);
    2085         252 :     iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
    2086         252 :     iadst16_stage7_ssse3(x);
    2087         252 :     iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
    2088         252 :     iadst16_stage9_ssse3(output, x);
    2089         252 : }
    2090             : 
    2091      853967 : static void iadst16_new_sse2(const __m128i *input, __m128i *output,
    2092             :     int8_t cos_bit) {
    2093             :     (void)cos_bit;
    2094      853967 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    2095      853967 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    2096      853967 :     const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
    2097      853967 :     const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
    2098      853967 :     const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
    2099      853967 :     const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
    2100      853967 :     const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
    2101      853967 :     const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
    2102      853967 :     const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
    2103      853967 :     const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
    2104      853967 :     const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
    2105      853967 :     const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
    2106      853967 :     const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
    2107      853967 :     const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
    2108      853967 :     const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
    2109      853967 :     const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
    2110      853967 :     const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
    2111      853967 :     const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
    2112             : 
    2113             :     // stage 1
    2114             :     __m128i x[16];
    2115      853967 :     x[0] = input[15];
    2116      853967 :     x[1] = input[0];
    2117      853967 :     x[2] = input[13];
    2118      853967 :     x[3] = input[2];
    2119      853967 :     x[4] = input[11];
    2120      853967 :     x[5] = input[4];
    2121      853967 :     x[6] = input[9];
    2122      853967 :     x[7] = input[6];
    2123      853967 :     x[8] = input[7];
    2124      853967 :     x[9] = input[8];
    2125      853967 :     x[10] = input[5];
    2126      853967 :     x[11] = input[10];
    2127      853967 :     x[12] = input[3];
    2128      853967 :     x[13] = input[12];
    2129      853967 :     x[14] = input[1];
    2130      853967 :     x[15] = input[14];
    2131             : 
    2132             :     // stage 2
    2133    13663500 :     btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
    2134    13663500 :     btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
    2135    13663500 :     btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
    2136    13663500 :     btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
    2137    13663500 :     btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
    2138    13663500 :     btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
    2139    13663500 :     btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
    2140    13663500 :     btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
    2141             : 
    2142             :     // stage 3~9
    2143      853967 :     iadst16_stage3_ssse3(x);
    2144      853985 :     iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
    2145      853983 :     iadst16_stage5_ssse3(x);
    2146      853989 :     iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
    2147      853985 :     iadst16_stage7_ssse3(x);
    2148      853986 :     iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
    2149      853984 :     iadst16_stage9_ssse3(output, x);
    2150      853983 : }
    2151             : 
    2152      720996 : static void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
    2153             :     int8_t cos_bit) {
    2154             :     (void)cos_bit;
    2155      720996 :     const int32_t *cospi = cospi_arr(INV_COS_BIT);
    2156      720993 :     const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    2157             : 
    2158      720993 :     const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
    2159      720993 :     const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
    2160      720993 :     const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
    2161      720993 :     const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
    2162      720993 :     const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
    2163      720993 :     const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
    2164      720993 :     const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
    2165      720993 :     const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
    2166      720993 :     const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
    2167      720993 :     const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
    2168      720993 :     const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
    2169      720993 :     const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
    2170      720993 :     const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
    2171      720993 :     const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
    2172      720993 :     const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
    2173      720993 :     const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
    2174      720993 :     const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    2175      720993 :     const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    2176      720993 :     const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    2177      720993 :     const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    2178      720993 :     const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
    2179      720993 :     const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
    2180      720993 :     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    2181      720993 :     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    2182      720993 :     const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    2183      720993 :     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    2184      720993 :     const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    2185             : 
    2186             :     // stage 1
    2187             :     __m128i x[16];
    2188      720993 :     x[0] = input[15];
    2189      720993 :     x[1] = input[0];
    2190      720993 :     x[2] = input[13];
    2191      720993 :     x[3] = input[2];
    2192      720993 :     x[4] = input[11];
    2193      720993 :     x[5] = input[4];
    2194      720993 :     x[6] = input[9];
    2195      720993 :     x[7] = input[6];
    2196      720993 :     x[8] = input[7];
    2197      720993 :     x[9] = input[8];
    2198      720993 :     x[10] = input[5];
    2199      720993 :     x[11] = input[10];
    2200      720993 :     x[12] = input[3];
    2201      720993 :     x[13] = input[12];
    2202      720993 :     x[14] = input[1];
    2203      720993 :     x[15] = input[14];
    2204             : 
    2205             :     // stage 2
    2206     6488940 :     btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
    2207     6488940 :     btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
    2208     6488940 :     btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
    2209     6488940 :     btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
    2210     6488940 :     btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
    2211     6488940 :     btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
    2212     6488940 :     btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
    2213     6488940 :     btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
    2214             : 
    2215             :     // stage 3
    2216      720993 :     iadst16_stage3_ssse3(x);
    2217             : 
    2218             :     // stage 4
    2219     6489040 :     btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
    2220     6489040 :     btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
    2221     6489040 :     btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
    2222     6489040 :     btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
    2223             : 
    2224             :     // stage 5
    2225      721004 :     iadst16_stage5_ssse3(x);
    2226             : 
    2227             :     // stage 6
    2228     6489040 :     btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
    2229     6489040 :     btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
    2230     6489040 :     btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
    2231     6489040 :     btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
    2232             : 
    2233             :     // stage 7
    2234      721004 :     iadst16_stage7_ssse3(x);
    2235             : 
    2236             :     // stage 8
    2237     6489030 :     btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
    2238     6489030 :     btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
    2239     6489030 :     btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
    2240     6489030 :     btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
    2241             : 
    2242             :     // stage 9
    2243      721003 :     iadst16_stage9_ssse3(output, x);
    2244      721000 : }
    2245             : 
    2246     5108550 : static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
    2247             :     int8_t cos_bit) {
    2248             :     (void)cos_bit;
    2249     5108550 :     const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
    2250     5108550 :     const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
    2251    25542500 :     for (int32_t i = 0; i < 4; ++i) {
    2252    20434000 :         __m128i x = _mm_mulhrs_epi16(input[i], scale);
    2253    40868000 :         output[i] = _mm_adds_epi16(x, input[i]);
    2254             :     }
    2255     5108550 : }
    2256             : 
    2257      391909 : static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
    2258             :     int8_t cos_bit) {
    2259             :     (void)cos_bit;
    2260     3527180 :     for (int32_t i = 0; i < 8; ++i)
    2261     6270550 :         output[i] = _mm_adds_epi16(input[i], input[i]);
    2262      391909 : }
    2263             : 
    2264      359808 : static void iidentity16_new_ssse3(const __m128i *input, __m128i *output,
    2265             :     int8_t cos_bit) {
    2266             :     (void)cos_bit;
    2267      359808 :     const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
    2268      359808 :     const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
    2269     6116510 :     for (int32_t i = 0; i < 16; ++i) {
    2270     5756700 :         __m128i x = _mm_mulhrs_epi16(input[i], scale);
    2271     5756700 :         __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
    2272    11513400 :         output[i] = _mm_adds_epi16(x, srcx2);
    2273             :     }
    2274      359808 : }
    2275             : 
    2276   155549000 : static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
    2277             :     __m128i res) {
    2278   155549000 :     const __m128i zero = _mm_setzero_si128();
    2279   311098000 :     __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
    2280   155549000 :     return _mm_packus_epi16(x0, x0);
    2281             : }
    2282             : 
    2283    16968400 : static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in,
    2284             :     uint8_t *output_r, int32_t stride_r,
    2285             :     uint8_t *output_w, int32_t stride_w,
    2286             :     int32_t flipud, const int32_t height) {
    2287    16968400 :     int32_t j = flipud ? (height - 1) : 0;
    2288    16968400 :     const int32_t step = flipud ? -1 : 1;
    2289    16968400 :     const __m128i zero = _mm_setzero_si128();
    2290   119118000 :     for (int32_t i = 0; i < height; ++i, j += step) {
    2291   204299000 :         const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output_r + i * stride_r)));
    2292   204299000 :         __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
    2293   102149000 :         u = _mm_packus_epi16(u, zero);
    2294   102149000 :         *((uint32_t *)(output_w + i * stride_w)) = _mm_cvtsi128_si32(u);
    2295             :     }
    2296    16968400 : }
    2297             : 
    2298    17268100 : static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in,
    2299             :     uint8_t *output_r, int32_t stride_r,
    2300             :     uint8_t *output_w, int32_t stride_w,
    2301             :     int32_t flipud, const int32_t height) {
    2302    17268100 :     int32_t j = flipud ? (height - 1) : 0;
    2303    17268100 :     const int32_t step = flipud ? -1 : 1;
    2304   162120000 :     for (int32_t i = 0; i < height; ++i, j += step) {
    2305   144860000 :         const __m128i v = _mm_loadl_epi64((__m128i const *)(output_r + i * stride_r));
    2306   144860000 :         const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
    2307   144852000 :         _mm_storel_epi64((__m128i *)(output_w + i * stride_w), u);
    2308             :     }
    2309    17260300 : }
    2310             : 
    2311             : // 1D functions process process 8 pixels at one time.
    2312             : static const transform_1d_ssse3
    2313             : lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
    2314             :     { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 },
    2315             :     { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
    2316             :     { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 },
    2317             :     { idct32_new_sse2, NULL, NULL },
    2318             :     { idct64_low32_new_ssse3, NULL, NULL },
    2319             : };
    2320             : 
    2321             : // functions for blocks with eob at DC and within
    2322             : // topleft 8x8, 16x16, 32x32 corner
    2323             : static const transform_1d_ssse3
    2324             : lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
    2325             :     {
    2326             :         { idct4_new_sse2, idct4_new_sse2, NULL, NULL },
    2327             :         { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL },
    2328             :         { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL },
    2329             :     },
    2330             :     {
    2331             :         { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL },
    2332             :         { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL },
    2333             :         { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } },
    2334             :     {
    2335             :         { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2,
    2336             :         NULL },
    2337             :         { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2,
    2338             :         NULL },
    2339             :         { NULL, NULL, NULL, NULL },
    2340             :     },
    2341             :     {
    2342             :         { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3,
    2343             :           idct32_new_sse2 },
    2344             :         { NULL, NULL, NULL, NULL },
    2345             :         { NULL, NULL, NULL, NULL } },
    2346             :     {
    2347             :         { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3,
    2348             :           idct64_low32_new_ssse3 },
    2349             :         { NULL, NULL, NULL, NULL },
    2350             :         { NULL, NULL, NULL, NULL } }
    2351             : };
    2352             : 
    2353             : // 1D functions process process 4 pixels at one time.
    2354             : // used in 4x4, 4x8, 4x16, 8x4, 16x4
    2355             : static const transform_1d_ssse3
    2356             : lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
    2357             :     { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 },
    2358             :     { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 },
    2359             :     { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 },
    2360             :     { NULL, NULL, NULL },
    2361             :     { NULL, NULL, NULL },
    2362             : };
    2363             : 
    2364     2157420 : static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
    2365             :     int32_t stride, int32_t shift, int32_t height,
    2366             :     int32_t txw_idx, int32_t rect_type) {
    2367     2157420 :     const int32_t *input_row = input;
    2368     2157420 :     const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
    2369     2157420 :     const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
    2370     2157420 :         (1 << (NewSqrt2Bits - shift - 1)));
    2371     2157420 :     const __m128i one = _mm_set1_epi16(1);
    2372     2157420 :     const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
    2373     2157420 :     if (rect_type != 1 && rect_type != -1) {
    2374    14593400 :         for (int32_t i = 0; i < height; ++i) {
    2375    13019600 :             const __m128i src = load_32bit_to_16bit(input_row);
    2376    13019200 :             input_row += stride;
    2377    13019200 :             __m128i lo = _mm_unpacklo_epi16(src, one);
    2378    13019200 :             __m128i hi = _mm_unpackhi_epi16(src, one);
    2379    13019200 :             lo = _mm_madd_epi16(lo, scale_rounding);
    2380    13019200 :             hi = _mm_madd_epi16(hi, scale_rounding);
    2381    13019200 :             lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
    2382    13019200 :             hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
    2383    26038400 :             out[i] = _mm_packs_epi32(lo, hi);
    2384             :         }
    2385             :     }
    2386             :     else {
    2387             :         const __m128i rect_scale =
    2388      583174 :             _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
    2389     6845620 :         for (int32_t i = 0; i < height; ++i) {
    2390     6262650 :             __m128i src = load_32bit_to_16bit(input_row);
    2391     6262450 :             src = _mm_mulhrs_epi16(src, rect_scale);
    2392     6262450 :             input_row += stride;
    2393     6262450 :             __m128i lo = _mm_unpacklo_epi16(src, one);
    2394     6262450 :             __m128i hi = _mm_unpackhi_epi16(src, one);
    2395     6262450 :             lo = _mm_madd_epi16(lo, scale_rounding);
    2396     6262450 :             hi = _mm_madd_epi16(hi, scale_rounding);
    2397     6262450 :             lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
    2398     6262450 :             hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
    2399    12524900 :             out[i] = _mm_packs_epi32(lo, hi);
    2400             :         }
    2401             :     }
    2402     2156820 : }
    2403             : 
    2404     2436890 : static INLINE void iidentity_col_8xn_ssse3(
    2405             :     uint8_t *output_r, int32_t stride_r,
    2406             :     uint8_t *output_w, int32_t stride_w,
    2407             :     __m128i *buf, int32_t shift, int32_t height,
    2408             :     int32_t txh_idx) {
    2409     2436890 :     const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
    2410     2436890 :     const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
    2411     4873790 :     const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
    2412     2436890 :     const __m128i one = _mm_set1_epi16(1);
    2413     2436890 :     const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
    2414     2436890 :     const __m128i zero = _mm_setzero_si128();
    2415    22922900 :     for (int32_t h = 0; h < height; ++h) {
    2416    20486100 :         __m128i lo = _mm_unpacklo_epi16(buf[h], one);
    2417    40972100 :         __m128i hi = _mm_unpackhi_epi16(buf[h], one);
    2418    20486100 :         lo = _mm_madd_epi16(lo, scale_coeff);
    2419    20486100 :         hi = _mm_madd_epi16(hi, scale_coeff);
    2420    20486100 :         lo = _mm_srai_epi32(lo, NewSqrt2Bits);
    2421    40972100 :         hi = _mm_srai_epi32(hi, NewSqrt2Bits);
    2422    20486100 :         lo = _mm_add_epi32(lo, shift_rounding);
    2423    20486100 :         hi = _mm_add_epi32(hi, shift_rounding);
    2424    20486100 :         lo = _mm_srai_epi32(lo, -shift);
    2425    40972100 :         hi = _mm_srai_epi32(hi, -shift);
    2426    20486100 :         __m128i x = _mm_packs_epi32(lo, hi);
    2427             : 
    2428    20486100 :         const __m128i pred = _mm_loadl_epi64((__m128i const *)(output_r));
    2429    40972100 :         x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
    2430    20486100 :         const __m128i u = _mm_packus_epi16(x, x);
    2431    20486100 :         _mm_storel_epi64((__m128i *)(output_w), u);
    2432    20486100 :         output_r += stride_r;
    2433    20486100 :         output_w += stride_w;
    2434             :     }
    2435     2436890 : }
    2436             : 
    2437      819274 : static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
    2438             :     uint8_t *output_r, int32_t stride_r,
    2439             :     uint8_t *output_w, int32_t stride_w,
    2440             :     TxSize tx_size) {
    2441      819274 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    2442      819274 :     const int32_t txw_idx = get_txw_idx(tx_size);
    2443      819271 :     const int32_t txh_idx = get_txh_idx(tx_size);
    2444      819274 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    2445      819274 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    2446      819274 :     const int32_t input_stride = AOMMIN(32, txfm_size_col);
    2447      819274 :     const int32_t row_max = AOMMIN(32, txfm_size_row);
    2448      819274 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    2449             :     __m128i buf[32];
    2450             : 
    2451     1761160 :     for (int32_t i = 0; i < (input_stride >> 3); ++i) {
    2452      941884 :         iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max,
    2453             :             txw_idx, rect_type);
    2454      941883 :         iidentity_col_8xn_ssse3(
    2455      941883 :             output_r + 8 * i, stride_r,
    2456      941883 :             output_w + 8 * i, stride_w,
    2457      941883 :             buf, shift[1], row_max,
    2458             :             txh_idx);
    2459             :     }
    2460      819272 : }
    2461             : 
    2462    12118300 : static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
    2463             :     uint8_t *output_r, int32_t stride_r,
    2464             :     uint8_t *output_w, int32_t stride_w,
    2465             :     TxType tx_type, TxSize tx_size_, int32_t eob) {
    2466             :     (void)tx_size_;
    2467             :     (void)eob;
    2468             :     __m128i buf[4];
    2469    12118300 :     const TxSize tx_size = TX_4X4;
    2470    12118300 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    2471    12118300 :     const int32_t txw_idx = get_txw_idx(tx_size);
    2472    12118100 :     const int32_t txh_idx = get_txh_idx(tx_size);
    2473    12118300 :     const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    2474    12118300 :     const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    2475    12118300 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    2476    12118300 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    2477             : 
    2478    12118300 :     const transform_1d_ssse3 row_txfm =
    2479    12118300 :         lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
    2480    12118300 :     const transform_1d_ssse3 col_txfm =
    2481    12118300 :         lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
    2482             : 
    2483             :     int32_t ud_flip, lr_flip;
    2484    12118300 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    2485    12117900 :     load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
    2486    12117600 :     transpose_16bit_4x4(buf, buf);
    2487    12118000 :     row_txfm(buf, buf, cos_bit_row);
    2488    12120400 :     if (lr_flip) {
    2489             :         __m128i temp[4];
    2490      690087 :         flip_buf_sse2(buf, temp, txfm_size_col);
    2491      690081 :         transpose_16bit_4x4(temp, buf);
    2492             :     }
    2493             :     else
    2494    11430300 :         transpose_16bit_4x4(buf, buf);
    2495    12119800 :     col_txfm(buf, buf, cos_bit_col);
    2496    12120500 :     round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
    2497    12119900 :     lowbd_write_buffer_4xn_sse2(buf, output_r, stride_r, output_w, stride_w,
    2498             :         ud_flip, txfm_size_row);
    2499    12120200 : }
    2500             : 
    2501    26449600 : static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
    2502             :     __m128i res0, __m128i res1) {
    2503    26449600 :     const __m128i zero = _mm_setzero_si128();
    2504    26449600 :     __m128i x0 = _mm_unpacklo_epi8(pred, zero);
    2505    26449600 :     __m128i x1 = _mm_unpackhi_epi8(pred, zero);
    2506    26449600 :     x0 = _mm_adds_epi16(res0, x0);
    2507    26449600 :     x1 = _mm_adds_epi16(res1, x1);
    2508    26449600 :     return _mm_packus_epi16(x0, x1);
    2509             : }
    2510             : 
    2511     3307310 : static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in,
    2512             :     uint8_t *output_r, int32_t stride_r, uint8_t *output_w, int32_t stride_w,
    2513             :     int32_t flipud,
    2514             :     int32_t height) {
    2515     3307310 :     int32_t j = flipud ? (height - 1) : 0;
    2516     3307310 :     const int32_t step = flipud ? -1 : 1;
    2517    29756600 :     for (int32_t i = 0; i < height; ++i, j += step) {
    2518    26449300 :         __m128i v = _mm_loadu_si128((__m128i const *)(output_r + i * stride_r));
    2519    26449300 :         __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
    2520    26449200 :         _mm_storeu_si128((__m128i *)(output_w + i * stride_w), u);
    2521             :     }
    2522     3307300 : }
    2523             : 
    2524    12994000 : static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
    2525             :     int32_t size) {
    2526    12994000 :     const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
    2527   122334000 :     for (int32_t i = 0; i < size; ++i)
    2528   218679000 :         output[i] = _mm_mulhrs_epi16(input[i], scale);
    2529    12994000 : }
    2530             : 
    2531    13153500 : static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
    2532             :     const int32_t *input,
    2533             :     uint8_t *output_r, int32_t stride_r,
    2534             :     uint8_t *output_w, int32_t stride_w,
    2535             :     TxType tx_type,
    2536             :     TxSize tx_size, int32_t eob) {
    2537             :     __m128i buf1[64 * 8];
    2538             :     int32_t eobx, eoby;
    2539    13153500 :     get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
    2540    13152900 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    2541    13152900 :     const int32_t txw_idx = get_txw_idx(tx_size);
    2542    13152300 :     const int32_t txh_idx = get_txh_idx(tx_size);
    2543    13151900 :     const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    2544    13151900 :     const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    2545    13151900 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    2546    13151900 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    2547    13151900 :     const int32_t buf_size_w_div8 = txfm_size_col >> 3;
    2548    13151900 :     const int32_t buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
    2549    13151900 :     const int32_t buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
    2550    13151900 :     const int32_t input_stride = AOMMIN(32, txfm_size_col);
    2551    13151900 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    2552             :     ASSERT(eobx < 32);
    2553             :     ASSERT(eoby < 32);
    2554    13151800 :     const int32_t fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
    2555    13151800 :     const int32_t fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
    2556    13151800 :     const transform_1d_ssse3 row_txfm =
    2557    13151800 :         lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
    2558    13151800 :     const transform_1d_ssse3 col_txfm =
    2559    13151800 :         lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
    2560             : 
    2561    13151800 :     assert(col_txfm != NULL);
    2562    13151800 :     assert(row_txfm != NULL);
    2563             :     int32_t ud_flip, lr_flip;
    2564    13151800 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    2565    30578400 :     for (int32_t i = 0; i < buf_size_nonzero_h_div8; i++) {
    2566             :         __m128i buf0[64];
    2567    17418800 :         const int32_t *input_row = input + i * input_stride * 8;
    2568    38777600 :         for (int32_t j = 0; j < buf_size_nonzero_w_div8; ++j) {
    2569    21354200 :             __m128i *buf0_cur = buf0 + j * 8;
    2570    21354200 :             load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
    2571    21341700 :             transpose_16bit_8x8(buf0_cur, buf0_cur);
    2572             :         }
    2573    17423400 :         if (rect_type == 1 || rect_type == -1)
    2574     6701580 :             round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
    2575    17423700 :         row_txfm(buf0, buf0, cos_bit_row);
    2576    17422400 :         round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    2577    17420600 :         __m128i *_buf1 = buf1 + i * 8;
    2578    17420600 :         if (lr_flip) {
    2579     1714570 :             for (int32_t j = 0; j < buf_size_w_div8; ++j) {
    2580             :                 __m128i temp[8];
    2581      899330 :                 flip_buf_sse2(buf0 + 8 * j, temp, 8);
    2582      899326 :                 transpose_16bit_8x8(temp,
    2583      899326 :                     _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
    2584             :             }
    2585             :         }
    2586             :         else {
    2587    37067300 :             for (int32_t j = 0; j < buf_size_w_div8; ++j)
    2588    20456700 :                 transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
    2589             :         }
    2590             :     }
    2591    30245100 :     for (int32_t i = 0; i < buf_size_w_div8; i++) {
    2592    17092400 :         col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
    2593    17091600 :         round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
    2594             :     }
    2595             : 
    2596    13152700 :     if (txfm_size_col >= 16) {
    2597     5983240 :         for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
    2598     3307320 :             lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
    2599     3307320 :                 output_r + 16 * i, stride_r,
    2600     3307320 :                 output_w + 16 * i, stride_w,
    2601             :                 ud_flip, txfm_size_row);
    2602             :         }
    2603             :     }
    2604    10476700 :     else if (txfm_size_col == 8)
    2605    10479100 :         lowbd_write_buffer_8xn_sse2(buf1, output_r, stride_r, output_w, stride_w,
    2606             :             ud_flip, txfm_size_row);
    2607    13152700 : }
    2608             : 
    2609     1090210 : static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
    2610             :     const int32_t *input,
    2611             :     uint8_t *output_r, int32_t stride_r,
    2612             :     uint8_t *output_w, int32_t stride_w,
    2613             :     TxType tx_type,
    2614             :     TxSize tx_size, int32_t eob) {
    2615     1090210 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    2616             :     int32_t eobx, eoby;
    2617     1090210 :     get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
    2618     1090210 :     const int32_t txw_idx = get_txw_idx(tx_size);
    2619     1090200 :     const int32_t txh_idx = get_txh_idx(tx_size);
    2620     1090200 :     const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    2621     1090200 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    2622     1090200 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    2623     1090200 :     const int32_t buf_size_w_div8 = (eobx + 8) >> 3;
    2624     1090200 :     const int32_t input_stride = AOMMIN(32, txfm_size_col);
    2625     1090200 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    2626             : 
    2627     1090200 :     const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
    2628             :     ASSERT(fun_idx < 4);
    2629     1090200 :     const transform_1d_ssse3 col_txfm =
    2630     1090200 :         lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
    2631             : 
    2632     1090200 :     assert(col_txfm != NULL);
    2633             : 
    2634             :     int32_t ud_flip, lr_flip;
    2635     1090200 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    2636     2305750 :     for (int32_t i = 0; i < buf_size_w_div8; i++) {
    2637             :         __m128i buf0[64];
    2638     1215610 :         iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
    2639             :             eoby + 1, txw_idx, rect_type);
    2640     1215620 :         col_txfm(buf0, buf0, cos_bit_col);
    2641     1215630 :         __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
    2642     1215630 :         int32_t k = ud_flip ? (txfm_size_row - 1) : 0;
    2643     1215630 :         const int32_t step = ud_flip ? -1 : 1;
    2644     1215630 :         uint8_t *out_r = output_r + 8 * i;
    2645     1215630 :         uint8_t *out_w = output_w + 8 * i;
    2646    11963700 :         for (int32_t j = 0; j < txfm_size_row; ++j, k += step) {
    2647    10748100 :             const __m128i v = _mm_loadl_epi64((__m128i const *)(out_r));
    2648             :             ASSERT(k >= 0);
    2649    10748100 :             __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
    2650    10748100 :             const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
    2651    10748100 :             _mm_storel_epi64((__m128i *)(out_w), u);
    2652    10748100 :             out_r += stride_r;
    2653    10748100 :             out_w += stride_w;
    2654             :         }
    2655             :     }
    2656     1090140 : }
    2657             : 
    2658     1229030 : static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
    2659             :     const int32_t *input,
    2660             :     uint8_t *output_r, int32_t stride_r,
    2661             :     uint8_t *output_w, int32_t stride_w,
    2662             :     TxType tx_type, TxSize tx_size, int32_t eob) {
    2663             :     __m128i buf1[64];
    2664             :     int32_t eobx, eoby;
    2665     1229030 :     get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
    2666     1229030 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    2667     1229030 :     const int32_t txw_idx = get_txw_idx(tx_size);
    2668     1229030 :     const int32_t txh_idx = get_txh_idx(tx_size);
    2669     1229030 :     const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    2670     1229030 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    2671     1229030 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    2672     1229030 :     const int32_t buf_size_w_div8 = txfm_size_col >> 3;
    2673     1229030 :     const int32_t buf_size_h_div8 = (eoby + 8) >> 3;
    2674     1229030 :     const int32_t input_stride = AOMMIN(32, txfm_size_col);
    2675     1229030 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    2676             : 
    2677     1229030 :     const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
    2678     1229030 :     const transform_1d_ssse3 row_txfm =
    2679     1229030 :         lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
    2680             : 
    2681     1229030 :     assert(row_txfm != NULL);
    2682             :     int32_t ud_flip, lr_flip;
    2683     1229030 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    2684     2603450 :     for (int32_t i = 0; i < buf_size_h_div8; i++) {
    2685             :         __m128i buf0[64];
    2686     1374380 :         const int32_t *input_row = input + i * input_stride * 8;
    2687     2869460 :         for (int32_t j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
    2688     1495070 :             __m128i *buf0_cur = buf0 + j * 8;
    2689     1495070 :             load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
    2690     1494970 :             transpose_16bit_8x8(buf0_cur, buf0_cur);
    2691             :         }
    2692     1374390 :         if (rect_type == 1 || rect_type == -1)
    2693      411458 :             round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
    2694     1374390 :         row_txfm(buf0, buf0, cos_bit_row);
    2695     1374400 :         round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    2696     1374400 :         __m128i *_buf1 = buf1;
    2697     1374400 :         if (lr_flip) {
    2698      451847 :             for (int32_t j = 0; j < buf_size_w_div8; ++j) {
    2699             :                 __m128i temp[8];
    2700      239061 :                 flip_buf_sse2(buf0 + 8 * j, temp, 8);
    2701      239060 :                 transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
    2702             :             }
    2703             :         }
    2704             :         else {
    2705     2417660 :             for (int32_t j = 0; j < buf_size_w_div8; ++j)
    2706     1256040 :                 transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
    2707             :         }
    2708             : 
    2709     2869520 :         for (int32_t j = 0; j < buf_size_w_div8; ++j) {
    2710     1495100 :             iidentity_col_8xn_ssse3(
    2711     1495100 :                 output_r + i * 8 * stride_r + j * 8, stride_r,
    2712     1495100 :                 output_w + i * 8 * stride_w + j * 8, stride_w,
    2713     1495100 :                 buf1 + j * 8, shift[1], 8, txh_idx);
    2714             :         }
    2715             :     }
    2716     1229070 : }
    2717             : 
    2718             : // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
    2719    16289700 : static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
    2720             :     const int32_t *input,
    2721             :     uint8_t *output_r, int32_t stride_r,
    2722             :     uint8_t *output_w, int32_t stride_w,
    2723             :     TxType tx_type,
    2724             :     TxSize tx_size, int32_t eob) {
    2725    16289700 :     switch (tx_type) {
    2726     8470760 :     case DCT_DCT:
    2727     8470760 :         lowbd_inv_txfm2d_add_no_identity_ssse3(input,
    2728             :             output_r, stride_r, output_w, stride_w,
    2729             :             tx_type, tx_size, eob);
    2730     8470410 :         break;
    2731      819278 :     case IDTX:
    2732      819278 :         lowbd_inv_txfm2d_add_idtx_ssse3(input,
    2733             :             output_r, stride_r, output_w, stride_w, tx_size);
    2734      819270 :         break;
    2735     1090220 :     case V_DCT:
    2736             :     case V_ADST:
    2737             :     case V_FLIPADST:
    2738     1090220 :         lowbd_inv_txfm2d_add_h_identity_ssse3(input,
    2739             :             output_r, stride_r, output_w, stride_w,
    2740             :             tx_type, tx_size, eob);
    2741     1090200 :         break;
    2742     1229040 :     case H_DCT:
    2743             :     case H_ADST:
    2744             :     case H_FLIPADST:
    2745     1229040 :         lowbd_inv_txfm2d_add_v_identity_ssse3(input,
    2746             :             output_r, stride_r, output_w, stride_w,
    2747             :             tx_type, tx_size, eob);
    2748     1229040 :         break;
    2749     4680440 :     default:
    2750     4680440 :         lowbd_inv_txfm2d_add_no_identity_ssse3(input,
    2751             :             output_r, stride_r, output_w, stride_w,
    2752             :             tx_type, tx_size, eob);
    2753     4685240 :         break;
    2754             :     }
    2755    16294200 : }
    2756             : 
    2757     2978360 : static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
    2758             :     uint8_t *output_r, int32_t stride_r,
    2759             :     uint8_t *output_w, int32_t stride_w,
    2760             :     TxType tx_type, TxSize tx_size_,
    2761             :     int32_t eob) {
    2762             :     (void)tx_size_;
    2763             :     (void)eob;
    2764             :     __m128i buf[8];
    2765     2978360 :     const TxSize tx_size = TX_4X8;
    2766     2978360 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    2767     2978360 :     const int32_t txw_idx = get_txw_idx(tx_size);
    2768     2978330 :     const int32_t txh_idx = get_txh_idx(tx_size);
    2769     2978350 :     const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    2770     2978350 :     const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    2771     2978350 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    2772     2978350 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    2773             : 
    2774     2978350 :     const transform_1d_ssse3 row_txfm =
    2775     2978350 :         lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
    2776     2978350 :     const transform_1d_ssse3 col_txfm =
    2777     2978350 :         lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
    2778             : 
    2779             :     int32_t ud_flip, lr_flip;
    2780     2978350 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    2781     2978340 :     load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
    2782     2978290 :     transpose_16bit_4x8(buf, buf);
    2783     2978430 :     round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
    2784     2978400 :     row_txfm(buf, buf, cos_bit_row);
    2785             :     // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
    2786     2978540 :     if (lr_flip) {
    2787             :         __m128i temp[4];
    2788       59145 :         flip_buf_sse2(buf, temp, txfm_size_col);
    2789       59145 :         transpose_16bit_8x4(temp, buf);
    2790             :     }
    2791             :     else
    2792     2919390 :         transpose_16bit_8x4(buf, buf);
    2793     2978500 :     col_txfm(buf, buf, cos_bit_col);
    2794     2978480 :     round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
    2795     2978350 :     lowbd_write_buffer_4xn_sse2(buf, output_r, stride_r, output_w, stride_w,
    2796             :         ud_flip, txfm_size_row);
    2797     2978430 : }
    2798             : 
    2799     2903780 : static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
    2800             :     uint8_t *output_r, int32_t stride_r,
    2801             :     uint8_t *output_w, int32_t stride_w,
    2802             :     TxType tx_type, TxSize tx_size_,
    2803             :     int32_t eob) {
    2804             :     (void)tx_size_;
    2805             :     (void)eob;
    2806             :     __m128i buf[8];
    2807     2903780 :     const TxSize tx_size = TX_8X4;
    2808     2903780 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    2809     2903780 :     const int32_t txw_idx = get_txw_idx(tx_size);
    2810     2903750 :     const int32_t txh_idx = get_txh_idx(tx_size);
    2811     2903760 :     const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    2812     2903760 :     const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    2813     2903760 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    2814     2903760 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    2815             : 
    2816     2903760 :     const transform_1d_ssse3 row_txfm =
    2817     2903760 :         lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
    2818     2903760 :     const transform_1d_ssse3 col_txfm =
    2819     2903760 :         lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
    2820             : 
    2821             :     int32_t ud_flip, lr_flip;
    2822     2903760 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    2823     2903750 :     load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
    2824     2903530 :     transpose_16bit_8x4(buf, buf);
    2825     2903750 :     round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
    2826     2903700 :     row_txfm(buf, buf, cos_bit_row);
    2827             :     // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
    2828     2903920 :     if (lr_flip) {
    2829             :         __m128i temp[8];
    2830       60587 :         flip_buf_sse2(buf, temp, txfm_size_col);
    2831       60587 :         transpose_16bit_4x8(temp, buf);
    2832             :     }
    2833             :     else
    2834     2843330 :         transpose_16bit_4x8(buf, buf);
    2835     2903910 :     col_txfm(buf, buf, cos_bit_col);
    2836     2903880 :     round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
    2837     2903830 :     lowbd_write_buffer_8xn_sse2(buf, output_r, stride_r, output_w, stride_w,
    2838             :         ud_flip, txfm_size_row);
    2839     2903770 : }
    2840             : 
    2841     1876100 : static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
    2842             :     uint8_t *output_r, int32_t stride_r,
    2843             :     uint8_t *output_w, int32_t stride_w,
    2844             :     TxType tx_type, TxSize tx_size_,
    2845             :     int32_t eob) {
    2846             :     (void)tx_size_;
    2847             :     (void)eob;
    2848             :     __m128i buf[16];
    2849     1876100 :     const TxSize tx_size = TX_4X16;
    2850     1876100 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    2851     1876100 :     const int32_t txw_idx = get_txw_idx(tx_size);
    2852     1876100 :     const int32_t txh_idx = get_txh_idx(tx_size);
    2853     1876110 :     const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    2854     1876110 :     const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    2855     1876110 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    2856     1876110 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    2857             : 
    2858     1876110 :     const transform_1d_ssse3 row_txfm =
    2859     1876110 :         lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
    2860     1876110 :     const transform_1d_ssse3 col_txfm =
    2861     1876110 :         lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
    2862             : 
    2863             :     int32_t ud_flip, lr_flip;
    2864     1876110 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    2865             : 
    2866     1875890 :     const int32_t row_one_loop = 8;
    2867     5628060 :     for (int32_t i = 0; i < 2; ++i) {
    2868     3751900 :         const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
    2869     3751900 :         __m128i *buf_cur = buf + i * row_one_loop;
    2870     3751900 :         load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
    2871             :             row_one_loop);
    2872     3751670 :         transpose_16bit_4x8(buf_cur, buf_cur);
    2873     3752050 :         row_txfm(buf_cur, buf_cur, cos_bit_row);
    2874     3752160 :         round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
    2875     3751960 :         if (lr_flip) {
    2876             :             __m128i temp[8];
    2877      110766 :             flip_buf_sse2(buf_cur, temp, txfm_size_col);
    2878      110766 :             transpose_16bit_8x4(temp, buf_cur);
    2879             :         }
    2880             :         else
    2881     3641200 :             transpose_16bit_8x4(buf_cur, buf_cur);
    2882             :     }
    2883     1876160 :     col_txfm(buf, buf, cos_bit_col);
    2884     1876140 :     round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
    2885     1876060 :     lowbd_write_buffer_4xn_sse2(buf, output_r, stride_r, output_w, stride_w,
    2886             :         ud_flip, txfm_size_row);
    2887     1876120 : }
    2888             : 
    2889     1945300 : static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
    2890             :     uint8_t *output_r, int32_t stride_r,
    2891             :     uint8_t *output_w, int32_t stride_w,
    2892             :     TxType tx_type, TxSize tx_size_, int32_t eob) {
    2893             :     (void)tx_size_;
    2894             :     (void)eob;
    2895             :     __m128i buf[16];
    2896     1945300 :     const TxSize tx_size = TX_16X4;
    2897     1945300 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    2898     1945300 :     const int32_t txw_idx = get_txw_idx(tx_size);
    2899     1945270 :     const int32_t txh_idx = get_txh_idx(tx_size);
    2900     1945280 :     const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    2901     1945280 :     const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    2902     1945280 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    2903     1945280 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    2904     1945280 :     const int32_t buf_size_w_div8 = txfm_size_col >> 3;
    2905             : 
    2906     1945280 :     const transform_1d_ssse3 row_txfm =
    2907     1945280 :         lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
    2908     1945280 :     const transform_1d_ssse3 col_txfm =
    2909     1945280 :         lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
    2910             : 
    2911             :     int32_t ud_flip, lr_flip;
    2912     1945280 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    2913     1945260 :     const int32_t row_one_loop = 8;
    2914     5835590 :     for (int32_t i = 0; i < buf_size_w_div8; ++i) {
    2915     3890330 :         const int32_t *input_cur = input + i * row_one_loop;
    2916     3890330 :         __m128i *buf_cur = buf + i * row_one_loop;
    2917     3890330 :         load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
    2918             :             txfm_size_row);
    2919     3890050 :         transpose_16bit_8x4(buf_cur, buf_cur);
    2920             :     }
    2921     1945260 :     row_txfm(buf, buf, cos_bit_row);
    2922     1945320 :     round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
    2923     1945260 :     if (lr_flip) {
    2924             :         __m128i temp[16];
    2925       58850 :         flip_buf_sse2(buf, temp, 16);
    2926       58850 :         transpose_16bit_4x8(temp, buf);
    2927       58850 :         transpose_16bit_4x8(temp + 8, buf + 8);
    2928             :     }
    2929             :     else {
    2930     1886410 :         transpose_16bit_4x8(buf, buf);
    2931     1886470 :         transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
    2932             :     }
    2933     5835710 :     for (int32_t i = 0; i < buf_size_w_div8; i++) {
    2934     3890410 :         col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
    2935     3890500 :         round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
    2936             :     }
    2937     1945300 :     lowbd_write_buffer_8xn_sse2(buf, output_r, stride_r, output_w, stride_w, ud_flip, 4);
    2938     1945260 :     lowbd_write_buffer_8xn_sse2(buf + 8, output_r + 8, stride_r, output_w + 8, stride_w, ud_flip, 4);
    2939     1945270 : }
    2940             : 
    2941    38080000 : void eb_av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input,
    2942             :     uint8_t *output_r, int32_t stride_r,
    2943             :     uint8_t *output_w, int32_t stride_w,
    2944             :     TxType tx_type,
    2945             :     TxSize tx_size, int32_t eob) {
    2946    38080000 :     switch (tx_size) {
    2947    12118700 :     case TX_4X4:
    2948    12118700 :         lowbd_inv_txfm2d_add_4x4_ssse3(input,
    2949             :             output_r, stride_r, output_w, stride_w,
    2950             :             tx_type, tx_size, eob);
    2951    12120200 :         break;
    2952     2978380 :     case TX_4X8:
    2953     2978380 :         lowbd_inv_txfm2d_add_4x8_ssse3(input,
    2954             :             output_r, stride_r, output_w, stride_w,
    2955             :             tx_type, tx_size, eob);
    2956     2978430 :         break;
    2957     2903810 :     case TX_8X4:
    2958     2903810 :         lowbd_inv_txfm2d_add_8x4_ssse3(input,
    2959             :             output_r, stride_r, output_w, stride_w,
    2960             :             tx_type, tx_size, eob);
    2961     2903770 :         break;
    2962     1876120 :     case TX_4X16:
    2963     1876120 :         lowbd_inv_txfm2d_add_4x16_ssse3(input,
    2964             :             output_r, stride_r, output_w, stride_w,
    2965             :             tx_type, tx_size, eob);
    2966     1876120 :         break;
    2967     1945300 :     case TX_16X4:
    2968     1945300 :         lowbd_inv_txfm2d_add_16x4_ssse3(input,
    2969             :             output_r, stride_r, output_w, stride_w,
    2970             :             tx_type, tx_size, eob);
    2971     1945260 :         break;
    2972    16257800 :     default:
    2973    16257800 :         lowbd_inv_txfm2d_add_universe_ssse3(input,
    2974             :             output_r, stride_r, output_w, stride_w,
    2975             :             tx_type, tx_size, eob);
    2976    16288400 :         break;
    2977             :     }
    2978    38112200 : }
    2979             : 
    2980           0 : void eb_av1_inv_txfm_add_ssse3(const TranLow *dqcoeff,
    2981             :     uint8_t *dst_r, int32_t stride_r,
    2982             :     uint8_t *dst_w, int32_t stride_w,
    2983             :     const TxfmParam *txfm_param) {
    2984           0 :     const TxType tx_type = txfm_param->tx_type;
    2985           0 :     if (!txfm_param->lossless) {
    2986           0 :         eb_av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff,
    2987             :             dst_r, stride_r, dst_w, stride_w,
    2988           0 :             tx_type, txfm_param->tx_size, txfm_param->eob);
    2989             :     }
    2990             :     else {
    2991           0 :         eb_av1_inv_txfm_add_c(dqcoeff, dst_r, stride_r, dst_w,
    2992             :                               stride_w, txfm_param);
    2993             :     }
    2994           0 : }

Generated by: LCOV version 1.14