LCOV - code coverage report
Current view: top level - ASM_AVX2 - highbd_fwd_txfm_avx2.c
Test: coverage.info
Date: 2019-11-25 17:38:06
Coverage: Lines: 4098 of 4170 (98.3 %), Functions: 77 of 78 (98.7 %)

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : /*
       7             : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       8             : *
       9             : * This source code is subject to the terms of the BSD 2 Clause License and
      10             : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      11             : * was not distributed with this source code in the LICENSE file, you can
      12             : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      13             : * Media Patent License 1.0 was not distributed with this source code in the
      14             : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      15             : */
      16             : 
      17             : #include <assert.h>
      18             : #include "EbDefinitions.h"
      19             : #include "aom_dsp_rtcd.h"
      20             : #include "EbTransforms.h"
      21             : #include <immintrin.h>
      22             : #include "txfm_common_avx2.h"
      23             : 
      24             : void Av1TransformConfig(
      25             :     TxType tx_type,
      26             :     TxSize tx_size,
      27             :     Txfm2DFlipCfg *cfg);
      28             : 
       29             : typedef void (*fwd_transform_1d_avx2)(const __m256i *in, __m256i *out, int8_t bit,
      30             :     const int32_t num_cols);
      31             : 
       32             : #define TRANSPOSE_4X4_AVX2(x0, x1, x2, x3, y0, y1, y2, y3) \
       33             :   do {                                                      \
       34             :     __m256i u0, u1, u2, u3;                                 \
       35             :     u0 = _mm256_unpacklo_epi32(x0, x1);                     \
       36             :     u1 = _mm256_unpackhi_epi32(x0, x1);                     \
       37             :     u2 = _mm256_unpacklo_epi32(x2, x3);                     \
       38             :     u3 = _mm256_unpackhi_epi32(x2, x3);                     \
       39             :     y0 = _mm256_unpacklo_epi64(u0, u2);                     \
       40             :     y1 = _mm256_unpackhi_epi64(u0, u2);                     \
       41             :     y2 = _mm256_unpacklo_epi64(u1, u3);                     \
       42             :     y3 = _mm256_unpackhi_epi64(u1, u3);                     \
       43             :   } while (0)
      44             : 
      45    98739800 : static INLINE void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
      46             :     __m256i out1[8];
      47   789919000 :     TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out1[0], out1[1], out1[4], out1[5]);
      48   789919000 :     TRANSPOSE_4X4_AVX2(in[4], in[5], in[6], in[7], out1[2], out1[3], out1[6], out1[7]);
      49    98739800 :     out[0] = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
      50    98739800 :     out[1] = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
      51    98739800 :     out[2] = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
      52    98739800 :     out[3] = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
      53    98739800 :     out[4] = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
      54    98739800 :     out[5] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
      55    98739800 :     out[6] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
      56    98739800 :     out[7] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
      57    98739800 : }
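
TRANSPOSE_4X4_AVX2 transposes a 4x4 block of 32-bit lanes independently in each
128-bit half of the registers; transpose_8x8_avx2 then stitches the halves back
together with _mm256_permute2x128_si256 (imm 0x20 selects the two low halves,
0x31 the two high halves). A per-element scalar reference, useful for
unit-testing the intrinsic path (a sketch; the helper name is hypothetical):

    // Scalar reference for an 8x8 transpose of 32-bit values; each
    // __m256i row above holds eight int32_t lanes. Hypothetical test
    // helper, not part of this file.
    static void transpose_8x8_ref(const int32_t in[8][8], int32_t out[8][8]) {
        for (int r = 0; r < 8; ++r)
            for (int c = 0; c < 8; ++c)
                out[c][r] = in[r][c];
    }
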
      58             : 
      59    13216200 : static INLINE void transpose_16x16_avx2(const __m256i *in, __m256i *out) {
      60             :     __m256i temp[32];
      61   105730000 :     TRANSPOSE_4X4_AVX2(in[0], in[2], in[4], in[6], temp[0], temp[2], temp[4], temp[6]);
      62   105730000 :     TRANSPOSE_4X4_AVX2(in[8], in[10], in[12], in[14], temp[17], temp[19], temp[21], temp[23]);
      63   105730000 :     TRANSPOSE_4X4_AVX2(in[1], in[3], in[5], in[7], temp[16], temp[18], temp[20], temp[22]);
      64   105730000 :     TRANSPOSE_4X4_AVX2(in[9], in[11], in[13], in[15], temp[25], temp[27], temp[29], temp[31]);
      65   105730000 :     TRANSPOSE_4X4_AVX2(in[16], in[18], in[20], in[22], temp[1], temp[3], temp[5], temp[7]);
      66   105730000 :     TRANSPOSE_4X4_AVX2(in[24], in[26], in[28], in[30], temp[9], temp[11], temp[13], temp[15]);
      67   105730000 :     TRANSPOSE_4X4_AVX2(in[17], in[19], in[21], in[23], temp[8], temp[10], temp[12], temp[14]);
      68   105730000 :     TRANSPOSE_4X4_AVX2(in[25], in[27], in[29], in[31], temp[24], temp[26], temp[28], temp[30]);
      69             : 
      70    13216200 :     out[0] = _mm256_permute2x128_si256(temp[0], temp[17], 0x20);
      71    13216200 :     out[1] = _mm256_permute2x128_si256(temp[1], temp[9], 0x20);
      72    13216200 :     out[2] = _mm256_permute2x128_si256(temp[2], temp[19], 0x20);
      73    13216200 :     out[3] = _mm256_permute2x128_si256(temp[3], temp[11], 0x20);
      74    13216200 :     out[4] = _mm256_permute2x128_si256(temp[4], temp[21], 0x20);
      75    13216200 :     out[5] = _mm256_permute2x128_si256(temp[5], temp[13], 0x20);
      76    13216200 :     out[6] = _mm256_permute2x128_si256(temp[6], temp[23], 0x20);
      77    13216200 :     out[7] = _mm256_permute2x128_si256(temp[7], temp[15], 0x20);
      78    13216200 :     out[8] = _mm256_permute2x128_si256(temp[0], temp[17], 0x31);
      79    13216200 :     out[9] = _mm256_permute2x128_si256(temp[1], temp[9], 0x31);
      80    13216200 :     out[10] = _mm256_permute2x128_si256(temp[2], temp[19], 0x31);
      81    13216200 :     out[11] = _mm256_permute2x128_si256(temp[3], temp[11], 0x31);
      82    13216200 :     out[12] = _mm256_permute2x128_si256(temp[4], temp[21], 0x31);
      83    13216200 :     out[13] = _mm256_permute2x128_si256(temp[5], temp[13], 0x31);
      84    13216200 :     out[14] = _mm256_permute2x128_si256(temp[6], temp[23], 0x31);
      85    13216200 :     out[15] = _mm256_permute2x128_si256(temp[7], temp[15], 0x31);
      86    13216200 :     out[16] = _mm256_permute2x128_si256(temp[16], temp[25], 0x20);
      87    13216200 :     out[17] = _mm256_permute2x128_si256(temp[8], temp[24], 0x20);
      88    13216200 :     out[18] = _mm256_permute2x128_si256(temp[18], temp[27], 0x20);
      89    13216200 :     out[19] = _mm256_permute2x128_si256(temp[10], temp[26], 0x20);
      90    13216200 :     out[20] = _mm256_permute2x128_si256(temp[20], temp[29], 0x20);
      91    13216200 :     out[21] = _mm256_permute2x128_si256(temp[12], temp[28], 0x20);
      92    13216200 :     out[22] = _mm256_permute2x128_si256(temp[22], temp[31], 0x20);
      93    13216200 :     out[23] = _mm256_permute2x128_si256(temp[14], temp[30], 0x20);
      94    13216200 :     out[24] = _mm256_permute2x128_si256(temp[16], temp[25], 0x31);
      95    13216200 :     out[25] = _mm256_permute2x128_si256(temp[8], temp[24], 0x31);
      96    13216200 :     out[26] = _mm256_permute2x128_si256(temp[18], temp[27], 0x31);
      97    13216200 :     out[27] = _mm256_permute2x128_si256(temp[10], temp[26], 0x31);
      98    13216200 :     out[28] = _mm256_permute2x128_si256(temp[20], temp[29], 0x31);
      99    13216200 :     out[29] = _mm256_permute2x128_si256(temp[12], temp[28], 0x31);
     100    13216200 :     out[30] = _mm256_permute2x128_si256(temp[22], temp[31], 0x31);
     101    13216200 :     out[31] = _mm256_permute2x128_si256(temp[14], temp[30], 0x31);
     102    13216200 : }
     103             : 
     104    54789800 : static INLINE void transpose_32_8x8_avx2(int32_t stride, const __m256i *in,
     105             :     __m256i *out) {
     106             :     __m256i out1[8];
     107    54789800 :     __m256i temp0 = _mm256_unpacklo_epi32(in[0 * stride], in[2 * stride]);
     108    54789800 :     __m256i temp1 = _mm256_unpackhi_epi32(in[0 * stride], in[2 * stride]);
     109    54789800 :     __m256i temp2 = _mm256_unpacklo_epi32(in[1 * stride], in[3 * stride]);
     110    54789800 :     __m256i temp3 = _mm256_unpackhi_epi32(in[1 * stride], in[3 * stride]);
     111    54789800 :     __m256i temp4 = _mm256_unpacklo_epi32(in[4 * stride], in[6 * stride]);
     112    54789800 :     __m256i temp5 = _mm256_unpackhi_epi32(in[4 * stride], in[6 * stride]);
     113    54789800 :     __m256i temp6 = _mm256_unpacklo_epi32(in[5 * stride], in[7 * stride]);
     114   109580000 :     __m256i temp7 = _mm256_unpackhi_epi32(in[5 * stride], in[7 * stride]);
     115             : 
     116    54789800 :     out1[0] = _mm256_unpacklo_epi32(temp0, temp2);
     117    54789800 :     out1[1] = _mm256_unpackhi_epi32(temp0, temp2);
     118    54789800 :     out1[4] = _mm256_unpacklo_epi32(temp1, temp3);
     119    54789800 :     out1[5] = _mm256_unpackhi_epi32(temp1, temp3);
     120    54789800 :     out1[2] = _mm256_unpacklo_epi32(temp4, temp6);
     121    54789800 :     out1[3] = _mm256_unpackhi_epi32(temp4, temp6);
     122    54789800 :     out1[6] = _mm256_unpacklo_epi32(temp5, temp7);
     123    54789800 :     out1[7] = _mm256_unpackhi_epi32(temp5, temp7);
     124             : 
     125    54789800 :     out[0 * stride] = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
     126    54789800 :     out[1 * stride] = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
     127    54789800 :     out[2 * stride] = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
     128    54789800 :     out[3 * stride] = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
     129    54789800 :     out[4 * stride] = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
     130    54789800 :     out[5 * stride] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
     131    54789800 :     out[6 * stride] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
     132    54789800 :     out[7 * stride] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
     133    54789800 : }
     134             : 
     135     3427220 : static INLINE void transpose_32_avx2(int32_t txfm_size, const __m256i *input,
     136             :     __m256i *output) {
     137     3427220 :     const int32_t num_per_256 = 8;
     138     3427220 :     const int32_t row_size = txfm_size;
     139     3427220 :     const int32_t col_size = txfm_size / num_per_256;
     140             :     int32_t r, c;
     141             : 
     142             :     // transpose each 8x8 block internally
     143    17133000 :     for (r = 0; r < row_size; r += 8) {
     144    68491400 :         for (c = 0; c < col_size; c++) {
     145    54785700 :             transpose_32_8x8_avx2(col_size, &input[r * col_size + c],
     146    54785700 :                 &output[c * 8 * col_size + r / 8]);
     147             :         }
     148             :     }
     149     3426740 : }
     150             : 
     151    20384100 : static INLINE void transpose_8nx8n(const __m256i *input, __m256i *output,
     152             :     const int32_t width, const int32_t height) {
     153    20384100 :     const int32_t numcol = height >> 3;
     154    20384100 :     const int32_t numrow = width >> 3;
     155             :     __m256i out1[8];
     156    92539200 :     for (int32_t j = 0; j < numrow; j++) {
     157   306538000 :         for (int32_t i = 0; i < numcol; i++) {
     158  1875060000 :             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 0)],
     159             :                 input[i * width + j + (numrow * 1)],
     160             :                 input[i * width + j + (numrow * 2)],
     161             :                 input[i * width + j + (numrow * 3)],
     162             :                 out1[0], out1[1], out1[4], out1[5]);
     163  1875060000 :             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 4)],
     164             :                 input[i * width + j + (numrow * 5)],
     165             :                 input[i * width + j + (numrow * 6)],
     166             :                 input[i * width + j + (numrow * 7)],
     167             :                 out1[2], out1[3], out1[6], out1[7]);
     168   234383000 :             output[j * height + i + (numcol * 0)] = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
     169   234383000 :             output[j * height + i + (numcol * 1)] = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
     170   234383000 :             output[j * height + i + (numcol * 2)] = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
     171   234383000 :             output[j * height + i + (numcol * 3)] = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
     172   234383000 :             output[j * height + i + (numcol * 4)] = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
     173   234383000 :             output[j * height + i + (numcol * 5)] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
     174   234383000 :             output[j * height + i + (numcol * 6)] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
     175   234383000 :             output[j * height + i + (numcol * 7)] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
     176             :         }
     177             :     }
     178    20384100 : }
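
transpose_8nx8n generalizes the same tiling to rectangular blocks: a width x
height array of int32_t is stored as rows of numrow = width/8 registers, so
register column j of pixel row r lives at input[r * numrow + j], and 8x8
register tile (i, j) maps to tile (j, i) of the output, whose rows are
numcol = height/8 registers wide. A scalar model of the overall element mapping
(a sketch with a hypothetical name, ignoring the register tiling):

    // Scalar model of the rectangular transpose performed by transpose_8nx8n.
    static void transpose_ref(const int32_t *in, int32_t *out,
                              int32_t width, int32_t height) {
        for (int32_t r = 0; r < height; ++r)
            for (int32_t c = 0; c < width; ++c)
                out[c * height + r] = in[r * width + c];
    }
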
     179             : 
     180    15040100 : static INLINE void transpose_4x8_avx2(const __m256i *in, __m256i *out) {
     181    15040100 :     __m256i perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
     182             : 
     183   120320000 :     TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
     184    15040100 :     out[0] = _mm256_permutevar8x32_epi32(out[0], perm);
     185    15040100 :     out[1] = _mm256_permutevar8x32_epi32(out[1], perm);
     186    15040100 :     out[2] = _mm256_permutevar8x32_epi32(out[2], perm);
     187    15040100 :     out[3] = _mm256_permutevar8x32_epi32(out[3], perm);
     188    15040100 : }
     189             : 
     190    11341100 : static INLINE void transpose_4x16_avx2(const __m256i *in, __m256i *out) {
     191    11341100 :     __m256i perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
     192             : 
     193    90728800 :     TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out[0], out[2], out[4], out[6]);
     194    90728800 :     TRANSPOSE_4X4_AVX2(in[4], in[5], in[6], in[7], out[1], out[3], out[5], out[7]);
     195             : 
     196    11341100 :     out[0] = _mm256_permutevar8x32_epi32(out[0], perm);
     197    11341100 :     out[1] = _mm256_permutevar8x32_epi32(out[1], perm);
     198    11341100 :     out[2] = _mm256_permutevar8x32_epi32(out[2], perm);
     199    11341100 :     out[3] = _mm256_permutevar8x32_epi32(out[3], perm);
     200    11341100 :     out[4] = _mm256_permutevar8x32_epi32(out[4], perm);
     201    11341100 :     out[5] = _mm256_permutevar8x32_epi32(out[5], perm);
     202    11341100 :     out[6] = _mm256_permutevar8x32_epi32(out[6], perm);
     203    11341100 :     out[7] = _mm256_permutevar8x32_epi32(out[7], perm);
     204    11341100 : }
     205             : 
     206             : // Note:
     207             : //  rounding = 1 << (bit - 1)
     208   249215000 : static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
     209             :     const __m256i *w1, const __m256i *n1,
     210             :     const __m256i *rounding, int32_t bit) {
     211             :     __m256i x, y;
     212             : 
     213   249215000 :     x = _mm256_mullo_epi32(*w0, *n0);
     214   498430000 :     y = _mm256_mullo_epi32(*w1, *n1);
     215   249215000 :     x = _mm256_add_epi32(x, y);
     216   498430000 :     x = _mm256_add_epi32(x, *rounding);
     217   249215000 :     x = _mm256_srai_epi32(x, bit);
     218   249215000 :     return x;
     219             : }
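
half_btf_avx2 evaluates one output of a butterfly step on eight lanes at once:
each lane becomes (w0*n0 + w1*n1 + rounding) >> bit, with rounding = 1 << (bit - 1)
as noted above. A per-lane scalar equivalent (a sketch; the 64-bit intermediate
is only for illustration, while the vector code relies on the 32-bit products
staying in range for valid cospi inputs):

    // One butterfly output with symmetric rounding, per 32-bit lane.
    static int32_t half_btf_ref(int32_t w0, int32_t n0,
                                int32_t w1, int32_t n1, int32_t bit) {
        const int64_t sum = (int64_t)w0 * n0 + (int64_t)w1 * n1;
        return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
    }
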
     220             : 
     221    86665700 : static INLINE __m128i half_btf_small(const __m128i *w0, const __m128i *n0,
     222             :     const __m128i *w1, const __m128i *n1,
     223             :     const __m128i *rounding, int32_t bit) {
     224             :     __m128i x, y;
     225             : 
     226    86665700 :     x = _mm_mullo_epi32(*w0, *n0);
     227   173331000 :     y = _mm_mullo_epi32(*w1, *n1);
     228    86665700 :     x = _mm_add_epi32(x, y);
     229   173331000 :     x = _mm_add_epi32(x, *rounding);
     230    86665700 :     x = _mm_srai_epi32(x, bit);
     231    86665700 :     return x;
     232             : }
     233             : 
     234             : // out0 = in0*w0 + in1*w1
     235             : // out1 = -in1*w0 + in0*w1
     236             : #define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \
     237             :   do {                                                         \
     238             :     const __m256i ww0 = _mm256_set1_epi32(w0);                    \
     239             :     const __m256i ww1 = _mm256_set1_epi32(w1);                    \
     240             :     const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);          \
     241             :     const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);          \
     242             :     out0 = _mm256_add_epi32(in0_w0, in1_w1);                      \
     243             :     out0 = av1_round_shift_32_avx2(out0, bit);               \
     244             :     const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);          \
     245             :     const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);          \
     246             :     out1 = _mm256_sub_epi32(in0_w1, in1_w0);                      \
     247             :     out1 = av1_round_shift_32_avx2(out1, bit);               \
     248             :       } while (0)
     249             : 
     250             : // out0 = in0*w0 + in1*w1
     251             : // out1 = in1*w0 - in0*w1
     252             : #define btf_32_avx2_type1(w0, w1, in0, in1, out0, out1, bit) \
     253             :   do {                                                         \
     254             :     btf_32_avx2_type0(w1, w0, in1, in0, out0, out1, bit);    \
     255             :       } while (0)
     256             : 
     257             : // out0 = in0*w0 + in1*w1
     258             : // out1 = -in1*w0 + in0*w1
     259             : #define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
     260             :   do {                                                                  \
     261             :     const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);                   \
     262             :     const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);                   \
     263             :     out0 = _mm256_add_epi32(in0_w0, in1_w1);                               \
     264             :     out0 = _mm256_add_epi32(out0, r);                                      \
     265             :     out0 = _mm256_srai_epi32(out0, bit);                                   \
     266             :     const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);                   \
     267             :     const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);                   \
     268             :     out1 = _mm256_sub_epi32(in0_w1, in1_w0);                               \
     269             :     out1 = _mm256_add_epi32(out1, r);                                      \
     270             :     out1 = _mm256_srai_epi32(out1, bit);                                   \
     271             :     } while (0)
     272             : 
     273             : // out0 = in0*w0 + in1*w1
     274             : // out1 = in1*w0 - in0*w1
     275             : #define btf_32_type1_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
     276             :   do {                                                                  \
     277             :     btf_32_type0_avx2_new(ww1, ww0, in1, in0, out0, out1, r, bit);    \
     278             :     } while (0)
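
The four btf_32 macros implement the two butterfly rotation flavors described
by their comments; the type1 variants simply invoke type0 with weights and
inputs swapped, which flips the sign pattern on the second output. A per-lane
scalar model of type0 (a sketch):

    // type0: out0 = in0*w0 + in1*w1, out1 = in0*w1 - in1*w0, both round-shifted.
    // Calling it as type0(w1, w0, in1, in0, ...) reproduces type1:
    // out0 = in0*w0 + in1*w1, out1 = in1*w0 - in0*w1.
    static void btf_32_type0_ref(int32_t w0, int32_t w1, int32_t in0, int32_t in1,
                                 int32_t *out0, int32_t *out1, int32_t bit) {
        const int64_t r = (int64_t)1 << (bit - 1);
        *out0 = (int32_t)(((int64_t)in0 * w0 + (int64_t)in1 * w1 + r) >> bit);
        *out1 = (int32_t)(((int64_t)in0 * w1 - (int64_t)in1 * w0 + r) >> bit);
    }
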
     279             : 
     280             : static const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
     281             :     fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
     282             :     fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16,
     283             :     fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
     284             :     fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32,
     285             :     fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
     286             : };
     287             : 
     288   106415000 : static INLINE void load_buffer_8x8(const int16_t *input, __m256i *in,
     289             :     int32_t stride, int32_t flipud, int32_t fliplr,
     290             :     int32_t shift) {
     291             :     __m128i temp[8];
     292   106415000 :     if (!flipud) {
     293   101463000 :         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
      294   101463000 :         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
      295   101463000 :         temp[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
      296   101463000 :         temp[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
      297   101463000 :         temp[4] = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
      298   101463000 :         temp[5] = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
      299   101463000 :         temp[6] = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
      300   202926000 :         temp[7] = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
     301             :     }
     302             :     else {
      303     4951770 :         temp[0] = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
      304     4951770 :         temp[1] = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
      305     4951770 :         temp[2] = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
      306     4951770 :         temp[3] = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
      307     4951770 :         temp[4] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
      308     4951770 :         temp[5] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
      309     9903540 :         temp[6] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
      310     4951770 :         temp[7] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
     311             :     }
     312             : 
     313   106415000 :     if (fliplr) {
     314     4044020 :         temp[0] = mm_reverse_epi16(temp[0]);
     315     4044000 :         temp[1] = mm_reverse_epi16(temp[1]);
     316     4043910 :         temp[2] = mm_reverse_epi16(temp[2]);
     317     4043800 :         temp[3] = mm_reverse_epi16(temp[3]);
     318     4043730 :         temp[4] = mm_reverse_epi16(temp[4]);
     319     4043700 :         temp[5] = mm_reverse_epi16(temp[5]);
     320     4043680 :         temp[6] = mm_reverse_epi16(temp[6]);
     321     4043670 :         temp[7] = mm_reverse_epi16(temp[7]);
     322             :     }
     323             : 
     324   106484000 :     in[0] = _mm256_cvtepi16_epi32(temp[0]);
     325   106484000 :     in[1] = _mm256_cvtepi16_epi32(temp[1]);
     326   106484000 :     in[2] = _mm256_cvtepi16_epi32(temp[2]);
     327   106484000 :     in[3] = _mm256_cvtepi16_epi32(temp[3]);
     328   106484000 :     in[4] = _mm256_cvtepi16_epi32(temp[4]);
     329   106484000 :     in[5] = _mm256_cvtepi16_epi32(temp[5]);
     330   106484000 :     in[6] = _mm256_cvtepi16_epi32(temp[6]);
     331   106484000 :     in[7] = _mm256_cvtepi16_epi32(temp[7]);
     332             : 
     333   106484000 :     in[0] = _mm256_slli_epi32(in[0], shift);
     334   106484000 :     in[1] = _mm256_slli_epi32(in[1], shift);
     335   106484000 :     in[2] = _mm256_slli_epi32(in[2], shift);
     336   106484000 :     in[3] = _mm256_slli_epi32(in[3], shift);
     337   106484000 :     in[4] = _mm256_slli_epi32(in[4], shift);
     338   106484000 :     in[5] = _mm256_slli_epi32(in[5], shift);
     339   106484000 :     in[6] = _mm256_slli_epi32(in[6], shift);
     340   106484000 :     in[7] = _mm256_slli_epi32(in[7], shift);
     341   106484000 : }
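
load_buffer_8x8 reads eight rows of eight int16_t residuals, optionally flipped
vertically (flipud) and/or horizontally (fliplr, via mm_reverse_epi16), widens
them to 32 bits, and applies the pre-transform left shift shift[0]. A scalar
model of the addressing (a sketch; the helper name is hypothetical):

    // Flip-aware load with 16->32 bit promotion and pre-scaling.
    static void load_buffer_8x8_ref(const int16_t *input, int32_t out[8][8],
                                    int32_t stride, int32_t flipud,
                                    int32_t fliplr, int32_t shift) {
        for (int r = 0; r < 8; ++r)
            for (int c = 0; c < 8; ++c) {
                const int sr = flipud ? 7 - r : r;
                const int sc = fliplr ? 7 - c : c;
                out[r][c] = (int32_t)input[sr * stride + sc] << shift;
            }
    }
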
     342             : 
     343    75347400 : static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m256i *in,
     344             :     int32_t stride, int32_t flipud, int32_t fliplr, int32_t shift) {
     345    75347400 :     if (!flipud) {
     346   140235000 :         in[0] = _mm256_setr_epi64x(*(uint64_t *)(input + 0 * stride),
     347    70117700 :             *(uint64_t *)(input + 1 * stride), 0, 0);
     348    70117700 :         in[1] = _mm256_setr_epi64x(*(uint64_t *)(input + 2 * stride),
     349    70117700 :             *(uint64_t *)(input + 3 * stride), 0, 0);
     350             :     }
     351             :     else {
     352    10459300 :         in[0] = _mm256_setr_epi64x(*(uint64_t *)(input + 3 * stride),
     353     5229630 :             *(uint64_t *)(input + 2 * stride), 0, 0);
     354     5229630 :         in[1] = _mm256_setr_epi64x(*(uint64_t *)(input + 1 * stride),
     355     5229630 :             *(uint64_t *)(input + 0 * stride), 0, 0);
     356             :     }
     357             : 
     358    75347400 :     if (fliplr) {
     359     5261200 :         in[0] = _mm256_shufflelo_epi16(in[0], 0x1b);
     360     5261200 :         in[0] = _mm256_shufflehi_epi16(in[0], 0x1b);
     361     5261200 :         in[1] = _mm256_shufflelo_epi16(in[1], 0x1b);
     362     5261200 :         in[1] = _mm256_shufflehi_epi16(in[1], 0x1b);
     363             :     }
     364             : 
     365   150695000 :     in[0] = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[0]));
     366   150695000 :     in[1] = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[1]));
     367             : 
     368    75347400 :     in[0] = _mm256_slli_epi32(in[0], shift);
     369    75347400 :     in[1] = _mm256_slli_epi32(in[1], shift);
     370    75347400 : }
     371             : 
     372    18805700 : static INLINE void load_buffer_4x8_avx2(const int16_t *input, __m256i *out,
     373             :     int32_t stride, int32_t flipud, int32_t fliplr,
     374             :     int32_t shift) {
     375    18805700 :     const int16_t *topL = input;
     376    18805700 :     const int16_t *botL = input + 4 * stride;
     377             : 
     378    18805700 :     if (flipud) {
     379     1280060 :         load_buffer_4x4_avx2(botL, out, stride, flipud, fliplr, shift);
     380     1280070 :         load_buffer_4x4_avx2(topL, out + 2, stride, flipud, fliplr, shift);
     381             :     }
     382             :     else {
     383    17525700 :         load_buffer_4x4_avx2(topL, out, stride, flipud, fliplr, shift);
     384    17531200 :         load_buffer_4x4_avx2(botL, out + 2, stride, flipud, fliplr, shift);
     385             :     }
     386    18811600 : }
     387             : 
     388    18905200 : static INLINE void load_buffer_8x4_avx2(const int16_t *input, __m256i *out,
     389             :     int32_t stride, int32_t flipud, int32_t fliplr, int32_t shift) {
     390    18905200 :     const int16_t *topL = input;
     391    18905200 :     const int16_t *topR = input + 4;
     392             : 
     393    18905200 :     if (fliplr) {
     394     1342910 :         load_buffer_4x4_avx2(topR, out, stride, flipud, fliplr, shift);
     395     1342920 :         load_buffer_4x4_avx2(topL, out + 2, stride, flipud, fliplr, shift);
     396             :     }
     397             :     else {
     398    17562300 :         load_buffer_4x4_avx2(topL, out, stride, flipud, fliplr, shift);
     399    17567600 :         load_buffer_4x4_avx2(topR, out + 2, stride, flipud, fliplr, shift);
     400             :     }
     401    18911900 : }
     402             : 
     403     5564200 : static INLINE void load_buffer_4x16_avx2(const int16_t *input, __m256i *out,
     404             :     const int32_t stride, const int32_t flipud,
     405             :     const int32_t fliplr, const int32_t shift) {
     406     5564200 :     const int16_t *topL = input;
     407     5564200 :     const int16_t *botL = input + 8 * stride;
     408             : 
     409     5564200 :     if (flipud) {
     410      475174 :         load_buffer_4x8_avx2(botL, out, stride, flipud, fliplr, shift);
     411      475178 :         load_buffer_4x8_avx2(topL, out + 4, stride, flipud, fliplr, shift);
     412             :     }
     413             :     else {
     414     5089020 :         load_buffer_4x8_avx2(topL, out, stride, flipud, fliplr, shift);
     415     5089680 :         load_buffer_4x8_avx2(botL, out + 4, stride, flipud, fliplr, shift);
     416             :     }
     417     5564990 : }
     418             : 
     419     5777000 : static INLINE void load_buffer_16x4_avx2(const int16_t *input, __m256i *out,
     420             :     int32_t stride, int32_t flipud, int32_t fliplr, int32_t shift) {
     421     5777000 :     const int16_t *topL = input;
     422     5777000 :     const int16_t *topR = input + 8;
     423             : 
     424     5777000 :     if (fliplr) {
     425      502106 :         load_buffer_8x4_avx2(topR, out, stride, flipud, fliplr, shift);
     426      502111 :         load_buffer_8x4_avx2(topL, out + 4, stride, flipud, fliplr, shift);
     427             :     }
     428             :     else {
     429     5274890 :         load_buffer_8x4_avx2(topL, out, stride, flipud, fliplr, shift);
     430     5275790 :         load_buffer_8x4_avx2(topR, out + 4, stride, flipud, fliplr, shift);
     431             :     }
     432     5777850 : }
     433             : 
     434   188826000 : static INLINE void col_txfm_8x8_rounding(__m256i *in, int32_t shift) {
     435   188826000 :     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
     436             : 
     437   188826000 :     in[0] = _mm256_add_epi32(in[0], rounding);
     438   188826000 :     in[1] = _mm256_add_epi32(in[1], rounding);
     439   188826000 :     in[2] = _mm256_add_epi32(in[2], rounding);
     440   188826000 :     in[3] = _mm256_add_epi32(in[3], rounding);
     441   188826000 :     in[4] = _mm256_add_epi32(in[4], rounding);
     442   188826000 :     in[5] = _mm256_add_epi32(in[5], rounding);
     443   188826000 :     in[6] = _mm256_add_epi32(in[6], rounding);
     444   188826000 :     in[7] = _mm256_add_epi32(in[7], rounding);
     445             : 
     446   188826000 :     in[0] = _mm256_srai_epi32(in[0], shift);
     447   188826000 :     in[1] = _mm256_srai_epi32(in[1], shift);
     448   188826000 :     in[2] = _mm256_srai_epi32(in[2], shift);
     449   188826000 :     in[3] = _mm256_srai_epi32(in[3], shift);
     450   188826000 :     in[4] = _mm256_srai_epi32(in[4], shift);
     451   188826000 :     in[5] = _mm256_srai_epi32(in[5], shift);
     452   188826000 :     in[6] = _mm256_srai_epi32(in[6], shift);
     453   188826000 :     in[7] = _mm256_srai_epi32(in[7], shift);
     454   188826000 : }
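
Between the column and row passes, every lane is brought back into range by a
symmetric rounding right-shift. The callers pass -shift[1], since the shift
table stores the inter-pass entry as a (negative) left-shift amount. Per lane
this is simply (a sketch):

    // Symmetric rounding right-shift applied between the two 1-D passes.
    static int32_t round_shift_ref(int32_t x, int32_t shift) {
        return (x + (1 << (shift - 1))) >> shift;
    }
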
     455             : 
     456     7557430 : static void fidtx8x8_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
     457             :     (void)bit;
     458     7557430 :     out[0] = _mm256_slli_epi32(in[0 * col_num], 1);
     459     7557430 :     out[1] = _mm256_slli_epi32(in[1 * col_num], 1);
     460     7557430 :     out[2] = _mm256_slli_epi32(in[2 * col_num], 1);
     461     7557430 :     out[3] = _mm256_slli_epi32(in[3 * col_num], 1);
     462     7557430 :     out[4] = _mm256_slli_epi32(in[4 * col_num], 1);
     463     7557430 :     out[5] = _mm256_slli_epi32(in[5 * col_num], 1);
     464     7557430 :     out[6] = _mm256_slli_epi32(in[6 * col_num], 1);
     465     7557430 :     out[7] = _mm256_slli_epi32(in[7 * col_num], 1);
     466     7557430 : }
     467             : 
     468     1286740 : static INLINE void fidtx16x8_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
     469             :     (void)bit;
     470     1286740 :     const int32_t bits = 12;       // NewSqrt2Bits = 12
     471     1286740 :     const int32_t sqrt = 2 * 5793; // 2 * NewSqrt2
     472     1286740 :     const __m256i newsqrt = _mm256_set1_epi32(sqrt);
     473     1286740 :     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
     474             :     __m256i temp;
     475     1286740 :     int32_t num_iters = 8 * col_num;
     476    11580400 :     for (int32_t i = 0; i < num_iters; i++) {
     477    20587300 :         temp = _mm256_mullo_epi32(in[i], newsqrt);
     478    10293700 :         temp = _mm256_add_epi32(temp, rounding);
     479    20587300 :         out[i] = _mm256_srai_epi32(temp, bits);
     480             :     }
     481     1286740 : }
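
fidtx8x8_avx2 is the length-8 identity transform, which only scales by 2 (the
left shift by 1 above). For length 16 the scale is 2*sqrt(2), which
fidtx16x8_avx2 approximates in fixed point as 2*5793 / 2^12 (NewSqrt2 = 5793,
NewSqrt2Bits = 12; 5793/4096 ~ 1.4143). Per lane (a sketch; like the
_mm256_mullo_epi32 above, the product is kept in 32 bits):

    // Length-16 identity transform scaling, per 32-bit lane.
    static int32_t fidtx16_ref(int32_t x) {
        return (x * (2 * 5793) + (1 << 11)) >> 12;  // ~ round(x * 2*sqrt(2))
    }
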
     482             : 
     483     7685080 : static INLINE void write_buffer_4x8(const __m256i *res, int32_t *output) {
     484     7685080 :     _mm256_storeu_si256((__m256i *)(output + 0 * 8), res[0]);
     485     7685080 :     _mm256_storeu_si256((__m256i *)(output + 1 * 8), res[1]);
     486     7685080 :     _mm256_storeu_si256((__m256i *)(output + 2 * 8), res[2]);
     487     7685080 :     _mm256_storeu_si256((__m256i *)(output + 3 * 8), res[3]);
     488     7685080 : }
     489             : 
     490    42880000 : static INLINE void write_buffer_8x8(const __m256i *res, int32_t *output) {
     491    42880000 :     _mm256_storeu_si256((__m256i *)(output + 0 * 8), res[0]);
     492    42880000 :     _mm256_storeu_si256((__m256i *)(output + 1 * 8), res[1]);
     493    42880000 :     _mm256_storeu_si256((__m256i *)(output + 2 * 8), res[2]);
     494    42880000 :     _mm256_storeu_si256((__m256i *)(output + 3 * 8), res[3]);
     495             : 
     496    42880000 :     _mm256_storeu_si256((__m256i *)(output + 4 * 8), res[4]);
     497    42880000 :     _mm256_storeu_si256((__m256i *)(output + 5 * 8), res[5]);
     498    42880000 :     _mm256_storeu_si256((__m256i *)(output + 6 * 8), res[6]);
     499    42880000 :     _mm256_storeu_si256((__m256i *)(output + 7 * 8), res[7]);
     500    42880000 : }
     501             : 
     502    69633300 : static void fdct8x8_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
     503    69633300 :     const int32_t *cospi = cospi_arr(bit);
     504    69617100 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
     505    69617100 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
     506    69617100 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
     507    69617100 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
     508    69617100 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
     509    69617100 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
     510    69617100 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
     511    69617100 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
     512    69617100 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
     513             :     __m256i u[8], v[8];
     514             : 
     515             :     // stage 0
     516             :     // stage 1
     517    69617100 :     u[0] = _mm256_add_epi32(in[0 * col_num], in[7 * col_num]);
     518    69617100 :     v[7] = _mm256_sub_epi32(in[0 * col_num], in[7 * col_num]);
     519    69617100 :     u[1] = _mm256_add_epi32(in[1 * col_num], in[6 * col_num]);
     520    69617100 :     u[6] = _mm256_sub_epi32(in[1 * col_num], in[6 * col_num]);
     521    69617100 :     u[2] = _mm256_add_epi32(in[2 * col_num], in[5 * col_num]);
     522    69617100 :     u[5] = _mm256_sub_epi32(in[2 * col_num], in[5 * col_num]);
     523    69617100 :     u[3] = _mm256_add_epi32(in[3 * col_num], in[4 * col_num]);
     524    69617100 :     v[4] = _mm256_sub_epi32(in[3 * col_num], in[4 * col_num]);
     525             : 
     526             :     // stage 2
     527    69617100 :     v[0] = _mm256_add_epi32(u[0], u[3]);
     528    69617100 :     v[3] = _mm256_sub_epi32(u[0], u[3]);
     529    69617100 :     v[1] = _mm256_add_epi32(u[1], u[2]);
     530    69617100 :     v[2] = _mm256_sub_epi32(u[1], u[2]);
     531             : 
     532    69617100 :     v[5] = _mm256_mullo_epi32(u[5], cospim32);
     533    69617100 :     v[6] = _mm256_mullo_epi32(u[6], cospi32);
     534    69617100 :     v[5] = _mm256_add_epi32(v[5], v[6]);
     535    69617100 :     v[5] = _mm256_add_epi32(v[5], rnding);
     536    69617100 :     v[5] = _mm256_srai_epi32(v[5], bit);
     537             : 
     538    69617100 :     u[0] = _mm256_mullo_epi32(u[5], cospi32);
     539    69617100 :     v[6] = _mm256_mullo_epi32(u[6], cospim32);
     540    69617100 :     v[6] = _mm256_sub_epi32(u[0], v[6]);
     541    69617100 :     v[6] = _mm256_add_epi32(v[6], rnding);
     542    69617100 :     v[6] = _mm256_srai_epi32(v[6], bit);
     543             : 
     544             :     // stage 3
     545             :     // type 0
     546    69617100 :     v[0] = _mm256_mullo_epi32(v[0], cospi32);
     547    69617100 :     v[1] = _mm256_mullo_epi32(v[1], cospi32);
     548    69617100 :     u[0] = _mm256_add_epi32(v[0], v[1]);
     549    69617100 :     u[0] = _mm256_add_epi32(u[0], rnding);
     550    69617100 :     u[0] = _mm256_srai_epi32(u[0], bit);
     551             : 
     552    69617100 :     u[1] = _mm256_sub_epi32(v[0], v[1]);
     553    69617100 :     u[1] = _mm256_add_epi32(u[1], rnding);
     554    69617100 :     u[1] = _mm256_srai_epi32(u[1], bit);
     555             : 
     556             :     // type 1
     557    69617100 :     v[0] = _mm256_mullo_epi32(v[2], cospi48);
     558    69617100 :     v[1] = _mm256_mullo_epi32(v[3], cospi16);
     559    69617100 :     u[2] = _mm256_add_epi32(v[0], v[1]);
     560    69617100 :     u[2] = _mm256_add_epi32(u[2], rnding);
     561    69617100 :     u[2] = _mm256_srai_epi32(u[2], bit);
     562             : 
     563    69617100 :     v[0] = _mm256_mullo_epi32(v[2], cospi16);
     564    69617100 :     v[1] = _mm256_mullo_epi32(v[3], cospi48);
     565    69617100 :     u[3] = _mm256_sub_epi32(v[1], v[0]);
     566    69617100 :     u[3] = _mm256_add_epi32(u[3], rnding);
     567    69617100 :     u[3] = _mm256_srai_epi32(u[3], bit);
     568             : 
     569    69617100 :     u[4] = _mm256_add_epi32(v[4], v[5]);
     570    69617100 :     u[5] = _mm256_sub_epi32(v[4], v[5]);
     571    69617100 :     u[6] = _mm256_sub_epi32(v[7], v[6]);
     572    69617100 :     u[7] = _mm256_add_epi32(v[7], v[6]);
     573             : 
     574             :     // stage 4
     575             :     // stage 5
     576    69617100 :     v[0] = _mm256_mullo_epi32(u[4], cospi56);
     577    69617100 :     v[1] = _mm256_mullo_epi32(u[7], cospi8);
     578    69617100 :     v[0] = _mm256_add_epi32(v[0], v[1]);
     579    69617100 :     v[0] = _mm256_add_epi32(v[0], rnding);
     580    69617100 :     out[1 * col_num] = _mm256_srai_epi32(v[0], bit);
     581             : 
     582    69617100 :     v[0] = _mm256_mullo_epi32(u[4], cospi8);
     583    69617100 :     v[1] = _mm256_mullo_epi32(u[7], cospi56);
     584    69617100 :     v[0] = _mm256_sub_epi32(v[1], v[0]);
     585    69617100 :     v[0] = _mm256_add_epi32(v[0], rnding);
     586    69617100 :     out[7 * col_num] = _mm256_srai_epi32(v[0], bit);
     587             : 
     588    69617100 :     v[0] = _mm256_mullo_epi32(u[5], cospi24);
     589    69617100 :     v[1] = _mm256_mullo_epi32(u[6], cospi40);
     590    69617100 :     v[0] = _mm256_add_epi32(v[0], v[1]);
     591    69617100 :     v[0] = _mm256_add_epi32(v[0], rnding);
     592    69617100 :     out[5 * col_num] = _mm256_srai_epi32(v[0], bit);
     593             : 
     594    69617100 :     v[0] = _mm256_mullo_epi32(u[5], cospi40);
     595    69617100 :     v[1] = _mm256_mullo_epi32(u[6], cospi24);
     596    69617100 :     v[0] = _mm256_sub_epi32(v[1], v[0]);
     597    69617100 :     v[0] = _mm256_add_epi32(v[0], rnding);
     598    69617100 :     out[3 * col_num] = _mm256_srai_epi32(v[0], bit);
     599             : 
     600    69617100 :     out[0 * col_num] = u[0];
     601    69617100 :     out[4 * col_num] = u[1];
     602    69617100 :     out[2 * col_num] = u[2];
     603    69617100 :     out[6 * col_num] = u[3];
     604    69617100 : }
     605             : 
     606    13888100 : static void fadst8x8_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
     607    13888100 :     const int32_t *cospi = cospi_arr(bit);
     608    13887500 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
     609    13887500 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
     610    13887500 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
     611    13887500 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
     612    13887500 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
     613    13887500 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
     614    13887500 :     const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
     615    13887500 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
     616    13887500 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
     617    13887500 :     const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
     618    13887500 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
     619    13887500 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
     620    13887500 :     const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
     621    13887500 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
     622    13887500 :     const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
     623    13887500 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
     624    13887500 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
     625    27774900 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
     626    13887500 :     const __m256i zero = _mm256_setzero_si256();
     627             :     __m256i u0, u1, u2, u3, u4, u5, u6, u7;
     628             :     __m256i v0, v1, v2, v3, v4, v5, v6, v7;
     629             :     __m256i x, y;
     630             : 
     631    13887500 :     u0 = in[0 * col_num];
     632    13887500 :     u1 = _mm256_sub_epi32(zero, in[7 * col_num]);
     633    13887500 :     u2 = _mm256_sub_epi32(zero, in[3 * col_num]);
     634    13887500 :     u3 = in[4 * col_num];
     635    13887500 :     u4 = _mm256_sub_epi32(zero, in[1 * col_num]);
     636    13887500 :     u5 = in[6 * col_num];
     637    13887500 :     u6 = in[2 * col_num];
     638    13887500 :     u7 = _mm256_sub_epi32(zero, in[5 * col_num]);
     639             : 
     640             :     // stage 2
     641    13887500 :     v0 = u0;
     642    13887500 :     v1 = u1;
     643             : 
     644    13887500 :     x = _mm256_mullo_epi32(u2, cospi32);
     645    13887500 :     y = _mm256_mullo_epi32(u3, cospi32);
     646    13887500 :     v2 = _mm256_add_epi32(x, y);
     647    13887500 :     v2 = _mm256_add_epi32(v2, rnding);
     648    27774900 :     v2 = _mm256_srai_epi32(v2, bit);
     649             : 
     650    13887500 :     v3 = _mm256_sub_epi32(x, y);
     651    13887500 :     v3 = _mm256_add_epi32(v3, rnding);
     652    13887500 :     v3 = _mm256_srai_epi32(v3, bit);
     653             : 
     654    13887500 :     v4 = u4;
     655    13887500 :     v5 = u5;
     656             : 
     657    13887500 :     x = _mm256_mullo_epi32(u6, cospi32);
     658    13887500 :     y = _mm256_mullo_epi32(u7, cospi32);
     659    13887500 :     v6 = _mm256_add_epi32(x, y);
     660    13887500 :     v6 = _mm256_add_epi32(v6, rnding);
     661    27774900 :     v6 = _mm256_srai_epi32(v6, bit);
     662             : 
     663    13887500 :     v7 = _mm256_sub_epi32(x, y);
     664    13887500 :     v7 = _mm256_add_epi32(v7, rnding);
     665    27774900 :     v7 = _mm256_srai_epi32(v7, bit);
     666             : 
     667             :     // stage 3
     668    13887500 :     u0 = _mm256_add_epi32(v0, v2);
     669    13887500 :     u1 = _mm256_add_epi32(v1, v3);
     670    13887500 :     u2 = _mm256_sub_epi32(v0, v2);
     671    13887500 :     u3 = _mm256_sub_epi32(v1, v3);
     672    13887500 :     u4 = _mm256_add_epi32(v4, v6);
     673    13887500 :     u5 = _mm256_add_epi32(v5, v7);
     674    13887500 :     u6 = _mm256_sub_epi32(v4, v6);
     675    13887500 :     u7 = _mm256_sub_epi32(v5, v7);
     676             : 
     677             :     // stage 4
     678    13887500 :     v0 = u0;
     679    13887500 :     v1 = u1;
     680    13887500 :     v2 = u2;
     681    13887500 :     v3 = u3;
     682             : 
     683    13887500 :     x = _mm256_mullo_epi32(u4, cospi16);
     684    13887500 :     y = _mm256_mullo_epi32(u5, cospi48);
     685    13887500 :     v4 = _mm256_add_epi32(x, y);
     686    13887500 :     v4 = _mm256_add_epi32(v4, rnding);
     687    27774900 :     v4 = _mm256_srai_epi32(v4, bit);
     688             : 
     689    13887500 :     x = _mm256_mullo_epi32(u4, cospi48);
     690    13887500 :     y = _mm256_mullo_epi32(u5, cospim16);
     691    13887500 :     v5 = _mm256_add_epi32(x, y);
     692    13887500 :     v5 = _mm256_add_epi32(v5, rnding);
     693    27774900 :     v5 = _mm256_srai_epi32(v5, bit);
     694             : 
     695    13887500 :     x = _mm256_mullo_epi32(u6, cospim48);
     696    13887500 :     y = _mm256_mullo_epi32(u7, cospi16);
     697    13887500 :     v6 = _mm256_add_epi32(x, y);
     698    13887500 :     v6 = _mm256_add_epi32(v6, rnding);
     699    27774900 :     v6 = _mm256_srai_epi32(v6, bit);
     700             : 
     701    13887500 :     x = _mm256_mullo_epi32(u6, cospi16);
     702    13887500 :     y = _mm256_mullo_epi32(u7, cospi48);
     703    13887500 :     v7 = _mm256_add_epi32(x, y);
     704    13887500 :     v7 = _mm256_add_epi32(v7, rnding);
     705    27774900 :     v7 = _mm256_srai_epi32(v7, bit);
     706             : 
     707             :     // stage 5
     708    13887500 :     u0 = _mm256_add_epi32(v0, v4);
     709    13887500 :     u1 = _mm256_add_epi32(v1, v5);
     710    13887500 :     u2 = _mm256_add_epi32(v2, v6);
     711    13887500 :     u3 = _mm256_add_epi32(v3, v7);
     712    13887500 :     u4 = _mm256_sub_epi32(v0, v4);
     713    13887500 :     u5 = _mm256_sub_epi32(v1, v5);
     714    13887500 :     u6 = _mm256_sub_epi32(v2, v6);
     715    13887500 :     u7 = _mm256_sub_epi32(v3, v7);
     716             : 
     717             :     // stage 6
     718    13887500 :     x = _mm256_mullo_epi32(u0, cospi4);
     719    13887500 :     y = _mm256_mullo_epi32(u1, cospi60);
     720    13887500 :     v0 = _mm256_add_epi32(x, y);
     721    13887500 :     v0 = _mm256_add_epi32(v0, rnding);
     722    27774900 :     v0 = _mm256_srai_epi32(v0, bit);
     723             : 
     724    13887500 :     x = _mm256_mullo_epi32(u0, cospi60);
     725    13887500 :     y = _mm256_mullo_epi32(u1, cospim4);
     726    13887500 :     v1 = _mm256_add_epi32(x, y);
     727    13887500 :     v1 = _mm256_add_epi32(v1, rnding);
     728    27774900 :     v1 = _mm256_srai_epi32(v1, bit);
     729             : 
     730    13887500 :     x = _mm256_mullo_epi32(u2, cospi20);
     731    13887500 :     y = _mm256_mullo_epi32(u3, cospi44);
     732    13887500 :     v2 = _mm256_add_epi32(x, y);
     733    13887500 :     v2 = _mm256_add_epi32(v2, rnding);
     734    27774900 :     v2 = _mm256_srai_epi32(v2, bit);
     735             : 
     736    13887500 :     x = _mm256_mullo_epi32(u2, cospi44);
     737    13887500 :     y = _mm256_mullo_epi32(u3, cospim20);
     738    13887500 :     v3 = _mm256_add_epi32(x, y);
     739    13887500 :     v3 = _mm256_add_epi32(v3, rnding);
     740    27774900 :     v3 = _mm256_srai_epi32(v3, bit);
     741             : 
     742    13887500 :     x = _mm256_mullo_epi32(u4, cospi36);
     743    13887500 :     y = _mm256_mullo_epi32(u5, cospi28);
     744    13887500 :     v4 = _mm256_add_epi32(x, y);
     745    13887500 :     v4 = _mm256_add_epi32(v4, rnding);
     746    27774900 :     v4 = _mm256_srai_epi32(v4, bit);
     747             : 
     748    13887500 :     x = _mm256_mullo_epi32(u4, cospi28);
     749    13887500 :     y = _mm256_mullo_epi32(u5, cospim36);
     750    13887500 :     v5 = _mm256_add_epi32(x, y);
     751    13887500 :     v5 = _mm256_add_epi32(v5, rnding);
     752    27774900 :     v5 = _mm256_srai_epi32(v5, bit);
     753             : 
     754    13887500 :     x = _mm256_mullo_epi32(u6, cospi52);
     755    13887500 :     y = _mm256_mullo_epi32(u7, cospi12);
     756    13887500 :     v6 = _mm256_add_epi32(x, y);
     757    13887500 :     v6 = _mm256_add_epi32(v6, rnding);
     758    27774900 :     v6 = _mm256_srai_epi32(v6, bit);
     759             : 
     760    13887500 :     x = _mm256_mullo_epi32(u6, cospi12);
     761    13887500 :     y = _mm256_mullo_epi32(u7, cospim52);
     762    13887500 :     v7 = _mm256_add_epi32(x, y);
     763    13887500 :     v7 = _mm256_add_epi32(v7, rnding);
     764    13887500 :     v7 = _mm256_srai_epi32(v7, bit);
     765             : 
     766             :     // stage 7
     767    13887500 :     out[0 * col_num] = v1;
     768    13887500 :     out[1 * col_num] = v6;
     769    13887500 :     out[2 * col_num] = v3;
     770    13887500 :     out[3 * col_num] = v4;
     771    13887500 :     out[4 * col_num] = v5;
     772    13887500 :     out[5 * col_num] = v2;
     773    13887500 :     out[6 * col_num] = v7;
     774    13887500 :     out[7 * col_num] = v0;
     775    13887500 : }
     776             : 
      777    19667600 : void eb_av1_fwd_txfm2d_8x8_avx2(int16_t *input, int32_t *coeff, uint32_t stride, TxType tx_type, uint8_t bd)
     778             : {
     779             :     __m256i in[8], out[8];
     780    19667600 :     const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
     781    19667600 :     const int32_t txw_idx = get_txw_idx(TX_8X8);
     782    19667000 :     const int32_t txh_idx = get_txh_idx(TX_8X8);
     783             : 
     784    19676900 :     switch (tx_type) {
     785    10384800 :     case DCT_DCT:
     786    10384800 :         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
     787    10386600 :         fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     788    10386300 :         col_txfm_8x8_rounding(out, -shift[1]);
     789    10386100 :         transpose_8x8_avx2(out, in);
     790    10386400 :         fdct8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     791    10386600 :         transpose_8x8_avx2(out, in);
     792    10386600 :         write_buffer_8x8(in, coeff);
     793    10386200 :         break;
     794     1232280 :     case ADST_DCT:
     795     1232280 :         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
     796     1232340 :         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     797     1232310 :         col_txfm_8x8_rounding(out, -shift[1]);
     798     1232310 :         transpose_8x8_avx2(out, in);
     799     1232310 :         fdct8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     800     1232320 :         transpose_8x8_avx2(out, in);
     801     1232320 :         write_buffer_8x8(in, coeff);
     802     1232310 :         break;
     803     1241180 :     case DCT_ADST:
     804     1241180 :         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
     805     1241250 :         fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     806     1241250 :         col_txfm_8x8_rounding(out, -shift[1]);
     807     1241250 :         transpose_8x8_avx2(out, in);
     808     1241260 :         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     809     1241240 :         transpose_8x8_avx2(out, in);
     810     1241240 :         write_buffer_8x8(in, coeff);
     811     1241240 :         break;
     812     1108260 :     case ADST_ADST:
     813     1108260 :         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
     814     1108310 :         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     815     1108310 :         col_txfm_8x8_rounding(out, -shift[1]);
     816     1108300 :         transpose_8x8_avx2(out, in);
     817     1108310 :         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     818     1108300 :         transpose_8x8_avx2(out, in);
     819     1108300 :         write_buffer_8x8(in, coeff);
     820     1108300 :         break;
     821      348711 :     case FLIPADST_DCT:
     822      348711 :         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
     823      348713 :         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     824      348712 :         col_txfm_8x8_rounding(out, -shift[1]);
     825      348713 :         transpose_8x8_avx2(out, in);
     826      348712 :         fdct8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     827      348712 :         transpose_8x8_avx2(out, in);
     828      348710 :         write_buffer_8x8(in, coeff);
     829      348710 :         break;
     830      348572 :     case DCT_FLIPADST:
     831      348572 :         load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
     832      348576 :         fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     833      348577 :         col_txfm_8x8_rounding(out, -shift[1]);
     834      348580 :         transpose_8x8_avx2(out, in);
     835      348580 :         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     836      348580 :         transpose_8x8_avx2(out, in);
     837      348579 :         write_buffer_8x8(in, coeff);
     838      348578 :         break;
     839      347389 :     case FLIPADST_FLIPADST:
     840      347389 :         load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
     841      347387 :         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     842      347390 :         col_txfm_8x8_rounding(out, -shift[1]);
     843      347388 :         transpose_8x8_avx2(out, in);
     844      347389 :         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     845      347389 :         transpose_8x8_avx2(out, in);
     846      347389 :         write_buffer_8x8(in, coeff);
     847      347388 :         break;
     848      351460 :     case ADST_FLIPADST:
     849      351460 :         load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
     850      351460 :         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     851      351463 :         col_txfm_8x8_rounding(out, -shift[1]);
     852      351463 :         transpose_8x8_avx2(out, in);
     853      351463 :         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     854      351464 :         transpose_8x8_avx2(out, in);
     855      351465 :         write_buffer_8x8(in, coeff);
     856      351464 :         break;
     857      351058 :     case FLIPADST_ADST:
     858      351058 :         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
     859      351062 :         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     860      351060 :         col_txfm_8x8_rounding(out, -shift[1]);
     861      351063 :         transpose_8x8_avx2(out, in);
     862      351063 :         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     863      351064 :         transpose_8x8_avx2(out, in);
     864      351063 :         write_buffer_8x8(in, coeff);
     865      351063 :         break;
     866      999908 :     case IDTX:
     867      999908 :         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
     868      999936 :         fidtx8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     869      999928 :         col_txfm_8x8_rounding(out, -shift[1]);
     870      999918 :         fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     871      999914 :         write_buffer_8x8(out, coeff);
     872      999917 :         break;
     873      935885 :     case V_DCT:
     874      935885 :         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
     875      935915 :         fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     876      935910 :         col_txfm_8x8_rounding(out, -shift[1]);
     877      935909 :         fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     878      935906 :         write_buffer_8x8(out, coeff);
     879      935902 :         break;
     880      984897 :     case H_DCT:
     881      984897 :         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
     882      984942 :         fidtx8x8_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     883      984931 :         col_txfm_8x8_rounding(in, -shift[1]);
     884      984926 :         transpose_8x8_avx2(in, out);
     885      984932 :         fdct8x8_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     886      984933 :         transpose_8x8_avx2(in, out);
     887      984937 :         write_buffer_8x8(out, coeff);
     888      984931 :         break;
     889      256910 :     case V_ADST:
     890      256910 :         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
     891      256912 :         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     892      256912 :         col_txfm_8x8_rounding(out, -shift[1]);
     893      256910 :         fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     894      256910 :         write_buffer_8x8(out, coeff);
     895      256909 :         break;
     896      265622 :     case H_ADST:
     897      265622 :         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
     898      265623 :         fidtx8x8_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     899      265623 :         col_txfm_8x8_rounding(in, -shift[1]);
     900      265623 :         transpose_8x8_avx2(in, out);
     901      265623 :         fadst8x8_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     902      265620 :         transpose_8x8_avx2(in, out);
     903      265621 :         write_buffer_8x8(out, coeff);
     904      265622 :         break;
     905      256028 :     case V_FLIPADST:
     906      256028 :         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
     907      256029 :         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     908      256029 :         col_txfm_8x8_rounding(out, -shift[1]);
     909      256030 :         fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     910      256029 :         write_buffer_8x8(out, coeff);
     911      256029 :         break;
     912      264031 :     case H_FLIPADST:
     913      264031 :         load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
     914      264030 :         fidtx8x8_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
     915      264030 :         col_txfm_8x8_rounding(in, -shift[1]);
     916      264030 :         transpose_8x8_avx2(in, out);
     917      264031 :         fadst8x8_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
     918      264031 :         transpose_8x8_avx2(in, out);
     919      264031 :         write_buffer_8x8(out, coeff);
     920      264031 :         break;
     921           0 :     default: assert(0);
     922             :     }
     923             :     (void)bd;
     924    19678600 : }
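                     : 
                     : // All TxType branches above share one skeleton: load (with optional
                     : // flips) -> column transform -> round -> transpose -> row transform ->
                     : // transpose -> store; IDTX and the V_* hybrids, whose row pass is the
                     : // identity, skip the transposes. A minimal scalar sketch of the rounding
                     : // step, assuming a positive shift (callers pass -shift[1] because
                     : // shift[1] is stored negative):
                     : static INLINE int32_t round_shift_c(int32_t x, int32_t shift) {
                     :     return (x + (1 << (shift - 1))) >> shift; // assumes arithmetic >> on negatives
                     : }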
     925             : 
     926    11158200 : static INLINE void convert_8x8_to_16x16(const __m256i *in, __m256i *out) {
     927    11158200 :     int32_t row_index = 0;
     928    11158200 :     int32_t dst_index = 0;
     929    11158200 :     int32_t src_index = 0;
     930             : 
     931             :     // row 0, 1, ..., 7
     932             :     do {
     933    89248100 :         out[dst_index] = in[src_index];
     934    89248100 :         out[dst_index + 1] = in[src_index + 8];
     935    89248100 :         dst_index += 2;
     936    89248100 :         src_index += 1;
     937    89248100 :         row_index += 1;
     938    89248100 :     } while (row_index < 8);
     939             : 
     940             :     // row 8, 9, ..., 15
     941    11158200 :     src_index += 8;
     942             :     do {
     943    89237900 :         out[dst_index] = in[src_index];
     944    89237900 :         out[dst_index + 1] = in[src_index + 8];
     945    89237900 :         dst_index += 2;
     946    89237900 :         src_index += 1;
     947    89237900 :         row_index += 1;
     948    89237900 :     } while (row_index < 16);
     949    11158200 : }
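                     : 
                     : // Equivalent plain-loop form of the interleave above (sketch): row r of
                     : // the 16x16 block is the vector pair (out[2r], out[2r+1]); rows 0-7 pull
                     : // from the two top 8x8 blocks, rows 8-15 from the two bottom ones.
                     : static INLINE void convert_8x8_to_16x16_c(const __m256i *in, __m256i *out) {
                     :     for (int32_t r = 0; r < 16; ++r) {
                     :         const int32_t src = (r < 8) ? r : r + 8; // skip past the top-right block
                     :         out[2 * r] = in[src];         // left 8 lanes of row r
                     :         out[2 * r + 1] = in[src + 8]; // right 8 lanes of row r
                     :     }
                     : }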
     950             : 
     951    11155200 : static INLINE void load_buffer_16x16(const int16_t *input, __m256i *out,
     952             :     int32_t stride, int32_t flipud, int32_t fliplr, int32_t shift) {
     953             :     __m256i in[32];
     954             :     // Load 4 8x8 blocks
     955    11155200 :     const int16_t *topL = input;
     956    11155200 :     const int16_t *topR = input + 8;
     957    11155200 :     const int16_t *botL = input + 8 * stride;
     958    11155200 :     const int16_t *botR = input + 8 * stride + 8;
     959             : 
     960             :     const int16_t *tmp;
     961             : 
     962    11155200 :     if (flipud) {
     963             :         // Vertical flip: swap top and bottom 8x8 blocks within the left half
     964      387231 :         tmp = topL;
     965      387231 :         topL = botL;
     966      387231 :         botL = tmp;
     967             :         // Vertical flip: swap top and bottom 8x8 blocks within the right half
     968      387231 :         tmp = topR;
     969      387231 :         topR = botR;
     970      387231 :         botR = tmp;
     971             :     }
     972             : 
     973    11155200 :     if (fliplr) {
     974             :         // Horizontal flip: swap left and right 8x8 blocks within the top half
     975      387043 :         tmp = topL;
     976      387043 :         topL = topR;
     977      387043 :         topR = tmp;
     978             :         // Horizontal flip: swap left and right 8x8 blocks within the bottom half
     979      387043 :         tmp = botL;
     980      387043 :         botL = botR;
     981      387043 :         botR = tmp;
     982             :     }
     983             : 
     984             :     // load first 8 columns
     985    11155200 :     load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
     986    11157400 :     load_buffer_8x8(botL, &in[16], stride, flipud, fliplr, shift);
     987             : 
     988             :     // load second 8 columns
     989    11158200 :     load_buffer_8x8(topR, &in[8], stride, flipud, fliplr, shift);
     990    11158000 :     load_buffer_8x8(botR, &in[24], stride, flipud, fliplr, shift);
     991             : 
     992    11158000 :     convert_8x8_to_16x16(in, out);
     993    11156600 : }
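                     : 
                     : // Why the pointer swaps above suffice: swapping the four 8x8 block
                     : // pointers reverses the blocks coarsely, and passing the same flipud /
                     : // fliplr flags down into load_buffer_8x8 reverses the rows / lanes inside
                     : // each block, so the two levels compose into a full 16x16 flip.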
     994             : 
     995    31536600 : static INLINE void col_txfm_16x16_rounding(__m256i *in, int32_t shift) {
     996    31536600 :     col_txfm_8x8_rounding(&in[0], shift);
     997    31539100 :     col_txfm_8x8_rounding(&in[8], shift);
     998    31539700 :     col_txfm_8x8_rounding(&in[16], shift);
     999    31539900 :     col_txfm_8x8_rounding(&in[24], shift);
    1000    31542400 : }
    1001             : 
    1002     2550280 : static void fidtx16x16_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
    1003             :     (void)bit;
    1004     2550280 :     const int32_t bits = 12;       // NewSqrt2Bits = 12
    1005     2550280 :     const int32_t sqrt = 2 * 5793; // 2 * NewSqrt2
    1006     2550280 :     const __m256i newsqrt = _mm256_set1_epi32(sqrt);
    1007     2550280 :     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
    1008             :     __m256i temp;
    1009     2550280 :     int32_t num_iters = 16 * col_num;
    1010    68711300 :     for (int32_t i = 0; i < num_iters; i++) {
    1011   132322000 :         temp = _mm256_mullo_epi32(in[i], newsqrt);
    1012    66161100 :         temp = _mm256_add_epi32(temp, rounding);
    1013   132322000 :         out[i] = _mm256_srai_epi32(temp, bits);
    1014             :     }
    1015     2550280 : }
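                     : 
                     : // Scalar model of the identity scaling above (sketch; same 32-bit range
                     : // and arithmetic-shift assumptions as the vector code): each coefficient
                     : // is scaled by 2 * NewSqrt2 / 2^NewSqrt2Bits, i.e. by about 2 * sqrt(2).
                     : static INLINE int32_t fidtx16_scale_c(int32_t x) {
                     :     return (x * (2 * 5793) + (1 << 11)) >> 12;
                     : }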
    1016             : 
    1017     7144750 : static INLINE void write_buffer_16x16(const __m256i *res, int32_t *output) {
    1018     7144750 :     int32_t fact = -1, index = -1;
    1019    64273700 :     for (int32_t i = 0; i < 8; i++)
    1020             :     {
    1021    57128900 :         _mm256_store_si256((__m256i *)(output + (++fact) * 16), res[++index]);
    1022    57128900 :         _mm256_store_si256((__m256i *)(output + (fact) * 16 + 8), res[++index]);
    1023    57128900 :         _mm256_store_si256((__m256i *)(output + (++fact) * 16), res[++index]);
    1024    57128900 :         _mm256_store_si256((__m256i *)(output + (fact) * 16 + 8), res[++index]);
    1025             :     }
    1026     7144750 : }
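                     : 
                     : // The pre-increment bookkeeping above unrolls two rows per iteration; an
                     : // equivalent straight-line form (sketch):
                     : static INLINE void write_buffer_16x16_c(const __m256i *res, int32_t *output) {
                     :     for (int32_t row = 0; row < 16; ++row) {
                     :         _mm256_store_si256((__m256i *)(output + row * 16), res[2 * row]);
                     :         _mm256_store_si256((__m256i *)(output + row * 16 + 8), res[2 * row + 1]);
                     :     }
                     : }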
    1027             : 
    1028    12675600 : static INLINE void fdct4x8_row_avx2(__m256i *input, __m256i *output, int32_t bit,
    1029             :     const int32_t num_col) {
    1030    12675600 :     const int32_t *cospi = cospi_arr(bit);
    1031    12675300 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    1032    12675300 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    1033    12675300 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    1034    12675300 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    1035             :     __m256i in[4];
    1036             :     __m256i out[4];
    1037             :     __m256i s0, s1, s2, s3;
    1038             :     __m256i u0, u1, u2, u3;
    1039             :     __m256i v0, v1, v2, v3;
    1040    12675300 :     int32_t endidx = 3 * num_col;
    1041             : 
    1042    12675300 :     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
    1043    12675300 :     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
    1044    12675300 :     in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
    1045    12675300 :     in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
    1046             : 
    1047    12675300 :     s0 = _mm256_add_epi32(in[0], in[endidx]);
    1048    12675300 :     s3 = _mm256_sub_epi32(in[0], in[endidx]);
    1049    12675300 :     endidx -= num_col;
    1050    12675300 :     s1 = _mm256_add_epi32(in[num_col], in[endidx]);
    1051    25350700 :     s2 = _mm256_sub_epi32(in[num_col], in[endidx]);
    1052             : 
    1053             :     // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
    1054    12675300 :     u0 = _mm256_mullo_epi32(s0, cospi32);
    1055    12675300 :     u1 = _mm256_mullo_epi32(s1, cospi32);
    1056    12675300 :     u2 = _mm256_add_epi32(u0, u1);
    1057    12675300 :     v0 = _mm256_sub_epi32(u0, u1);
    1058             : 
    1059    12675300 :     u3 = _mm256_add_epi32(u2, rnding);
    1060    12675300 :     v1 = _mm256_add_epi32(v0, rnding);
    1061             : 
    1062    12675300 :     u0 = _mm256_srai_epi32(u3, bit);
    1063    12675300 :     u2 = _mm256_srai_epi32(v1, bit);
    1064             : 
    1065             :     // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
    1066    12675300 :     v0 = _mm256_mullo_epi32(s2, cospi48);
    1067    12675300 :     v1 = _mm256_mullo_epi32(s3, cospi16);
    1068    12675300 :     v2 = _mm256_add_epi32(v0, v1);
    1069             : 
    1070    12675300 :     v3 = _mm256_add_epi32(v2, rnding);
    1071    12675300 :     u1 = _mm256_srai_epi32(v3, bit);
    1072             : 
    1073    12675300 :     v0 = _mm256_mullo_epi32(s2, cospi16);
    1074    12675300 :     v1 = _mm256_mullo_epi32(s3, cospi48);
    1075    12675300 :     v2 = _mm256_sub_epi32(v1, v0);
    1076             : 
    1077    12675300 :     v3 = _mm256_add_epi32(v2, rnding);
    1078    12675300 :     u3 = _mm256_srai_epi32(v3, bit);
    1079             : 
    1080             :     // Note: shift[1] and shift[2] are zeros
    1081             : 
    1082             :     // Transpose 4x4 32-bit
    1083    12675300 :     v0 = _mm256_unpacklo_epi32(u0, u1);
    1084    12675300 :     v1 = _mm256_unpackhi_epi32(u0, u1);
    1085    12675300 :     v2 = _mm256_unpacklo_epi32(u2, u3);
    1086    12675300 :     v3 = _mm256_unpackhi_epi32(u2, u3);
    1087             : 
    1088    12675300 :     out[0] = _mm256_unpacklo_epi64(v0, v2);
    1089    12675300 :     out[1] = _mm256_unpackhi_epi64(v0, v2);
    1090    12675300 :     out[2] = _mm256_unpacklo_epi64(v1, v3);
    1091    12675300 :     out[3] = _mm256_unpackhi_epi64(v1, v3);
    1092             : 
    1093    12675300 :     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
    1094    12675300 :     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
    1095    12675300 :     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
    1096    12675300 :     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
    1097    12675300 : }
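                     : 
                     : // Single-lane model of the 4-point butterfly above (sketch; cospi[] as
                     : // returned by cospi_arr(bit), with the same 32-bit range and
                     : // arithmetic-shift assumptions as the vector code):
                     : static INLINE void fdct4_c(const int32_t in[4], int32_t out[4],
                     :     const int32_t *cospi, int32_t bit) {
                     :     const int32_t rnd = 1 << (bit - 1);
                     :     const int32_t s0 = in[0] + in[3], s3 = in[0] - in[3];
                     :     const int32_t s1 = in[1] + in[2], s2 = in[1] - in[2];
                     :     out[0] = (s0 * cospi[32] + s1 * cospi[32] + rnd) >> bit;
                     :     out[2] = (s0 * cospi[32] - s1 * cospi[32] + rnd) >> bit;
                     :     out[1] = (s2 * cospi[48] + s3 * cospi[16] + rnd) >> bit;
                     :     out[3] = (s3 * cospi[48] - s2 * cospi[16] + rnd) >> bit;
                     : }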
    1098             : 
    1099    12716600 : static INLINE void fdct4x8_col_avx2(__m256i *in, __m256i *output, int32_t bit,
    1100             :     const int32_t num_col) {
    1101    12716600 :     const int32_t *cospi = cospi_arr(bit);
    1102    12716100 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    1103    12716100 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    1104    12716100 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    1105    12716100 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    1106             :     __m256i s0, s1, s2, s3;
    1107             :     __m256i u0, u1, u2, u3;
    1108             :     __m256i v0, v1, v2, v3;
    1109             :     __m256i out[4];
    1110             : 
    1111    12716100 :     int32_t endidx = 3 * num_col;
    1112    12716100 :     s0 = _mm256_add_epi32(in[0], in[endidx]);
    1113    12716100 :     s3 = _mm256_sub_epi32(in[0], in[endidx]);
    1114    12716100 :     endidx -= num_col;
    1115    12716100 :     s1 = _mm256_add_epi32(in[num_col], in[endidx]);
    1116    25432200 :     s2 = _mm256_sub_epi32(in[num_col], in[endidx]);
    1117             : 
    1118             :     // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
    1119    12716100 :     u0 = _mm256_mullo_epi32(s0, cospi32);
    1120    12716100 :     u1 = _mm256_mullo_epi32(s1, cospi32);
    1121    12716100 :     u2 = _mm256_add_epi32(u0, u1);
    1122    12716100 :     v0 = _mm256_sub_epi32(u0, u1);
    1123             : 
    1124    12716100 :     u3 = _mm256_add_epi32(u2, rnding);
    1125    12716100 :     v1 = _mm256_add_epi32(v0, rnding);
    1126             : 
    1127    12716100 :     u0 = _mm256_srai_epi32(u3, bit);
    1128    12716100 :     u2 = _mm256_srai_epi32(v1, bit);
    1129             : 
    1130             :     // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
    1131    12716100 :     v0 = _mm256_mullo_epi32(s2, cospi48);
    1132    12716100 :     v1 = _mm256_mullo_epi32(s3, cospi16);
    1133    12716100 :     v2 = _mm256_add_epi32(v0, v1);
    1134             : 
    1135    12716100 :     v3 = _mm256_add_epi32(v2, rnding);
    1136    12716100 :     u1 = _mm256_srai_epi32(v3, bit);
    1137             : 
    1138    12716100 :     v0 = _mm256_mullo_epi32(s2, cospi16);
    1139    12716100 :     v1 = _mm256_mullo_epi32(s3, cospi48);
    1140    12716100 :     v2 = _mm256_sub_epi32(v1, v0);
    1141             : 
    1142    12716100 :     v3 = _mm256_add_epi32(v2, rnding);
    1143    12716100 :     u3 = _mm256_srai_epi32(v3, bit);
    1144             : 
    1145             :     // Note: shift[1] and shift[2] are zeros
    1146             : 
    1147             :     // Transpose 4x4 32-bit
    1148    12716100 :     v0 = _mm256_unpacklo_epi32(u0, u1);
    1149    12716100 :     v1 = _mm256_unpackhi_epi32(u0, u1);
    1150    12716100 :     v2 = _mm256_unpacklo_epi32(u2, u3);
    1151    12716100 :     v3 = _mm256_unpackhi_epi32(u2, u3);
    1152             : 
    1153    12716100 :     out[0] = _mm256_unpacklo_epi64(v0, v2);
    1154    12716100 :     out[1] = _mm256_unpackhi_epi64(v0, v2);
    1155    12716100 :     out[2] = _mm256_unpacklo_epi64(v1, v3);
    1156    12716100 :     out[3] = _mm256_unpackhi_epi64(v1, v3);
    1157             : 
    1158    12716100 :     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
    1159    12716100 :     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
    1160    12716100 :     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
    1161    12716100 :     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
    1162    12716100 : }
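                     : 
                     : // fdct4x8_col_avx2 is the same butterfly as fdct4x8_row_avx2 minus the
                     : // entry permutes: the column path already holds one 4-point row per
                     : // 128-bit lane, so no cross-lane gather is needed before stage 1.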
    1163             : 
    1164     7345900 : static INLINE void fdct16x4_avx2(__m256i *input, __m256i *output, int32_t bit) {
    1165     7345900 :     __m128i *in = (__m128i *)input;
    1166     7345900 :     __m128i *out = (__m128i *)output;
    1167             : 
    1168     7345900 :     const int32_t *cospi = cospi_arr(bit);
    1169     7345760 :     const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
    1170     7345760 :     const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
    1171     7345760 :     const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
    1172     7345760 :     const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
    1173     7345760 :     const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
    1174     7345760 :     const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
    1175     7345760 :     const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
    1176     7345760 :     const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
    1177     7345760 :     const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
    1178     7345760 :     const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
    1179     7345760 :     const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
    1180     7345760 :     const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
    1181     7345760 :     const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
    1182     7345760 :     const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
    1183     7345760 :     const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
    1184     7345760 :     const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
    1185     7345760 :     const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
    1186     7345760 :     const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
    1187     7345760 :     const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    1188             :     __m128i u[16], v[16], x;
    1189             : 
    1190             :     // stage 0
    1191             :     // stage 1
    1192     7345760 :     u[0] = _mm_add_epi32(in[0], in[15]);
    1193     7345760 :     v[15] = _mm_sub_epi32(in[0], in[15]);
    1194     7345760 :     u[1] = _mm_add_epi32(in[1], in[14]);
    1195     7345760 :     v[14] = _mm_sub_epi32(in[1], in[14]);
    1196     7345760 :     u[2] = _mm_add_epi32(in[2], in[13]);
    1197     7345760 :     u[13] = _mm_sub_epi32(in[2], in[13]);
    1198     7345760 :     u[3] = _mm_add_epi32(in[3], in[12]);
    1199     7345760 :     u[12] = _mm_sub_epi32(in[3], in[12]);
    1200     7345760 :     u[4] = _mm_add_epi32(in[4], in[11]);
    1201     7345760 :     u[11] = _mm_sub_epi32(in[4], in[11]);
    1202     7345760 :     u[5] = _mm_add_epi32(in[5], in[10]);
    1203     7345760 :     u[10] = _mm_sub_epi32(in[5], in[10]);
    1204     7345760 :     u[6] = _mm_add_epi32(in[6], in[9]);
    1205     7345760 :     v[9] = _mm_sub_epi32(in[6], in[9]);
    1206     7345760 :     u[7] = _mm_add_epi32(in[7], in[8]);
    1207     7345760 :     v[8] = _mm_sub_epi32(in[7], in[8]);
    1208             : 
    1209             :     // stage 2
    1210     7345760 :     v[0] = _mm_add_epi32(u[0], u[7]);
    1211     7345760 :     u[7] = _mm_sub_epi32(u[0], u[7]);
    1212     7345760 :     v[1] = _mm_add_epi32(u[1], u[6]);
    1213     7345760 :     v[6] = _mm_sub_epi32(u[1], u[6]);
    1214     7345760 :     v[2] = _mm_add_epi32(u[2], u[5]);
    1215     7345760 :     v[5] = _mm_sub_epi32(u[2], u[5]);
    1216     7345760 :     v[3] = _mm_add_epi32(u[3], u[4]);
    1217     7345760 :     u[4] = _mm_sub_epi32(u[3], u[4]);
    1218             : 
    1219     7345760 :     v[10] = _mm_mullo_epi32(u[10], cospim32);
    1220     7345760 :     x = _mm_mullo_epi32(u[13], cospi32);
    1221     7345760 :     v[10] = _mm_add_epi32(v[10], x);
    1222     7345760 :     v[10] = _mm_add_epi32(v[10], rnding);
    1223     7345760 :     v[10] = _mm_srai_epi32(v[10], bit);
    1224             : 
    1225     7345760 :     v[13] = _mm_mullo_epi32(u[10], cospi32);
    1226     7345760 :     x = _mm_mullo_epi32(u[13], cospim32);
    1227     7345760 :     v[13] = _mm_sub_epi32(v[13], x);
    1228     7345760 :     v[13] = _mm_add_epi32(v[13], rnding);
    1229     7345760 :     v[13] = _mm_srai_epi32(v[13], bit);
    1230             : 
    1231     7345760 :     v[11] = _mm_mullo_epi32(u[11], cospim32);
    1232     7345760 :     x = _mm_mullo_epi32(u[12], cospi32);
    1233     7345760 :     v[11] = _mm_add_epi32(v[11], x);
    1234     7345760 :     v[11] = _mm_add_epi32(v[11], rnding);
    1235     7345760 :     v[11] = _mm_srai_epi32(v[11], bit);
    1236             : 
    1237     7345760 :     v[12] = _mm_mullo_epi32(u[11], cospi32);
    1238     7345760 :     x = _mm_mullo_epi32(u[12], cospim32);
    1239     7345760 :     v[12] = _mm_sub_epi32(v[12], x);
    1240     7345760 :     v[12] = _mm_add_epi32(v[12], rnding);
    1241     7345760 :     v[12] = _mm_srai_epi32(v[12], bit);
    1242             : 
    1243             :     // stage 3
    1244     7345760 :     u[0] = _mm_add_epi32(v[0], v[3]);
    1245     7345760 :     u[3] = _mm_sub_epi32(v[0], v[3]);
    1246     7345760 :     u[1] = _mm_add_epi32(v[1], v[2]);
    1247     7345760 :     u[2] = _mm_sub_epi32(v[1], v[2]);
    1248             : 
    1249     7345760 :     u[5] = _mm_mullo_epi32(v[5], cospim32);
    1250     7345760 :     x = _mm_mullo_epi32(v[6], cospi32);
    1251     7345760 :     u[5] = _mm_add_epi32(u[5], x);
    1252     7345760 :     u[5] = _mm_add_epi32(u[5], rnding);
    1253     7345760 :     u[5] = _mm_srai_epi32(u[5], bit);
    1254             : 
    1255     7345760 :     u[6] = _mm_mullo_epi32(v[5], cospi32);
    1256     7345760 :     x = _mm_mullo_epi32(v[6], cospim32);
    1257     7345760 :     u[6] = _mm_sub_epi32(u[6], x);
    1258     7345760 :     u[6] = _mm_add_epi32(u[6], rnding);
    1259     7345760 :     u[6] = _mm_srai_epi32(u[6], bit);
    1260             : 
    1261     7345760 :     u[8] = _mm_add_epi32(v[8], v[11]);
    1262     7345760 :     v[11] = _mm_sub_epi32(v[8], v[11]);
    1263     7345760 :     u[9] = _mm_add_epi32(v[9], v[10]);
    1264     7345760 :     u[10] = _mm_sub_epi32(v[9], v[10]);
    1265     7345760 :     u[12] = _mm_sub_epi32(v[15], v[12]);
    1266     7345760 :     v[15] = _mm_add_epi32(v[15], v[12]);
    1267     7345760 :     u[13] = _mm_sub_epi32(v[14], v[13]);
    1268     7345760 :     u[14] = _mm_add_epi32(v[14], v[13]);
    1269             : 
    1270             :     // stage 4
    1271     7345760 :     u[0] = _mm_mullo_epi32(u[0], cospi32);
    1272     7345760 :     u[1] = _mm_mullo_epi32(u[1], cospi32);
    1273     7345760 :     v[0] = _mm_add_epi32(u[0], u[1]);
    1274     7345760 :     v[0] = _mm_add_epi32(v[0], rnding);
    1275     7345760 :     out[0] = _mm_srai_epi32(v[0], bit);
    1276             : 
    1277     7345760 :     v[1] = _mm_sub_epi32(u[0], u[1]);
    1278     7345760 :     v[1] = _mm_add_epi32(v[1], rnding);
    1279     7345760 :     out[8] = _mm_srai_epi32(v[1], bit);
    1280             : 
    1281     7345760 :     v[2] = _mm_mullo_epi32(u[2], cospi48);
    1282     7345760 :     x = _mm_mullo_epi32(u[3], cospi16);
    1283     7345760 :     v[2] = _mm_add_epi32(v[2], x);
    1284     7345760 :     v[2] = _mm_add_epi32(v[2], rnding);
    1285     7345760 :     out[4] = _mm_srai_epi32(v[2], bit);
    1286             : 
    1287     7345760 :     v[3] = _mm_mullo_epi32(u[2], cospi16);
    1288     7345760 :     x = _mm_mullo_epi32(u[3], cospi48);
    1289     7345760 :     v[3] = _mm_sub_epi32(x, v[3]);
    1290     7345760 :     v[3] = _mm_add_epi32(v[3], rnding);
    1291     7345760 :     out[12] = _mm_srai_epi32(v[3], bit);
    1292             : 
    1293     7345760 :     v[4] = _mm_add_epi32(u[4], u[5]);
    1294     7345760 :     v[5] = _mm_sub_epi32(u[4], u[5]);
    1295     7345760 :     v[6] = _mm_sub_epi32(u[7], u[6]);
    1296     7345760 :     v[7] = _mm_add_epi32(u[7], u[6]);
    1297     7345760 :     v[8] = u[8];
    1298             : 
    1299     7345760 :     v[9] = _mm_mullo_epi32(u[9], cospim16);
    1300     7345760 :     x = _mm_mullo_epi32(u[14], cospi48);
    1301     7345760 :     v[9] = _mm_add_epi32(v[9], x);
    1302     7345760 :     v[9] = _mm_add_epi32(v[9], rnding);
    1303     7345760 :     v[9] = _mm_srai_epi32(v[9], bit);
    1304             : 
    1305     7345760 :     v[14] = _mm_mullo_epi32(u[9], cospi48);
    1306     7345760 :     x = _mm_mullo_epi32(u[14], cospim16);
    1307     7345760 :     v[14] = _mm_sub_epi32(v[14], x);
    1308     7345760 :     v[14] = _mm_add_epi32(v[14], rnding);
    1309     7345760 :     v[14] = _mm_srai_epi32(v[14], bit);
    1310             : 
    1311     7345760 :     v[10] = _mm_mullo_epi32(u[10], cospim48);
    1312     7345760 :     x = _mm_mullo_epi32(u[13], cospim16);
    1313     7345760 :     v[10] = _mm_add_epi32(v[10], x);
    1314     7345760 :     v[10] = _mm_add_epi32(v[10], rnding);
    1315     7345760 :     v[10] = _mm_srai_epi32(v[10], bit);
    1316             : 
    1317     7345760 :     v[13] = _mm_mullo_epi32(u[10], cospim16);
    1318     7345760 :     x = _mm_mullo_epi32(u[13], cospim48);
    1319     7345760 :     v[13] = _mm_sub_epi32(v[13], x);
    1320     7345760 :     v[13] = _mm_add_epi32(v[13], rnding);
    1321     7345760 :     v[13] = _mm_srai_epi32(v[13], bit);
    1322             : 
    1323     7345760 :     v[12] = u[12];
    1324             : 
    1325             :     // stage 5
    1326     7345760 :     u[4] = _mm_mullo_epi32(v[4], cospi56);
    1327     7345760 :     x = _mm_mullo_epi32(v[7], cospi8);
    1328     7345760 :     u[4] = _mm_add_epi32(u[4], x);
    1329     7345760 :     u[4] = _mm_add_epi32(u[4], rnding);
    1330     7345760 :     out[2] = _mm_srai_epi32(u[4], bit);
    1331             : 
    1332     7345760 :     u[7] = _mm_mullo_epi32(v[4], cospi8);
    1333     7345760 :     x = _mm_mullo_epi32(v[7], cospi56);
    1334     7345760 :     u[7] = _mm_sub_epi32(x, u[7]);
    1335     7345760 :     u[7] = _mm_add_epi32(u[7], rnding);
    1336     7345760 :     out[14] = _mm_srai_epi32(u[7], bit);
    1337             : 
    1338     7345760 :     u[5] = _mm_mullo_epi32(v[5], cospi24);
    1339     7345760 :     x = _mm_mullo_epi32(v[6], cospi40);
    1340     7345760 :     u[5] = _mm_add_epi32(u[5], x);
    1341     7345760 :     u[5] = _mm_add_epi32(u[5], rnding);
    1342     7345760 :     out[10] = _mm_srai_epi32(u[5], bit);
    1343             : 
    1344     7345760 :     u[6] = _mm_mullo_epi32(v[5], cospi40);
    1345     7345760 :     x = _mm_mullo_epi32(v[6], cospi24);
    1346     7345760 :     u[6] = _mm_sub_epi32(x, u[6]);
    1347     7345760 :     u[6] = _mm_add_epi32(u[6], rnding);
    1348     7345760 :     out[6] = _mm_srai_epi32(u[6], bit);
    1349             : 
    1350     7345760 :     u[8] = _mm_add_epi32(v[8], v[9]);
    1351     7345760 :     u[9] = _mm_sub_epi32(v[8], v[9]);
    1352     7345760 :     u[10] = _mm_sub_epi32(v[11], v[10]);
    1353     7345760 :     u[11] = _mm_add_epi32(v[11], v[10]);
    1354     7345760 :     u[12] = _mm_add_epi32(v[12], v[13]);
    1355     7345760 :     u[13] = _mm_sub_epi32(v[12], v[13]);
    1356     7345760 :     u[14] = _mm_sub_epi32(v[15], v[14]);
    1357     7345760 :     u[15] = _mm_add_epi32(v[15], v[14]);
    1358             : 
    1359             :     // stage 6
    1360     7345760 :     v[8] = _mm_mullo_epi32(u[8], cospi60);
    1361     7345760 :     x = _mm_mullo_epi32(u[15], cospi4);
    1362     7345760 :     v[8] = _mm_add_epi32(v[8], x);
    1363     7345760 :     v[8] = _mm_add_epi32(v[8], rnding);
    1364     7345760 :     out[1] = _mm_srai_epi32(v[8], bit);
    1365             : 
    1366     7345760 :     v[15] = _mm_mullo_epi32(u[8], cospi4);
    1367     7345760 :     x = _mm_mullo_epi32(u[15], cospi60);
    1368     7345760 :     v[15] = _mm_sub_epi32(x, v[15]);
    1369     7345760 :     v[15] = _mm_add_epi32(v[15], rnding);
    1370     7345760 :     out[15] = _mm_srai_epi32(v[15], bit);
    1371             : 
    1372     7345760 :     v[9] = _mm_mullo_epi32(u[9], cospi28);
    1373     7345760 :     x = _mm_mullo_epi32(u[14], cospi36);
    1374     7345760 :     v[9] = _mm_add_epi32(v[9], x);
    1375     7345760 :     v[9] = _mm_add_epi32(v[9], rnding);
    1376     7345760 :     out[9] = _mm_srai_epi32(v[9], bit);
    1377             : 
    1378     7345760 :     v[14] = _mm_mullo_epi32(u[9], cospi36);
    1379     7345760 :     x = _mm_mullo_epi32(u[14], cospi28);
    1380     7345760 :     v[14] = _mm_sub_epi32(x, v[14]);
    1381     7345760 :     v[14] = _mm_add_epi32(v[14], rnding);
    1382     7345760 :     out[7] = _mm_srai_epi32(v[14], bit);
    1383             : 
    1384     7345760 :     v[10] = _mm_mullo_epi32(u[10], cospi44);
    1385     7345760 :     x = _mm_mullo_epi32(u[13], cospi20);
    1386     7345760 :     v[10] = _mm_add_epi32(v[10], x);
    1387     7345760 :     v[10] = _mm_add_epi32(v[10], rnding);
    1388     7345760 :     out[5] = _mm_srai_epi32(v[10], bit);
    1389             : 
    1390     7345760 :     v[13] = _mm_mullo_epi32(u[10], cospi20);
    1391     7345760 :     x = _mm_mullo_epi32(u[13], cospi44);
    1392     7345760 :     v[13] = _mm_sub_epi32(x, v[13]);
    1393     7345760 :     v[13] = _mm_add_epi32(v[13], rnding);
    1394     7345760 :     out[11] = _mm_srai_epi32(v[13], bit);
    1395             : 
    1396     7345760 :     v[11] = _mm_mullo_epi32(u[11], cospi12);
    1397     7345760 :     x = _mm_mullo_epi32(u[12], cospi52);
    1398     7345760 :     v[11] = _mm_add_epi32(v[11], x);
    1399     7345760 :     v[11] = _mm_add_epi32(v[11], rnding);
    1400     7345760 :     out[13] = _mm_srai_epi32(v[11], bit);
    1401             : 
    1402     7345760 :     v[12] = _mm_mullo_epi32(u[11], cospi52);
    1403     7345760 :     x = _mm_mullo_epi32(u[12], cospi12);
    1404     7345760 :     v[12] = _mm_sub_epi32(x, v[12]);
    1405     7345760 :     v[12] = _mm_add_epi32(v[12], rnding);
    1406     7345760 :     out[3] = _mm_srai_epi32(v[12], bit);
    1407     7345760 : }
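                     : 
                     : // Every rotation in the stages above is one "half butterfly". A scalar
                     : // sketch of that primitive (mirroring half_btf_small, with a 64-bit
                     : // intermediate for safety):
                     : static INLINE int32_t half_btf_c(int32_t w0, int32_t in0, int32_t w1,
                     :     int32_t in1, int32_t bit) {
                     :     const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
                     :     return (int32_t)((sum + (1 << (bit - 1))) >> bit);
                     : }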
    1408             : 
    1409     3275680 : static INLINE void fadst8x4_avx2(__m256i *input, __m256i *output, int32_t bit,
    1410             :     const int32_t col_num) {
    1411     3275680 :     __m128i *in = (__m128i *)input;
    1412     3275680 :     __m128i *out = (__m128i *)output;
    1413     3275680 :     const int32_t *cospi = cospi_arr(bit);
    1414     3275650 :     const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
    1415     3275650 :     const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
    1416     3275650 :     const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
    1417     3275650 :     const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
    1418     3275650 :     const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
    1419     3275650 :     const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
    1420     3275650 :     const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
    1421     3275650 :     const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
    1422     3275650 :     const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
    1423     3275650 :     const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
    1424     3275650 :     const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
    1425     3275650 :     const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
    1426     3275650 :     const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
    1427     3275650 :     const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
    1428     3275650 :     const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
    1429     3275650 :     const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
    1430     3275650 :     const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
    1431     6551310 :     const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    1432     3275650 :     const __m128i zero = _mm_setzero_si128();
    1433             :     __m128i u0, u1, u2, u3, u4, u5, u6, u7;
    1434             :     __m128i v0, v1, v2, v3, v4, v5, v6, v7;
    1435             :     __m128i x, y;
    1436             :     int32_t col;
    1437             : 
    1438             :     // Note:
    1439             :     //  each loop iteration runs one 8-point ADST over a 4-lane __m128i
    1440             :     //  column group; col_num is the number of such groups, and
    1441             :     //  in[col_num * r + col] addresses row r of group col.
    1443     6551390 :     for (col = 0; col < col_num; ++col) {
    1444             :         // stage 0
    1445             :         // stage 1
    1446     3275740 :         u0 = in[col_num * 0 + col];
    1447     3275740 :         u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
    1448     3275740 :         u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
    1449     3275740 :         u3 = in[col_num * 4 + col];
    1450     3275740 :         u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
    1451     3275740 :         u5 = in[col_num * 6 + col];
    1452     3275740 :         u6 = in[col_num * 2 + col];
    1453     3275740 :         u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);
    1454             : 
    1455             :         // stage 2
    1456     3275740 :         v0 = u0;
    1457     3275740 :         v1 = u1;
    1458             : 
    1459     3275740 :         x = _mm_mullo_epi32(u2, cospi32);
    1460     3275740 :         y = _mm_mullo_epi32(u3, cospi32);
    1461     3275740 :         v2 = _mm_add_epi32(x, y);
    1462     3275740 :         v2 = _mm_add_epi32(v2, rnding);
    1463     3275740 :         v2 = _mm_srai_epi32(v2, bit);
    1464             : 
    1465     3275740 :         v3 = _mm_sub_epi32(x, y);
    1466     3275740 :         v3 = _mm_add_epi32(v3, rnding);
    1467     3275740 :         v3 = _mm_srai_epi32(v3, bit);
    1468             : 
    1469     3275740 :         v4 = u4;
    1470     3275740 :         v5 = u5;
    1471             : 
    1472     3275740 :         x = _mm_mullo_epi32(u6, cospi32);
    1473     3275740 :         y = _mm_mullo_epi32(u7, cospi32);
    1474     3275740 :         v6 = _mm_add_epi32(x, y);
    1475     3275740 :         v6 = _mm_add_epi32(v6, rnding);
    1476     3275740 :         v6 = _mm_srai_epi32(v6, bit);
    1477             : 
    1478     3275740 :         v7 = _mm_sub_epi32(x, y);
    1479     3275740 :         v7 = _mm_add_epi32(v7, rnding);
    1480     3275740 :         v7 = _mm_srai_epi32(v7, bit);
    1481             : 
    1482             :         // stage 3
    1483     3275740 :         u0 = _mm_add_epi32(v0, v2);
    1484     3275740 :         u1 = _mm_add_epi32(v1, v3);
    1485     3275740 :         u2 = _mm_sub_epi32(v0, v2);
    1486     3275740 :         u3 = _mm_sub_epi32(v1, v3);
    1487     3275740 :         u4 = _mm_add_epi32(v4, v6);
    1488     3275740 :         u5 = _mm_add_epi32(v5, v7);
    1489     3275740 :         u6 = _mm_sub_epi32(v4, v6);
    1490     3275740 :         u7 = _mm_sub_epi32(v5, v7);
    1491             : 
    1492             :         // stage 4
    1493     3275740 :         v0 = u0;
    1494     3275740 :         v1 = u1;
    1495     3275740 :         v2 = u2;
    1496     3275740 :         v3 = u3;
    1497             : 
    1498     3275740 :         x = _mm_mullo_epi32(u4, cospi16);
    1499     3275740 :         y = _mm_mullo_epi32(u5, cospi48);
    1500     3275740 :         v4 = _mm_add_epi32(x, y);
    1501     3275740 :         v4 = _mm_add_epi32(v4, rnding);
    1502     3275740 :         v4 = _mm_srai_epi32(v4, bit);
    1503             : 
    1504     3275740 :         x = _mm_mullo_epi32(u4, cospi48);
    1505     3275740 :         y = _mm_mullo_epi32(u5, cospim16);
    1506     3275740 :         v5 = _mm_add_epi32(x, y);
    1507     3275740 :         v5 = _mm_add_epi32(v5, rnding);
    1508     3275740 :         v5 = _mm_srai_epi32(v5, bit);
    1509             : 
    1510     3275740 :         x = _mm_mullo_epi32(u6, cospim48);
    1511     3275740 :         y = _mm_mullo_epi32(u7, cospi16);
    1512     3275740 :         v6 = _mm_add_epi32(x, y);
    1513     3275740 :         v6 = _mm_add_epi32(v6, rnding);
    1514     3275740 :         v6 = _mm_srai_epi32(v6, bit);
    1515             : 
    1516     3275740 :         x = _mm_mullo_epi32(u6, cospi16);
    1517     3275740 :         y = _mm_mullo_epi32(u7, cospi48);
    1518     3275740 :         v7 = _mm_add_epi32(x, y);
    1519     3275740 :         v7 = _mm_add_epi32(v7, rnding);
    1520     3275740 :         v7 = _mm_srai_epi32(v7, bit);
    1521             : 
    1522             :         // stage 5
    1523     3275740 :         u0 = _mm_add_epi32(v0, v4);
    1524     3275740 :         u1 = _mm_add_epi32(v1, v5);
    1525     3275740 :         u2 = _mm_add_epi32(v2, v6);
    1526     3275740 :         u3 = _mm_add_epi32(v3, v7);
    1527     3275740 :         u4 = _mm_sub_epi32(v0, v4);
    1528     3275740 :         u5 = _mm_sub_epi32(v1, v5);
    1529     3275740 :         u6 = _mm_sub_epi32(v2, v6);
    1530     3275740 :         u7 = _mm_sub_epi32(v3, v7);
    1531             : 
    1532             :         // stage 6
    1533     3275740 :         x = _mm_mullo_epi32(u0, cospi4);
    1534     3275740 :         y = _mm_mullo_epi32(u1, cospi60);
    1535     3275740 :         v0 = _mm_add_epi32(x, y);
    1536     3275740 :         v0 = _mm_add_epi32(v0, rnding);
    1537     6551470 :         out[col_num * 7 + col] = _mm_srai_epi32(v0, bit);
    1538             : 
    1539     3275740 :         x = _mm_mullo_epi32(u0, cospi60);
    1540     3275740 :         y = _mm_mullo_epi32(u1, cospim4);
    1541     3275740 :         v1 = _mm_add_epi32(x, y);
    1542     3275740 :         v1 = _mm_add_epi32(v1, rnding);
    1543     6551470 :         out[col_num * 0 + col] = _mm_srai_epi32(v1, bit);
    1544             : 
    1545     3275740 :         x = _mm_mullo_epi32(u2, cospi20);
    1546     3275740 :         y = _mm_mullo_epi32(u3, cospi44);
    1547     3275740 :         v2 = _mm_add_epi32(x, y);
    1548     3275740 :         v2 = _mm_add_epi32(v2, rnding);
    1549     6551470 :         out[col_num * 5 + col] = _mm_srai_epi32(v2, bit);
    1550             : 
    1551     3275740 :         x = _mm_mullo_epi32(u2, cospi44);
    1552     3275740 :         y = _mm_mullo_epi32(u3, cospim20);
    1553     3275740 :         v3 = _mm_add_epi32(x, y);
    1554     3275740 :         v3 = _mm_add_epi32(v3, rnding);
    1555     6551470 :         out[col_num * 2 + col] = _mm_srai_epi32(v3, bit);
    1556             : 
    1557     3275740 :         x = _mm_mullo_epi32(u4, cospi36);
    1558     3275740 :         y = _mm_mullo_epi32(u5, cospi28);
    1559     3275740 :         v4 = _mm_add_epi32(x, y);
    1560     3275740 :         v4 = _mm_add_epi32(v4, rnding);
    1561     6551470 :         out[col_num * 3 + col] = _mm_srai_epi32(v4, bit);
    1562             : 
    1563     3275740 :         x = _mm_mullo_epi32(u4, cospi28);
    1564     3275740 :         y = _mm_mullo_epi32(u5, cospim36);
    1565     3275740 :         v5 = _mm_add_epi32(x, y);
    1566     3275740 :         v5 = _mm_add_epi32(v5, rnding);
    1567     6551470 :         out[col_num * 4 + col] = _mm_srai_epi32(v5, bit);
    1568             : 
    1569     3275740 :         x = _mm_mullo_epi32(u6, cospi52);
    1570     3275740 :         y = _mm_mullo_epi32(u7, cospi12);
    1571     3275740 :         v6 = _mm_add_epi32(x, y);
    1572     3275740 :         v6 = _mm_add_epi32(v6, rnding);
    1573     6551470 :         out[col_num * 1 + col] = _mm_srai_epi32(v6, bit);
    1574             : 
    1575     3275740 :         x = _mm_mullo_epi32(u6, cospi12);
    1576     3275740 :         y = _mm_mullo_epi32(u7, cospim52);
    1577     3275740 :         v7 = _mm_add_epi32(x, y);
    1578     3275740 :         v7 = _mm_add_epi32(v7, rnding);
    1579     6551470 :         out[col_num * 6 + col] = _mm_srai_epi32(v7, bit);
    1580             :     }
    1581     3275650 : }
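                     : 
                     : // Stage-1 mapping used above (sketch): the 8-point ADST consumes the
                     : // rows as { x0, -x7, -x3, x4, -x1, x6, x2, -x5 } and scatters its final
                     : // rotations v0..v7 to output rows { 7, 0, 5, 2, 3, 4, 1, 6 }; the
                     : // negations and reordering fold the ADST sign pattern into a DCT-style
                     : // butterfly network.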
    1582             : 
    1583     2709950 : static INLINE void fadst16x4_avx2(__m256i *input, __m256i *output, int32_t bit) {
    1584     2709950 :     __m128i *in = (__m128i *)input;
    1585     2709950 :     __m128i *out = (__m128i *)output;
    1586             : 
    1587     2709950 :     const int32_t *cospi = cospi_arr(bit);
    1588     2709930 :     const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
    1589     2709930 :     const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
    1590     2709930 :     const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
    1591     2709930 :     const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
    1592     2709930 :     const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
    1593     2709930 :     const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
    1594     2709930 :     const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
    1595     2709930 :     const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
    1596     2709930 :     const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
    1597     2709930 :     const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
    1598     2709930 :     const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
    1599     2709930 :     const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
    1600     2709930 :     const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
    1601     2709930 :     const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
    1602     2709930 :     const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
    1603     2709930 :     const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
    1604     2709930 :     const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
    1605     2709930 :     const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
    1606     2709930 :     const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
    1607     2709930 :     const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
    1608     2709930 :     const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
    1609     2709930 :     const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
    1610     2709930 :     const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
    1611     2709930 :     const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
    1612     2709930 :     const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
    1613     2709930 :     const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
    1614     2709930 :     const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
    1615     2709930 :     const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
    1616     2709930 :     const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
    1617     2709930 :     const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
    1618     2709930 :     const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
    1619     2709930 :     const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
    1620     2709930 :     const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
    1621     2709930 :     const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
    1622     2709930 :     const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
    1623     2709930 :     const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
    1624     2709930 :     const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
    1625     5419860 :     const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    1626     2709930 :     const __m128i zero = _mm_setzero_si128();
    1627             : 
    1628             :     __m128i u[16], v[16], x, y;
    1629             :     __m128i tmp[13];
    1630             : 
    1631     2709930 :     tmp[0] = _mm_sub_epi32(zero, in[15]);
    1632     2709930 :     u[2] = _mm_sub_epi32(zero, in[7]);
    1633     2709930 :     tmp[1] = _mm_sub_epi32(zero, in[3]);
    1634     2709930 :     u[7] = _mm_sub_epi32(zero, in[11]);
    1635     2709930 :     tmp[2] = _mm_sub_epi32(zero, in[1]);
    1636     2709930 :     u[11] = _mm_sub_epi32(zero, in[9]);
    1637     2709930 :     tmp[3] = _mm_sub_epi32(zero, in[13]);
    1638     2709930 :     u[14] = _mm_sub_epi32(zero, in[5]);
    1639             : 
    1640             :     // stage 2
    1641             : 
    1642     2709930 :     x = _mm_mullo_epi32(u[2], cospi32);
    1643     5419860 :     y = _mm_mullo_epi32(in[8], cospi32);
    1644     2709930 :     v[2] = _mm_add_epi32(x, y);
    1645     2709930 :     v[2] = _mm_add_epi32(v[2], rnding);
    1646     5419860 :     v[2] = _mm_srai_epi32(v[2], bit);
    1647             : 
    1648     2709930 :     v[3] = _mm_sub_epi32(x, y);
    1649     2709930 :     v[3] = _mm_add_epi32(v[3], rnding);
    1650     2709930 :     v[3] = _mm_srai_epi32(v[3], bit);
    1651             : 
    1652     2709930 :     x = _mm_mullo_epi32(in[4], cospi32);
    1653     5419860 :     y = _mm_mullo_epi32(u[7], cospi32);
    1654     2709930 :     v[6] = _mm_add_epi32(x, y);
    1655     2709930 :     v[6] = _mm_add_epi32(v[6], rnding);
    1656     5419860 :     v[6] = _mm_srai_epi32(v[6], bit);
    1657             : 
    1658     2709930 :     v[7] = _mm_sub_epi32(x, y);
    1659     2709930 :     v[7] = _mm_add_epi32(v[7], rnding);
    1660     2709930 :     v[7] = _mm_srai_epi32(v[7], bit);
    1661             : 
    1662     2709930 :     x = _mm_mullo_epi32(in[6], cospi32);
    1663     5419860 :     y = _mm_mullo_epi32(u[11], cospi32);
    1664     2709930 :     v[10] = _mm_add_epi32(x, y);
    1665     2709930 :     v[10] = _mm_add_epi32(v[10], rnding);
    1666     5419860 :     v[10] = _mm_srai_epi32(v[10], bit);
    1667             : 
    1668     2709930 :     v[11] = _mm_sub_epi32(x, y);
    1669     2709930 :     v[11] = _mm_add_epi32(v[11], rnding);
    1670     2709930 :     v[11] = _mm_srai_epi32(v[11], bit);
    1671             : 
    1672     2709930 :     x = _mm_mullo_epi32(u[14], cospi32);
    1673     5419860 :     y = _mm_mullo_epi32(in[10], cospi32);
    1674     2709930 :     v[14] = _mm_add_epi32(x, y);
    1675     2709930 :     v[14] = _mm_add_epi32(v[14], rnding);
    1676     5419860 :     v[14] = _mm_srai_epi32(v[14], bit);
    1677             : 
    1678     2709930 :     v[15] = _mm_sub_epi32(x, y);
    1679     2709930 :     v[15] = _mm_add_epi32(v[15], rnding);
    1680     2709930 :     v[15] = _mm_srai_epi32(v[15], bit);
    1681             : 
    1682             :     // stage 3
    1683     2709930 :     tmp[4] = _mm_add_epi32(in[0], v[2]);
    1684     2709930 :     tmp[5] = _mm_add_epi32(tmp[0], v[3]);
    1685     2709930 :     tmp[6] = _mm_sub_epi32(in[0], v[2]);
    1686     2709930 :     tmp[0] = _mm_sub_epi32(tmp[0], v[3]);
    1687     2709930 :     u[4] = _mm_add_epi32(tmp[1], v[6]);
    1688     2709930 :     u[5] = _mm_add_epi32(in[12], v[7]);
    1689     2709930 :     u[6] = _mm_sub_epi32(tmp[1], v[6]);
    1690     2709930 :     u[7] = _mm_sub_epi32(in[12], v[7]);
    1691     2709930 :     tmp[1] = _mm_add_epi32(tmp[2], v[10]);
    1692     2709930 :     tmp[7] = _mm_add_epi32(in[14], v[11]);
    1693     2709930 :     tmp[2] = _mm_sub_epi32(tmp[2], v[10]);
    1694     2709930 :     tmp[8] = _mm_sub_epi32(in[14], v[11]);
    1695     2709930 :     u[12] = _mm_add_epi32(in[2], v[14]);
    1696     2709930 :     u[13] = _mm_add_epi32(tmp[3], v[15]);
    1697     2709930 :     u[14] = _mm_sub_epi32(in[2], v[14]);
    1698     2709930 :     u[15] = _mm_sub_epi32(tmp[3], v[15]);
    1699             : 
    1700             :     // stage 4
    1701     2709930 :     v[4] = half_btf_small(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
    1702     2709900 :     v[5] = half_btf_small(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
    1703     2709880 :     v[6] = half_btf_small(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
    1704     2709860 :     v[7] = half_btf_small(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
    1705     2709860 :     v[12] = half_btf_small(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
    1706     2709860 :     v[13] = half_btf_small(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
    1707     2709850 :     v[14] = half_btf_small(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
    1708     2709850 :     v[15] = half_btf_small(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
    1709             : 
    1710             :     // stage 5
    1711     2709860 :     tmp[9] = _mm_add_epi32(tmp[4], v[4]);
    1712     2709860 :     tmp[10] = _mm_add_epi32(tmp[5], v[5]);
    1713     2709860 :     tmp[11] = _mm_add_epi32(tmp[6], v[6]);
    1714     2709860 :     tmp[12] = _mm_add_epi32(tmp[0], v[7]);
    1715     2709860 :     tmp[4] = _mm_sub_epi32(tmp[4], v[4]);
    1716     2709860 :     tmp[5] = _mm_sub_epi32(tmp[5], v[5]);
    1717     2709860 :     tmp[6] = _mm_sub_epi32(tmp[6], v[6]);
    1718     2709860 :     tmp[0] = _mm_sub_epi32(tmp[0], v[7]);
    1719     2709860 :     u[8] = _mm_add_epi32(tmp[1], v[12]);
    1720     2709860 :     u[9] = _mm_add_epi32(tmp[7], v[13]);
    1721     2709860 :     u[10] = _mm_add_epi32(tmp[2], v[14]);
    1722     2709860 :     u[11] = _mm_add_epi32(tmp[8], v[15]);
    1723     2709860 :     u[12] = _mm_sub_epi32(tmp[1], v[12]);
    1724     2709860 :     u[13] = _mm_sub_epi32(tmp[7], v[13]);
    1725     2709860 :     u[14] = _mm_sub_epi32(tmp[2], v[14]);
    1726     2709860 :     u[15] = _mm_sub_epi32(tmp[8], v[15]);
    1727             : 
    1728             :     // stage 6
    1729     2709860 :     v[8] = half_btf_small(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
    1730     2709900 :     v[9] = half_btf_small(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
    1731     2709870 :     v[10] = half_btf_small(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
    1732     2709860 :     v[11] = half_btf_small(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
    1733     2709870 :     v[12] = half_btf_small(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
    1734     2709870 :     v[13] = half_btf_small(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
    1735     2709870 :     v[14] = half_btf_small(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
    1736     2709860 :     v[15] = half_btf_small(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
    1737             : 
    1738             :     // stage 7
    1739     2709880 :     u[0] = _mm_add_epi32(tmp[9], v[8]);
    1740     2709880 :     u[1] = _mm_add_epi32(tmp[10], v[9]);
    1741     2709880 :     u[2] = _mm_add_epi32(tmp[11], v[10]);
    1742     2709880 :     u[3] = _mm_add_epi32(tmp[12], v[11]);
    1743     2709880 :     u[4] = _mm_add_epi32(tmp[4], v[12]);
    1744     2709880 :     u[5] = _mm_add_epi32(tmp[5], v[13]);
    1745     2709880 :     u[6] = _mm_add_epi32(tmp[6], v[14]);
    1746     2709880 :     u[7] = _mm_add_epi32(tmp[0], v[15]);
    1747     2709880 :     u[8] = _mm_sub_epi32(tmp[9], v[8]);
    1748     2709880 :     u[9] = _mm_sub_epi32(tmp[10], v[9]);
    1749     2709880 :     u[10] = _mm_sub_epi32(tmp[11], v[10]);
    1750     2709880 :     u[11] = _mm_sub_epi32(tmp[12], v[11]);
    1751     2709880 :     u[12] = _mm_sub_epi32(tmp[4], v[12]);
    1752     2709880 :     u[13] = _mm_sub_epi32(tmp[5], v[13]);
    1753     2709880 :     u[14] = _mm_sub_epi32(tmp[6], v[14]);
    1754     2709880 :     u[15] = _mm_sub_epi32(tmp[0], v[15]);
    1755             : 
    1756             :     // stage 8
    1757     2709880 :     out[15] = half_btf_small(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
    1758     2709910 :     out[0] = half_btf_small(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
    1759     2709900 :     out[13] = half_btf_small(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
    1760     2709890 :     out[2] = half_btf_small(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
    1761     2709890 :     out[11] = half_btf_small(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
    1762     2709890 :     out[4] = half_btf_small(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
    1763     2709890 :     out[9] = half_btf_small(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
    1764     2709890 :     out[6] = half_btf_small(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
    1765     2709890 :     out[7] = half_btf_small(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
    1766     2709890 :     out[8] = half_btf_small(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
    1767     2709890 :     out[5] = half_btf_small(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
    1768     2709890 :     out[10] = half_btf_small(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
    1769     2709900 :     out[3] = half_btf_small(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
    1770     2709890 :     out[12] = half_btf_small(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
    1771     2709890 :     out[1] = half_btf_small(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
    1772     2709880 :     out[14] = half_btf_small(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
    1773     2709880 : }
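                     : 
                     : // The tmp[] slots above are recycled across stages (e.g. tmp[0] holds
                     : // -in[15] in stage 1 and a stage-3 difference afterwards), presumably to
                     : // keep the live-value count within the 16 available XMM registers.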
    1774             : 
    1775    26337600 : static void fdct16x16_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
    1776    26337600 :     const int32_t *cospi = cospi_arr(bit);
    1777    26334400 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    1778    26334400 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    1779    26334400 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    1780    26334400 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    1781    26334400 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    1782    26334400 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    1783    26334400 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    1784    26334400 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    1785    26334400 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    1786    26334400 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    1787    26334400 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    1788    26334400 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    1789    26334400 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    1790    26334400 :     const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
    1791    26334400 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    1792    26334400 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    1793    26334400 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    1794    26334400 :     const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
    1795    26334400 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    1796             :     __m256i u[16], v[16], x;
    1797             :     int32_t col;
    1798             : 
    1799    80563600 :     for (col = 0; col < col_num; ++col) {
    1800             :         // stage 0
    1801             :         // stage 1
    1802    54229100 :         u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    1803    54229100 :         u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    1804    54229100 :         u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    1805    54229100 :         u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    1806    54229100 :         u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
    1807    54229100 :         u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
    1808    54229100 :         u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
    1809    54229100 :         u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
    1810    54229100 :         u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
    1811    54229100 :         u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
    1812    54229100 :         u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
    1813    54229100 :         u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
    1814    54229100 :         u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
    1815    54229100 :         u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
    1816    54229100 :         u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
    1817    54229100 :         u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
    1818             : 
    1819             :         // stage 2
    1820    54229100 :         v[0] = _mm256_add_epi32(u[0], u[7]);
    1821    54229100 :         v[7] = _mm256_sub_epi32(u[0], u[7]);
    1822    54229100 :         v[1] = _mm256_add_epi32(u[1], u[6]);
    1823    54229100 :         v[6] = _mm256_sub_epi32(u[1], u[6]);
    1824    54229100 :         v[2] = _mm256_add_epi32(u[2], u[5]);
    1825    54229100 :         v[5] = _mm256_sub_epi32(u[2], u[5]);
    1826    54229100 :         v[3] = _mm256_add_epi32(u[3], u[4]);
    1827    54229100 :         v[4] = _mm256_sub_epi32(u[3], u[4]);
    1828    54229100 :         v[8] = u[8];
    1829    54229100 :         v[9] = u[9];
    1830             : 
    1831    54229100 :         v[10] = _mm256_mullo_epi32(u[10], cospim32);
    1832    54229100 :         x = _mm256_mullo_epi32(u[13], cospi32);
    1833    54229100 :         v[10] = _mm256_add_epi32(v[10], x);
    1834    54229100 :         v[10] = _mm256_add_epi32(v[10], rnding);
    1835    54229100 :         v[10] = _mm256_srai_epi32(v[10], bit);
    1836             : 
    1837    54229100 :         v[13] = _mm256_mullo_epi32(u[10], cospi32);
    1838    54229100 :         x = _mm256_mullo_epi32(u[13], cospim32);
    1839    54229100 :         v[13] = _mm256_sub_epi32(v[13], x);
    1840    54229100 :         v[13] = _mm256_add_epi32(v[13], rnding);
    1841    54229100 :         v[13] = _mm256_srai_epi32(v[13], bit);
    1842             : 
    1843    54229100 :         v[11] = _mm256_mullo_epi32(u[11], cospim32);
    1844    54229100 :         x = _mm256_mullo_epi32(u[12], cospi32);
    1845    54229100 :         v[11] = _mm256_add_epi32(v[11], x);
    1846    54229100 :         v[11] = _mm256_add_epi32(v[11], rnding);
    1847    54229100 :         v[11] = _mm256_srai_epi32(v[11], bit);
    1848             : 
    1849    54229100 :         v[12] = _mm256_mullo_epi32(u[11], cospi32);
    1850    54229100 :         x = _mm256_mullo_epi32(u[12], cospim32);
    1851    54229100 :         v[12] = _mm256_sub_epi32(v[12], x);
    1852    54229100 :         v[12] = _mm256_add_epi32(v[12], rnding);
    1853    54229100 :         v[12] = _mm256_srai_epi32(v[12], bit);
    1854    54229100 :         v[14] = u[14];
    1855    54229100 :         v[15] = u[15];
    1856             : 
    1857             :         // stage 3
    1858    54229100 :         u[0] = _mm256_add_epi32(v[0], v[3]);
    1859    54229100 :         u[3] = _mm256_sub_epi32(v[0], v[3]);
    1860    54229100 :         u[1] = _mm256_add_epi32(v[1], v[2]);
    1861    54229100 :         u[2] = _mm256_sub_epi32(v[1], v[2]);
    1862    54229100 :         u[4] = v[4];
    1863             : 
    1864    54229100 :         u[5] = _mm256_mullo_epi32(v[5], cospim32);
    1865    54229100 :         x = _mm256_mullo_epi32(v[6], cospi32);
    1866    54229100 :         u[5] = _mm256_add_epi32(u[5], x);
    1867    54229100 :         u[5] = _mm256_add_epi32(u[5], rnding);
    1868    54229100 :         u[5] = _mm256_srai_epi32(u[5], bit);
    1869             : 
    1870    54229100 :         u[6] = _mm256_mullo_epi32(v[5], cospi32);
    1871    54229100 :         x = _mm256_mullo_epi32(v[6], cospim32);
    1872    54229100 :         u[6] = _mm256_sub_epi32(u[6], x);
    1873    54229100 :         u[6] = _mm256_add_epi32(u[6], rnding);
    1874    54229100 :         u[6] = _mm256_srai_epi32(u[6], bit);
    1875             : 
    1876    54229100 :         u[7] = v[7];
    1877    54229100 :         u[8] = _mm256_add_epi32(v[8], v[11]);
    1878    54229100 :         u[11] = _mm256_sub_epi32(v[8], v[11]);
    1879    54229100 :         u[9] = _mm256_add_epi32(v[9], v[10]);
    1880    54229100 :         u[10] = _mm256_sub_epi32(v[9], v[10]);
    1881    54229100 :         u[12] = _mm256_sub_epi32(v[15], v[12]);
    1882    54229100 :         u[15] = _mm256_add_epi32(v[15], v[12]);
    1883    54229100 :         u[13] = _mm256_sub_epi32(v[14], v[13]);
    1884    54229100 :         u[14] = _mm256_add_epi32(v[14], v[13]);
    1885             : 
    1886             :         // stage 4
    1887    54229100 :         u[0] = _mm256_mullo_epi32(u[0], cospi32);
    1888    54229100 :         u[1] = _mm256_mullo_epi32(u[1], cospi32);
    1889    54229100 :         v[0] = _mm256_add_epi32(u[0], u[1]);
    1890    54229100 :         v[0] = _mm256_add_epi32(v[0], rnding);
    1891    54229100 :         v[0] = _mm256_srai_epi32(v[0], bit);
    1892             : 
    1893    54229100 :         v[1] = _mm256_sub_epi32(u[0], u[1]);
    1894    54229100 :         v[1] = _mm256_add_epi32(v[1], rnding);
    1895    54229100 :         v[1] = _mm256_srai_epi32(v[1], bit);
    1896             : 
    1897    54229100 :         v[2] = _mm256_mullo_epi32(u[2], cospi48);
    1898    54229100 :         x = _mm256_mullo_epi32(u[3], cospi16);
    1899    54229100 :         v[2] = _mm256_add_epi32(v[2], x);
    1900    54229100 :         v[2] = _mm256_add_epi32(v[2], rnding);
    1901    54229100 :         v[2] = _mm256_srai_epi32(v[2], bit);
    1902             : 
    1903    54229100 :         v[3] = _mm256_mullo_epi32(u[2], cospi16);
    1904    54229100 :         x = _mm256_mullo_epi32(u[3], cospi48);
    1905    54229100 :         v[3] = _mm256_sub_epi32(x, v[3]);
    1906    54229100 :         v[3] = _mm256_add_epi32(v[3], rnding);
    1907    54229100 :         v[3] = _mm256_srai_epi32(v[3], bit);
    1908             : 
    1909    54229100 :         v[4] = _mm256_add_epi32(u[4], u[5]);
    1910    54229100 :         v[5] = _mm256_sub_epi32(u[4], u[5]);
    1911    54229100 :         v[6] = _mm256_sub_epi32(u[7], u[6]);
    1912    54229100 :         v[7] = _mm256_add_epi32(u[7], u[6]);
    1913    54229100 :         v[8] = u[8];
    1914             : 
    1915    54229100 :         v[9] = _mm256_mullo_epi32(u[9], cospim16);
    1916    54229100 :         x = _mm256_mullo_epi32(u[14], cospi48);
    1917    54229100 :         v[9] = _mm256_add_epi32(v[9], x);
    1918    54229100 :         v[9] = _mm256_add_epi32(v[9], rnding);
    1919    54229100 :         v[9] = _mm256_srai_epi32(v[9], bit);
    1920             : 
    1921    54229100 :         v[14] = _mm256_mullo_epi32(u[9], cospi48);
    1922    54229100 :         x = _mm256_mullo_epi32(u[14], cospim16);
    1923    54229100 :         v[14] = _mm256_sub_epi32(v[14], x);
    1924    54229100 :         v[14] = _mm256_add_epi32(v[14], rnding);
    1925    54229100 :         v[14] = _mm256_srai_epi32(v[14], bit);
    1926             : 
    1927    54229100 :         v[10] = _mm256_mullo_epi32(u[10], cospim48);
    1928    54229100 :         x = _mm256_mullo_epi32(u[13], cospim16);
    1929    54229100 :         v[10] = _mm256_add_epi32(v[10], x);
    1930    54229100 :         v[10] = _mm256_add_epi32(v[10], rnding);
    1931    54229100 :         v[10] = _mm256_srai_epi32(v[10], bit);
    1932             : 
    1933    54229100 :         v[13] = _mm256_mullo_epi32(u[10], cospim16);
    1934    54229100 :         x = _mm256_mullo_epi32(u[13], cospim48);
    1935    54229100 :         v[13] = _mm256_sub_epi32(v[13], x);
    1936    54229100 :         v[13] = _mm256_add_epi32(v[13], rnding);
    1937    54229100 :         v[13] = _mm256_srai_epi32(v[13], bit);
    1938             : 
    1939    54229100 :         v[11] = u[11];
    1940    54229100 :         v[12] = u[12];
    1941    54229100 :         v[15] = u[15];
    1942             : 
    1943             :         // stage 5
    1944    54229100 :         u[0] = v[0];
    1945    54229100 :         u[1] = v[1];
    1946    54229100 :         u[2] = v[2];
    1947    54229100 :         u[3] = v[3];
    1948             : 
    1949    54229100 :         u[4] = _mm256_mullo_epi32(v[4], cospi56);
    1950    54229100 :         x = _mm256_mullo_epi32(v[7], cospi8);
    1951    54229100 :         u[4] = _mm256_add_epi32(u[4], x);
    1952    54229100 :         u[4] = _mm256_add_epi32(u[4], rnding);
    1953    54229100 :         u[4] = _mm256_srai_epi32(u[4], bit);
    1954             : 
    1955    54229100 :         u[7] = _mm256_mullo_epi32(v[4], cospi8);
    1956    54229100 :         x = _mm256_mullo_epi32(v[7], cospi56);
    1957    54229100 :         u[7] = _mm256_sub_epi32(x, u[7]);
    1958    54229100 :         u[7] = _mm256_add_epi32(u[7], rnding);
    1959    54229100 :         u[7] = _mm256_srai_epi32(u[7], bit);
    1960             : 
    1961    54229100 :         u[5] = _mm256_mullo_epi32(v[5], cospi24);
    1962    54229100 :         x = _mm256_mullo_epi32(v[6], cospi40);
    1963    54229100 :         u[5] = _mm256_add_epi32(u[5], x);
    1964    54229100 :         u[5] = _mm256_add_epi32(u[5], rnding);
    1965    54229100 :         u[5] = _mm256_srai_epi32(u[5], bit);
    1966             : 
    1967    54229100 :         u[6] = _mm256_mullo_epi32(v[5], cospi40);
    1968    54229100 :         x = _mm256_mullo_epi32(v[6], cospi24);
    1969    54229100 :         u[6] = _mm256_sub_epi32(x, u[6]);
    1970    54229100 :         u[6] = _mm256_add_epi32(u[6], rnding);
    1971    54229100 :         u[6] = _mm256_srai_epi32(u[6], bit);
    1972             : 
    1973    54229100 :         u[8] = _mm256_add_epi32(v[8], v[9]);
    1974    54229100 :         u[9] = _mm256_sub_epi32(v[8], v[9]);
    1975    54229100 :         u[10] = _mm256_sub_epi32(v[11], v[10]);
    1976    54229100 :         u[11] = _mm256_add_epi32(v[11], v[10]);
    1977    54229100 :         u[12] = _mm256_add_epi32(v[12], v[13]);
    1978    54229100 :         u[13] = _mm256_sub_epi32(v[12], v[13]);
    1979    54229100 :         u[14] = _mm256_sub_epi32(v[15], v[14]);
    1980    54229100 :         u[15] = _mm256_add_epi32(v[15], v[14]);
    1981             : 
    1982             :         // stage 6
    1983    54229100 :         v[0] = u[0];
    1984    54229100 :         v[1] = u[1];
    1985    54229100 :         v[2] = u[2];
    1986    54229100 :         v[3] = u[3];
    1987    54229100 :         v[4] = u[4];
    1988    54229100 :         v[5] = u[5];
    1989    54229100 :         v[6] = u[6];
    1990    54229100 :         v[7] = u[7];
    1991             : 
    1992    54229100 :         v[8] = _mm256_mullo_epi32(u[8], cospi60);
    1993    54229100 :         x = _mm256_mullo_epi32(u[15], cospi4);
    1994    54229100 :         v[8] = _mm256_add_epi32(v[8], x);
    1995    54229100 :         v[8] = _mm256_add_epi32(v[8], rnding);
    1996    54229100 :         v[8] = _mm256_srai_epi32(v[8], bit);
    1997             : 
    1998    54229100 :         v[15] = _mm256_mullo_epi32(u[8], cospi4);
    1999    54229100 :         x = _mm256_mullo_epi32(u[15], cospi60);
    2000    54229100 :         v[15] = _mm256_sub_epi32(x, v[15]);
    2001    54229100 :         v[15] = _mm256_add_epi32(v[15], rnding);
    2002    54229100 :         v[15] = _mm256_srai_epi32(v[15], bit);
    2003             : 
    2004    54229100 :         v[9] = _mm256_mullo_epi32(u[9], cospi28);
    2005    54229100 :         x = _mm256_mullo_epi32(u[14], cospi36);
    2006    54229100 :         v[9] = _mm256_add_epi32(v[9], x);
    2007    54229100 :         v[9] = _mm256_add_epi32(v[9], rnding);
    2008    54229100 :         v[9] = _mm256_srai_epi32(v[9], bit);
    2009             : 
    2010    54229100 :         v[14] = _mm256_mullo_epi32(u[9], cospi36);
    2011    54229100 :         x = _mm256_mullo_epi32(u[14], cospi28);
    2012    54229100 :         v[14] = _mm256_sub_epi32(x, v[14]);
    2013    54229100 :         v[14] = _mm256_add_epi32(v[14], rnding);
    2014    54229100 :         v[14] = _mm256_srai_epi32(v[14], bit);
    2015             : 
    2016    54229100 :         v[10] = _mm256_mullo_epi32(u[10], cospi44);
    2017    54229100 :         x = _mm256_mullo_epi32(u[13], cospi20);
    2018    54229100 :         v[10] = _mm256_add_epi32(v[10], x);
    2019    54229100 :         v[10] = _mm256_add_epi32(v[10], rnding);
    2020    54229100 :         v[10] = _mm256_srai_epi32(v[10], bit);
    2021             : 
    2022    54229100 :         v[13] = _mm256_mullo_epi32(u[10], cospi20);
    2023    54229100 :         x = _mm256_mullo_epi32(u[13], cospi44);
    2024    54229100 :         v[13] = _mm256_sub_epi32(x, v[13]);
    2025    54229100 :         v[13] = _mm256_add_epi32(v[13], rnding);
    2026    54229100 :         v[13] = _mm256_srai_epi32(v[13], bit);
    2027             : 
    2028    54229100 :         v[11] = _mm256_mullo_epi32(u[11], cospi12);
    2029    54229100 :         x = _mm256_mullo_epi32(u[12], cospi52);
    2030    54229100 :         v[11] = _mm256_add_epi32(v[11], x);
    2031    54229100 :         v[11] = _mm256_add_epi32(v[11], rnding);
    2032    54229100 :         v[11] = _mm256_srai_epi32(v[11], bit);
    2033             : 
    2034    54229100 :         v[12] = _mm256_mullo_epi32(u[11], cospi52);
    2035    54229100 :         x = _mm256_mullo_epi32(u[12], cospi12);
    2036    54229100 :         v[12] = _mm256_sub_epi32(x, v[12]);
    2037    54229100 :         v[12] = _mm256_add_epi32(v[12], rnding);
    2038    54229100 :         v[12] = _mm256_srai_epi32(v[12], bit);
    2039             : 
    2040    54229100 :         out[0 * col_num + col] = v[0];
    2041    54229100 :         out[1 * col_num + col] = v[8];
    2042    54229100 :         out[2 * col_num + col] = v[4];
    2043    54229100 :         out[3 * col_num + col] = v[12];
    2044    54229100 :         out[4 * col_num + col] = v[2];
    2045    54229100 :         out[5 * col_num + col] = v[10];
    2046    54229100 :         out[6 * col_num + col] = v[6];
    2047    54229100 :         out[7 * col_num + col] = v[14];
    2048    54229100 :         out[8 * col_num + col] = v[1];
    2049    54229100 :         out[9 * col_num + col] = v[9];
    2050    54229100 :         out[10 * col_num + col] = v[5];
    2051    54229100 :         out[11 * col_num + col] = v[13];
    2052    54229100 :         out[12 * col_num + col] = v[3];
    2053    54229100 :         out[13 * col_num + col] = v[11];
    2054    54229100 :         out[14 * col_num + col] = v[7];
    2055    54229100 :         out[15 * col_num + col] = v[15];
    2056             :     }
    2057    26334400 : }
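                     : 
                     : /* Editorial sketch, not part of the original source: every multiply
                     : * stage above repeats one fixed-point primitive, the "half butterfly":
                     : * weight two inputs, add the rounding constant, then arithmetic-shift
                     : * by `bit`. A minimal scalar reference, assuming the operands stay in
                     : * the range the 32-bit _mm256_mullo_epi32 path relies on: */
                     : static int32_t half_btf_scalar(int32_t w0, int32_t in0, int32_t w1,
                     :     int32_t in1, int8_t bit) {
                     :     const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
                     :     return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
                     : }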
    2058             : 
    2059     4409730 : static INLINE void fadst4x8_row_avx2(__m256i *input, __m256i *output, int32_t bit,
    2060             :     const int32_t num_col) {
    2061     4409730 :     const int32_t *sinpi = sinpi_arr(bit);
    2062     4409690 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    2063     4409690 :     const __m256i sinpi1 = _mm256_set1_epi32((int32_t)sinpi[1]);
    2064     4409690 :     const __m256i sinpi2 = _mm256_set1_epi32((int32_t)sinpi[2]);
    2065     4409690 :     const __m256i sinpi3 = _mm256_set1_epi32((int32_t)sinpi[3]);
    2066     4409690 :     const __m256i sinpi4 = _mm256_set1_epi32((int32_t)sinpi[4]);
    2067             :     __m256i t;
    2068             :     __m256i s0, s1, s2, s3, s4, s5, s6, s7;
    2069             :     __m256i x0, x1, x2, x3;
    2070             :     __m256i u0, u1, u2, u3;
    2071             :     __m256i v0, v1, v2, v3;
    2072             :     __m256i in[4];
    2073             :     __m256i out[4];
    2074             : 
    2075     4409690 :     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
    2076     4409690 :     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
    2077     4409690 :     in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
    2078     4409690 :     in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
    2079             : 
    2080     4409690 :     int32_t idx = 0 * num_col;
    2081     4409690 :     s0 = _mm256_mullo_epi32(in[idx], sinpi1);
    2082     4409690 :     s1 = _mm256_mullo_epi32(in[idx], sinpi4);
    2083     4409690 :     t = _mm256_add_epi32(in[idx], in[idx + num_col]);
    2084     4409690 :     idx += num_col;
    2085     4409690 :     s2 = _mm256_mullo_epi32(in[idx], sinpi2);
    2086     4409690 :     s3 = _mm256_mullo_epi32(in[idx], sinpi1);
    2087     4409690 :     idx += num_col;
    2088     4409690 :     s4 = _mm256_mullo_epi32(in[idx], sinpi3);
    2089     4409690 :     idx += num_col;
    2090     4409690 :     s5 = _mm256_mullo_epi32(in[idx], sinpi4);
    2091     4409690 :     s6 = _mm256_mullo_epi32(in[idx], sinpi2);
    2092     8819390 :     s7 = _mm256_sub_epi32(t, in[idx]);
    2093             : 
    2094     4409690 :     t = _mm256_add_epi32(s0, s2);
    2095     4409690 :     x0 = _mm256_add_epi32(t, s5);
    2096     4409690 :     x1 = _mm256_mullo_epi32(s7, sinpi3);
    2097     4409690 :     t = _mm256_sub_epi32(s1, s3);
    2098     4409690 :     x2 = _mm256_add_epi32(t, s6);
    2099     4409690 :     x3 = s4;
    2100             : 
    2101     4409690 :     s0 = _mm256_add_epi32(x0, x3);
    2102     4409690 :     s1 = x1;
    2103     4409690 :     s2 = _mm256_sub_epi32(x2, x3);
    2104     4409690 :     t = _mm256_sub_epi32(x2, x0);
    2105     4409690 :     s3 = _mm256_add_epi32(t, x3);
    2106             : 
    2107     4409690 :     u0 = _mm256_add_epi32(s0, rnding);
    2108     4409690 :     u0 = _mm256_srai_epi32(u0, bit);
    2109             : 
    2110     4409690 :     u1 = _mm256_add_epi32(s1, rnding);
    2111     4409690 :     u1 = _mm256_srai_epi32(u1, bit);
    2112             : 
    2113     4409690 :     u2 = _mm256_add_epi32(s2, rnding);
    2114     4409690 :     u2 = _mm256_srai_epi32(u2, bit);
    2115             : 
    2116     4409690 :     u3 = _mm256_add_epi32(s3, rnding);
    2117     4409690 :     u3 = _mm256_srai_epi32(u3, bit);
    2118             : 
    2119     4409690 :     v0 = _mm256_unpacklo_epi32(u0, u1);
    2120     4409690 :     v1 = _mm256_unpackhi_epi32(u0, u1);
    2121     4409690 :     v2 = _mm256_unpacklo_epi32(u2, u3);
    2122     4409690 :     v3 = _mm256_unpackhi_epi32(u2, u3);
    2123             : 
    2124     4409690 :     out[0] = _mm256_unpacklo_epi64(v0, v2);
    2125     4409690 :     out[1] = _mm256_unpackhi_epi64(v0, v2);
    2126     4409690 :     out[2] = _mm256_unpacklo_epi64(v1, v3);
    2127     4409690 :     out[3] = _mm256_unpackhi_epi64(v1, v3);
    2128             : 
    2129     4409690 :     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
    2130     4409690 :     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
    2131     4409690 :     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
    2132     4409690 :     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
    2133     4409690 : }
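                     : 
                     : /* Editorial note: this row variant differs from fadst4x8_col_avx2
                     : * below only in the lane deinterleave applied to the input at entry.
                     : * _mm256_permute2x128_si256 control 0x20 concatenates the two low
                     : * 128-bit lanes and 0x31 the two high lanes; a hypothetical helper
                     : * making that explicit: */
                     : static INLINE void gather_lanes_avx2(__m256i a, __m256i b,
                     :     __m256i *lo, __m256i *hi) {
                     :     *lo = _mm256_permute2x128_si256(a, b, 0x20); /* a.low128 | b.low128 */
                     :     *hi = _mm256_permute2x128_si256(a, b, 0x31); /* a.high128 | b.high128 */
                     : }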
    2134             : 
    2135     4361490 : static INLINE void fadst4x8_col_avx2(__m256i *in, __m256i *output, int32_t bit,
    2136             :     const int32_t num_col) {
    2137     4361490 :     const int32_t *sinpi = sinpi_arr(bit);
    2138     4361460 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    2139     4361460 :     const __m256i sinpi1 = _mm256_set1_epi32((int32_t)sinpi[1]);
    2140     4361460 :     const __m256i sinpi2 = _mm256_set1_epi32((int32_t)sinpi[2]);
    2141     4361460 :     const __m256i sinpi3 = _mm256_set1_epi32((int32_t)sinpi[3]);
    2142     4361460 :     const __m256i sinpi4 = _mm256_set1_epi32((int32_t)sinpi[4]);
    2143             :     __m256i t;
    2144             :     __m256i s0, s1, s2, s3, s4, s5, s6, s7;
    2145             :     __m256i x0, x1, x2, x3;
    2146             :     __m256i u0, u1, u2, u3;
    2147             :     __m256i v0, v1, v2, v3;
    2148             :     __m256i out[4];
    2149             : 
    2150     4361460 :     int32_t idx = 0 * num_col;
    2151     4361460 :     s0 = _mm256_mullo_epi32(in[idx], sinpi1);
    2152     4361460 :     s1 = _mm256_mullo_epi32(in[idx], sinpi4);
    2153     4361460 :     t = _mm256_add_epi32(in[idx], in[idx + num_col]);
    2154     4361460 :     idx += num_col;
    2155     4361460 :     s2 = _mm256_mullo_epi32(in[idx], sinpi2);
    2156     4361460 :     s3 = _mm256_mullo_epi32(in[idx], sinpi1);
    2157     4361460 :     idx += num_col;
    2158     4361460 :     s4 = _mm256_mullo_epi32(in[idx], sinpi3);
    2159     4361460 :     idx += num_col;
    2160     4361460 :     s5 = _mm256_mullo_epi32(in[idx], sinpi4);
    2161     4361460 :     s6 = _mm256_mullo_epi32(in[idx], sinpi2);
    2162     8722910 :     s7 = _mm256_sub_epi32(t, in[idx]);
    2163             : 
    2164     4361460 :     t = _mm256_add_epi32(s0, s2);
    2165     4361460 :     x0 = _mm256_add_epi32(t, s5);
    2166     4361460 :     x1 = _mm256_mullo_epi32(s7, sinpi3);
    2167     4361460 :     t = _mm256_sub_epi32(s1, s3);
    2168     4361460 :     x2 = _mm256_add_epi32(t, s6);
    2169     4361460 :     x3 = s4;
    2170             : 
    2171     4361460 :     s0 = _mm256_add_epi32(x0, x3);
    2172     4361460 :     s1 = x1;
    2173     4361460 :     s2 = _mm256_sub_epi32(x2, x3);
    2174     4361460 :     t = _mm256_sub_epi32(x2, x0);
    2175     4361460 :     s3 = _mm256_add_epi32(t, x3);
    2176             : 
    2177     4361460 :     u0 = _mm256_add_epi32(s0, rnding);
    2178     4361460 :     u0 = _mm256_srai_epi32(u0, bit);
    2179             : 
    2180     4361460 :     u1 = _mm256_add_epi32(s1, rnding);
    2181     4361460 :     u1 = _mm256_srai_epi32(u1, bit);
    2182             : 
    2183     4361460 :     u2 = _mm256_add_epi32(s2, rnding);
    2184     4361460 :     u2 = _mm256_srai_epi32(u2, bit);
    2185             : 
    2186     4361460 :     u3 = _mm256_add_epi32(s3, rnding);
    2187     4361460 :     u3 = _mm256_srai_epi32(u3, bit);
    2188             : 
    2189     4361460 :     v0 = _mm256_unpacklo_epi32(u0, u1);
    2190     4361460 :     v1 = _mm256_unpackhi_epi32(u0, u1);
    2191     4361460 :     v2 = _mm256_unpacklo_epi32(u2, u3);
    2192     4361460 :     v3 = _mm256_unpackhi_epi32(u2, u3);
    2193             : 
    2194     4361460 :     out[0] = _mm256_unpacklo_epi64(v0, v2);
    2195     4361460 :     out[1] = _mm256_unpackhi_epi64(v0, v2);
    2196     4361460 :     out[2] = _mm256_unpacklo_epi64(v1, v3);
    2197     4361460 :     out[3] = _mm256_unpackhi_epi64(v1, v3);
    2198             : 
    2199     4361460 :     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
    2200     4361460 :     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
    2201     4361460 :     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
    2202     4361460 :     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
    2203     4361460 : }
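                     : 
                     : /* Editorial sketch: the per-column scalar arithmetic that both
                     : * fadst4x8 variants above vectorize, with sinpi[] as returned by
                     : * sinpi_arr(bit). A reference under those assumptions, not the
                     : * library's own kernel: */
                     : static void fadst4_scalar(const int32_t in[4], int32_t out[4],
                     :     const int32_t *sinpi, int8_t bit) {
                     :     const int64_t rnd = (int64_t)1 << (bit - 1);
                     :     const int64_t x0 = (int64_t)sinpi[1] * in[0] +
                     :         (int64_t)sinpi[2] * in[1] + (int64_t)sinpi[4] * in[3];
                     :     const int64_t x1 = (int64_t)sinpi[3] * (in[0] + in[1] - in[3]);
                     :     const int64_t x2 = (int64_t)sinpi[4] * in[0] -
                     :         (int64_t)sinpi[1] * in[1] + (int64_t)sinpi[2] * in[3];
                     :     const int64_t x3 = (int64_t)sinpi[3] * in[2];
                     :     out[0] = (int32_t)((x0 + x3 + rnd) >> bit);
                     :     out[1] = (int32_t)((x1 + rnd) >> bit);
                     :     out[2] = (int32_t)((x2 - x3 + rnd) >> bit);
                     :     out[3] = (int32_t)((x2 - x0 + x3 + rnd) >> bit);
                     : }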
    2204             : 
    2205    10729000 : static INLINE void fdct4x8_avx2(__m256i *input, __m256i *output,
    2206             :     int32_t bit) {
    2207    10729000 :     __m128i *in = (__m128i *)input;
    2208    10729000 :     __m128i *out = (__m128i *)output;
    2209    10729000 :     const int32_t *cospi = cospi_arr(bit);
    2210    10728500 :     const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
    2211    10728500 :     const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
    2212    10728500 :     const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
    2213    10728500 :     const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
    2214    10728500 :     const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
    2215    10728500 :     const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
    2216    10728500 :     const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
    2217    10728500 :     const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
    2218    10728500 :     const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
    2219             :     __m128i u[8], v[8];
    2220             : 
    2221             :     // 8-point DCT, i.e. the even branch (points 0, 2, ..., 14) of the 16-point flow, run on __m128i halves
    2222             :     // stage 0
    2223             :     // stage 1
    2224    10728500 :     u[0] = _mm_add_epi32(in[0], in[7]);
    2225    10728500 :     v[7] = _mm_sub_epi32(in[0], in[7]);  // v[7]: passes through stage 2
    2226    10728500 :     u[1] = _mm_add_epi32(in[1], in[6]);
    2227    10728500 :     u[6] = _mm_sub_epi32(in[1], in[6]);
    2228    10728500 :     u[2] = _mm_add_epi32(in[2], in[5]);
    2229    10728500 :     u[5] = _mm_sub_epi32(in[2], in[5]);
    2230    10728500 :     u[3] = _mm_add_epi32(in[3], in[4]);
    2231    10728500 :     v[4] = _mm_sub_epi32(in[3], in[4]);  // v[4]: passes through stage 2
    2232             : 
    2233             :     // stage 2
    2234    10728500 :     v[0] = _mm_add_epi32(u[0], u[3]);
    2235    10728500 :     v[3] = _mm_sub_epi32(u[0], u[3]);
    2236    10728500 :     v[1] = _mm_add_epi32(u[1], u[2]);
    2237    10728500 :     v[2] = _mm_sub_epi32(u[1], u[2]);
    2238             : 
    2239    10728500 :     v[5] = _mm_mullo_epi32(u[5], cospim32);
    2240    10728500 :     v[6] = _mm_mullo_epi32(u[6], cospi32);
    2241    10728500 :     v[5] = _mm_add_epi32(v[5], v[6]);
    2242    10728500 :     v[5] = _mm_add_epi32(v[5], rnding);
    2243    10728500 :     v[5] = _mm_srai_epi32(v[5], bit);
    2244             : 
    2245    10728500 :     u[0] = _mm_mullo_epi32(u[5], cospi32);
    2246    10728500 :     v[6] = _mm_mullo_epi32(u[6], cospim32);
    2247    10728500 :     v[6] = _mm_sub_epi32(u[0], v[6]);
    2248    10728500 :     v[6] = _mm_add_epi32(v[6], rnding);
    2249    10728500 :     v[6] = _mm_srai_epi32(v[6], bit);
    2250             : 
    2251             :     // stage 3
    2252             :     // type 0
    2253    10728500 :     v[0] = _mm_mullo_epi32(v[0], cospi32);
    2254    10728500 :     v[1] = _mm_mullo_epi32(v[1], cospi32);
    2255    10728500 :     u[0] = _mm_add_epi32(v[0], v[1]);
    2256    10728500 :     u[0] = _mm_add_epi32(u[0], rnding);
    2257    10728500 :     out[0] = _mm_srai_epi32(u[0], bit);
    2258             : 
    2259    10728500 :     u[1] = _mm_sub_epi32(v[0], v[1]);
    2260    10728500 :     u[1] = _mm_add_epi32(u[1], rnding);
    2261    10728500 :     out[4] = _mm_srai_epi32(u[1], bit);
    2262             : 
    2263             :     // type 1
    2264    10728500 :     v[0] = _mm_mullo_epi32(v[2], cospi48);
    2265    10728500 :     v[1] = _mm_mullo_epi32(v[3], cospi16);
    2266    10728500 :     u[2] = _mm_add_epi32(v[0], v[1]);
    2267    10728500 :     u[2] = _mm_add_epi32(u[2], rnding);
    2268    10728500 :     out[2] = _mm_srai_epi32(u[2], bit);
    2269             : 
    2270    10728500 :     v[0] = _mm_mullo_epi32(v[2], cospi16);
    2271    10728500 :     v[1] = _mm_mullo_epi32(v[3], cospi48);
    2272    10728500 :     u[3] = _mm_sub_epi32(v[1], v[0]);
    2273    10728500 :     u[3] = _mm_add_epi32(u[3], rnding);
    2274    10728500 :     out[6] = _mm_srai_epi32(u[3], bit);
    2275             : 
    2276    10728500 :     u[4] = _mm_add_epi32(v[4], v[5]);
    2277    10728500 :     u[5] = _mm_sub_epi32(v[4], v[5]);
    2278    10728500 :     u[6] = _mm_sub_epi32(v[7], v[6]);
    2279    10728500 :     u[7] = _mm_add_epi32(v[7], v[6]);
    2280             : 
    2281             :     // stage 4
    2282             :     // stage 5
    2283    10728500 :     v[0] = _mm_mullo_epi32(u[4], cospi56);
    2284    10728500 :     v[1] = _mm_mullo_epi32(u[7], cospi8);
    2285    10728500 :     v[0] = _mm_add_epi32(v[0], v[1]);
    2286    10728500 :     v[0] = _mm_add_epi32(v[0], rnding);
    2287    10728500 :     out[1] = _mm_srai_epi32(v[0], bit);  // buf0[4]
    2288             : 
    2289    10728500 :     v[0] = _mm_mullo_epi32(u[4], cospi8);
    2290    10728500 :     v[1] = _mm_mullo_epi32(u[7], cospi56);
    2291    10728500 :     v[0] = _mm_sub_epi32(v[1], v[0]);
    2292    10728500 :     v[0] = _mm_add_epi32(v[0], rnding);
    2293    10728500 :     out[7] = _mm_srai_epi32(v[0], bit);  // buf0[7]
    2294             : 
    2295    10728500 :     v[0] = _mm_mullo_epi32(u[5], cospi24);
    2296    10728500 :     v[1] = _mm_mullo_epi32(u[6], cospi40);
    2297    10728500 :     v[0] = _mm_add_epi32(v[0], v[1]);
    2298    10728500 :     v[0] = _mm_add_epi32(v[0], rnding);
    2299    10728500 :     out[5] = _mm_srai_epi32(v[0], bit);  // buf0[5]
    2300             : 
    2301    10728500 :     v[0] = _mm_mullo_epi32(u[5], cospi40);
    2302    10728500 :     v[1] = _mm_mullo_epi32(u[6], cospi24);
    2303    10728500 :     v[0] = _mm_sub_epi32(v[1], v[0]);
    2304    10728500 :     v[0] = _mm_add_epi32(v[0], rnding);
    2305    10728500 :     out[3] = _mm_srai_epi32(v[0], bit);  // buf0[6]
    2306    10728500 : }
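                     : 
                     : /* Editorial note: the trailing buf0[...] comments above record the
                     : * pre-reorder butterfly slot; the stores land in natural frequency
                     : * order because the final stage applies a 3-bit bit reversal
                     : * (bitrev3(1) = 4, bitrev3(3) = 6, bitrev3(5) = 5, bitrev3(7) = 7).
                     : * A hypothetical helper expressing the mapping: */
                     : static INLINE int32_t bitrev3(int32_t k) {
                     :     return ((k & 1) << 2) | (k & 2) | (k >> 2);
                     : }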
    2307             : 
    2308     5242180 : static void fadst16x16_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
    2309     5242180 :     const int32_t *cospi = cospi_arr(bit);
    2310     5242090 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    2311     5242090 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    2312     5242090 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    2313     5242090 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    2314     5242090 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    2315     5242090 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    2316     5242090 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    2317     5242090 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    2318     5242090 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    2319     5242090 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    2320     5242090 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    2321     5242090 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    2322     5242090 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    2323     5242090 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    2324     5242090 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    2325     5242090 :     const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
    2326     5242090 :     const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
    2327     5242090 :     const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
    2328     5242090 :     const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
    2329     5242090 :     const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
    2330     5242090 :     const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
    2331     5242090 :     const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
    2332     5242090 :     const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
    2333     5242090 :     const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
    2334     5242090 :     const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
    2335     5242090 :     const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
    2336     5242090 :     const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
    2337     5242090 :     const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
    2338     5242090 :     const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
    2339     5242090 :     const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
    2340     5242090 :     const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
    2341     5242090 :     const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
    2342     5242090 :     const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
    2343     5242090 :     const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
    2344     5242090 :     const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
    2345     5242090 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    2346     5242090 :     const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
    2347    10484200 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    2348     5242090 :     const __m256i zero = _mm256_setzero_si256();
    2349             : 
    2350             :     __m256i u[16], v[16], x, y;
    2351             :     int32_t col;
    2352             : 
    2353    13052000 :     for (col = 0; col < col_num; ++col) {
    2354             :         // stage 0
    2355             :         // stage 1
    2356     7809750 :         u[0] = in[0 * col_num + col];
    2357     7809750 :         u[1] = _mm256_sub_epi32(zero, in[15 * col_num + col]);
    2358     7809750 :         u[2] = _mm256_sub_epi32(zero, in[7 * col_num + col]);
    2359     7809750 :         u[3] = in[8 * col_num + col];
    2360     7809750 :         u[4] = _mm256_sub_epi32(zero, in[3 * col_num + col]);
    2361     7809750 :         u[5] = in[12 * col_num + col];
    2362     7809750 :         u[6] = in[4 * col_num + col];
    2363     7809750 :         u[7] = _mm256_sub_epi32(zero, in[11 * col_num + col]);
    2364     7809750 :         u[8] = _mm256_sub_epi32(zero, in[1 * col_num + col]);
    2365     7809750 :         u[9] = in[14 * col_num + col];
    2366     7809750 :         u[10] = in[6 * col_num + col];
    2367     7809750 :         u[11] = _mm256_sub_epi32(zero, in[9 * col_num + col]);
    2368     7809750 :         u[12] = in[2 * col_num + col];
    2369     7809750 :         u[13] = _mm256_sub_epi32(zero, in[13 * col_num + col]);
    2370     7809750 :         u[14] = _mm256_sub_epi32(zero, in[5 * col_num + col]);
    2371     7809750 :         u[15] = in[10 * col_num + col];
    2372             : 
    2373             :         // stage 2
    2374     7809750 :         v[0] = u[0];
    2375     7809750 :         v[1] = u[1];
    2376             : 
    2377     7809750 :         x = _mm256_mullo_epi32(u[2], cospi32);
    2378    15619500 :         y = _mm256_mullo_epi32(u[3], cospi32);
    2379     7809750 :         v[2] = _mm256_add_epi32(x, y);
    2380     7809750 :         v[2] = _mm256_add_epi32(v[2], rnding);
    2381    15619500 :         v[2] = _mm256_srai_epi32(v[2], bit);
    2382             : 
    2383     7809750 :         v[3] = _mm256_sub_epi32(x, y);
    2384     7809750 :         v[3] = _mm256_add_epi32(v[3], rnding);
    2385     7809750 :         v[3] = _mm256_srai_epi32(v[3], bit);
    2386             : 
    2387     7809750 :         v[4] = u[4];
    2388     7809750 :         v[5] = u[5];
    2389             : 
    2390     7809750 :         x = _mm256_mullo_epi32(u[6], cospi32);
    2391    15619500 :         y = _mm256_mullo_epi32(u[7], cospi32);
    2392     7809750 :         v[6] = _mm256_add_epi32(x, y);
    2393     7809750 :         v[6] = _mm256_add_epi32(v[6], rnding);
    2394    15619500 :         v[6] = _mm256_srai_epi32(v[6], bit);
    2395             : 
    2396     7809750 :         v[7] = _mm256_sub_epi32(x, y);
    2397     7809750 :         v[7] = _mm256_add_epi32(v[7], rnding);
    2398     7809750 :         v[7] = _mm256_srai_epi32(v[7], bit);
    2399             : 
    2400     7809750 :         v[8] = u[8];
    2401     7809750 :         v[9] = u[9];
    2402             : 
    2403     7809750 :         x = _mm256_mullo_epi32(u[10], cospi32);
    2404    15619500 :         y = _mm256_mullo_epi32(u[11], cospi32);
    2405     7809750 :         v[10] = _mm256_add_epi32(x, y);
    2406     7809750 :         v[10] = _mm256_add_epi32(v[10], rnding);
    2407    15619500 :         v[10] = _mm256_srai_epi32(v[10], bit);
    2408             : 
    2409     7809750 :         v[11] = _mm256_sub_epi32(x, y);
    2410     7809750 :         v[11] = _mm256_add_epi32(v[11], rnding);
    2411     7809750 :         v[11] = _mm256_srai_epi32(v[11], bit);
    2412             : 
    2413     7809750 :         v[12] = u[12];
    2414     7809750 :         v[13] = u[13];
    2415             : 
    2416     7809750 :         x = _mm256_mullo_epi32(u[14], cospi32);
    2417    15619500 :         y = _mm256_mullo_epi32(u[15], cospi32);
    2418     7809750 :         v[14] = _mm256_add_epi32(x, y);
    2419     7809750 :         v[14] = _mm256_add_epi32(v[14], rnding);
    2420    15619500 :         v[14] = _mm256_srai_epi32(v[14], bit);
    2421             : 
    2422     7809750 :         v[15] = _mm256_sub_epi32(x, y);
    2423     7809750 :         v[15] = _mm256_add_epi32(v[15], rnding);
    2424     7809750 :         v[15] = _mm256_srai_epi32(v[15], bit);
    2425             : 
    2426             :         // stage 3
    2427     7809750 :         u[0] = _mm256_add_epi32(v[0], v[2]);
    2428     7809750 :         u[1] = _mm256_add_epi32(v[1], v[3]);
    2429     7809750 :         u[2] = _mm256_sub_epi32(v[0], v[2]);
    2430     7809750 :         u[3] = _mm256_sub_epi32(v[1], v[3]);
    2431     7809750 :         u[4] = _mm256_add_epi32(v[4], v[6]);
    2432     7809750 :         u[5] = _mm256_add_epi32(v[5], v[7]);
    2433     7809750 :         u[6] = _mm256_sub_epi32(v[4], v[6]);
    2434     7809750 :         u[7] = _mm256_sub_epi32(v[5], v[7]);
    2435     7809750 :         u[8] = _mm256_add_epi32(v[8], v[10]);
    2436     7809750 :         u[9] = _mm256_add_epi32(v[9], v[11]);
    2437     7809750 :         u[10] = _mm256_sub_epi32(v[8], v[10]);
    2438     7809750 :         u[11] = _mm256_sub_epi32(v[9], v[11]);
    2439     7809750 :         u[12] = _mm256_add_epi32(v[12], v[14]);
    2440     7809750 :         u[13] = _mm256_add_epi32(v[13], v[15]);
    2441     7809750 :         u[14] = _mm256_sub_epi32(v[12], v[14]);
    2442     7809750 :         u[15] = _mm256_sub_epi32(v[13], v[15]);
    2443             : 
    2444             :         // stage 4
    2445     7809750 :         v[0] = u[0];
    2446     7809750 :         v[1] = u[1];
    2447     7809750 :         v[2] = u[2];
    2448     7809750 :         v[3] = u[3];
    2449     7809750 :         v[4] = half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
    2450     7809660 :         v[5] = half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
    2451     7809480 :         v[6] = half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
    2452     7809290 :         v[7] = half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
    2453     7809200 :         v[8] = u[8];
    2454     7809200 :         v[9] = u[9];
    2455     7809200 :         v[10] = u[10];
    2456     7809200 :         v[11] = u[11];
    2457     7809200 :         v[12] = half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
    2458     7809340 :         v[13] = half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
    2459     7809210 :         v[14] = half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
    2460     7809160 :         v[15] = half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
    2461             : 
    2462             :         // stage 5
    2463     7809150 :         u[0] = _mm256_add_epi32(v[0], v[4]);
    2464     7809150 :         u[1] = _mm256_add_epi32(v[1], v[5]);
    2465     7809150 :         u[2] = _mm256_add_epi32(v[2], v[6]);
    2466     7809150 :         u[3] = _mm256_add_epi32(v[3], v[7]);
    2467     7809150 :         u[4] = _mm256_sub_epi32(v[0], v[4]);
    2468     7809150 :         u[5] = _mm256_sub_epi32(v[1], v[5]);
    2469     7809150 :         u[6] = _mm256_sub_epi32(v[2], v[6]);
    2470     7809150 :         u[7] = _mm256_sub_epi32(v[3], v[7]);
    2471     7809150 :         u[8] = _mm256_add_epi32(v[8], v[12]);
    2472     7809150 :         u[9] = _mm256_add_epi32(v[9], v[13]);
    2473     7809150 :         u[10] = _mm256_add_epi32(v[10], v[14]);
    2474     7809150 :         u[11] = _mm256_add_epi32(v[11], v[15]);
    2475     7809150 :         u[12] = _mm256_sub_epi32(v[8], v[12]);
    2476     7809150 :         u[13] = _mm256_sub_epi32(v[9], v[13]);
    2477     7809150 :         u[14] = _mm256_sub_epi32(v[10], v[14]);
    2478     7809150 :         u[15] = _mm256_sub_epi32(v[11], v[15]);
    2479             : 
    2480             :         // stage 6
    2481     7809150 :         v[0] = u[0];
    2482     7809150 :         v[1] = u[1];
    2483     7809150 :         v[2] = u[2];
    2484     7809150 :         v[3] = u[3];
    2485     7809150 :         v[4] = u[4];
    2486     7809150 :         v[5] = u[5];
    2487     7809150 :         v[6] = u[6];
    2488     7809150 :         v[7] = u[7];
    2489     7809150 :         v[8] = half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
    2490     7809440 :         v[9] = half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
    2491     7809290 :         v[10] = half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
    2492     7809200 :         v[11] = half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
    2493     7809080 :         v[12] = half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
    2494     7808950 :         v[13] = half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
    2495     7808880 :         v[14] = half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
    2496     7808840 :         v[15] = half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
    2497             : 
    2498             :         // stage 7
    2499     7808990 :         u[0] = _mm256_add_epi32(v[0], v[8]);
    2500     7808990 :         u[1] = _mm256_add_epi32(v[1], v[9]);
    2501     7808990 :         u[2] = _mm256_add_epi32(v[2], v[10]);
    2502     7808990 :         u[3] = _mm256_add_epi32(v[3], v[11]);
    2503     7808990 :         u[4] = _mm256_add_epi32(v[4], v[12]);
    2504     7808990 :         u[5] = _mm256_add_epi32(v[5], v[13]);
    2505     7808990 :         u[6] = _mm256_add_epi32(v[6], v[14]);
    2506     7808990 :         u[7] = _mm256_add_epi32(v[7], v[15]);
    2507     7808990 :         u[8] = _mm256_sub_epi32(v[0], v[8]);
    2508     7808990 :         u[9] = _mm256_sub_epi32(v[1], v[9]);
    2509     7808990 :         u[10] = _mm256_sub_epi32(v[2], v[10]);
    2510     7808990 :         u[11] = _mm256_sub_epi32(v[3], v[11]);
    2511     7808990 :         u[12] = _mm256_sub_epi32(v[4], v[12]);
    2512     7808990 :         u[13] = _mm256_sub_epi32(v[5], v[13]);
    2513     7808990 :         u[14] = _mm256_sub_epi32(v[6], v[14]);
    2514     7808990 :         u[15] = _mm256_sub_epi32(v[7], v[15]);
    2515             : 
    2516             :         // stage 8
    2517     7808990 :         v[0] = half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
    2518     7809460 :         v[1] = half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
    2519     7809210 :         v[2] = half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
    2520     7809140 :         v[3] = half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
    2521     7809090 :         v[4] = half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
    2522     7809070 :         v[5] = half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
    2523     7809040 :         v[6] = half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
    2524     7809040 :         v[7] = half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
    2525     7809020 :         v[8] = half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
    2526     7809000 :         v[9] = half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
    2527     7809030 :         v[10] = half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
    2528     7809030 :         v[11] = half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
    2529     7808870 :         v[12] = half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
    2530     7808800 :         v[13] = half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
    2531     7808780 :         v[14] = half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
    2532     7808730 :         v[15] = half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
    2533             : 
    2534             :         // stage 9
    2535     7809880 :         out[0 * col_num + col] = v[1];
    2536     7809880 :         out[1 * col_num + col] = v[14];
    2537     7809880 :         out[2 * col_num + col] = v[3];
    2538     7809880 :         out[3 * col_num + col] = v[12];
    2539     7809880 :         out[4 * col_num + col] = v[5];
    2540     7809880 :         out[5 * col_num + col] = v[10];
    2541     7809880 :         out[6 * col_num + col] = v[7];
    2542     7809880 :         out[7 * col_num + col] = v[8];
    2543     7809880 :         out[8 * col_num + col] = v[9];
    2544     7809880 :         out[9 * col_num + col] = v[6];
    2545     7809880 :         out[10 * col_num + col] = v[11];
    2546     7809880 :         out[11 * col_num + col] = v[4];
    2547     7809880 :         out[12 * col_num + col] = v[13];
    2548     7809880 :         out[13 * col_num + col] = v[2];
    2549     7809880 :         out[14 * col_num + col] = v[15];
    2550     7809880 :         out[15 * col_num + col] = v[0];
    2551             :     }
    2552     5242220 : }
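                     : 
                     : /* Editorial sketch: stage 1 of the ADST above is a fixed input
                     : * permutation with sign flips, written out longhand. The same data as
                     : * tables, transcribed from the loads and negations (an illustration,
                     : * not used by the code): */
                     : static const int32_t kAdst16InIdx[16] = {
                     :     0, 15, 7, 8, 3, 12, 4, 11, 1, 14, 6, 9, 2, 13, 5, 10
                     : };
                     : static const int32_t kAdst16InSign[16] = {
                     :     +1, -1, -1, +1, -1, +1, +1, -1, -1, +1, +1, -1, +1, -1, -1, +1
                     : };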
    2553             : 
    2554     7143920 : void eb_av1_fwd_txfm2d_16x16_avx2(int16_t *input, int32_t *coeff, uint32_t stride, TxType tx_type, uint8_t bd)
    2555             : {
    2556             :     __m256i in[32], out[32];
    2557     7143920 :     const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
    2558     7143920 :     const int32_t txw_idx = get_txw_idx(TX_16X16);
    2559     7143740 :     const int32_t txh_idx = get_txh_idx(TX_16X16);
    2560     7144920 :     const int32_t col_num = 2;
    2561     7144920 :     switch (tx_type) {
    2562      399596 :     case IDTX:
    2563      399596 :         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
    2564      399603 :         fidtx16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2565      399604 :         col_txfm_16x16_rounding(out, -shift[1]);
    2566      399606 :         fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2567      399602 :         write_buffer_16x16(out, coeff);
    2568      399605 :         break;
    2569     4652780 :     case DCT_DCT:
    2570     4652780 :         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
    2571     4652930 :         fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2572     4653140 :         col_txfm_16x16_rounding(out, -shift[1]);
    2573     4653140 :         transpose_16x16_avx2(out, in);
    2574     4653080 :         fdct16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2575     4653090 :         transpose_16x16_avx2(out, in);
    2576     4653100 :         write_buffer_16x16(in, coeff);
    2577     4652960 :         break;
    2578      410312 :     case ADST_DCT:
    2579      410312 :         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
    2580      410321 :         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2581      410321 :         col_txfm_16x16_rounding(out, -shift[1]);
    2582      410323 :         transpose_16x16_avx2(out, in);
    2583      410324 :         fdct16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2584      410322 :         transpose_16x16_avx2(out, in);
    2585      410323 :         write_buffer_16x16(in, coeff);
    2586      410321 :         break;
    2587      411547 :     case DCT_ADST:
    2588      411547 :         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
    2589      411556 :         fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2590      411563 :         col_txfm_16x16_rounding(out, -shift[1]);
    2591      411563 :         transpose_16x16_avx2(out, in);
    2592      411567 :         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2593      411561 :         transpose_16x16_avx2(out, in);
    2594      411564 :         write_buffer_16x16(in, coeff);
    2595      411560 :         break;
    2596      356861 :     case ADST_ADST:
    2597      356861 :         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
    2598      356866 :         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2599      356874 :         col_txfm_16x16_rounding(out, -shift[1]);
    2600      356874 :         transpose_16x16_avx2(out, in);
    2601      356873 :         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2602      356871 :         transpose_16x16_avx2(out, in);
    2603      356872 :         write_buffer_16x16(in, coeff);
    2604      356869 :         break;
    2605      129117 :     case DCT_FLIPADST:
    2606      129117 :         load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
    2607      129118 :         fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2608      129118 :         col_txfm_16x16_rounding(out, -shift[1]);
    2609      129117 :         transpose_16x16_avx2(out, in);
    2610      129117 :         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2611      129117 :         transpose_16x16_avx2(out, in);
    2612      129118 :         write_buffer_16x16(in, coeff);
    2613      129116 :         break;
    2614      128915 :     case FLIPADST_DCT:
    2615      128915 :         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
    2616      128916 :         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2617      128916 :         col_txfm_16x16_rounding(out, -shift[1]);
    2618      128916 :         transpose_16x16_avx2(out, in);
    2619      128916 :         fdct16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2620      128916 :         transpose_16x16_avx2(out, in);
    2621      128916 :         write_buffer_16x16(in, coeff);
    2622      128916 :         break;
    2623      128990 :     case FLIPADST_FLIPADST:
    2624      128990 :         load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
    2625      128992 :         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2626      128993 :         col_txfm_16x16_rounding(out, -shift[1]);
    2627      128993 :         transpose_16x16_avx2(out, in);
    2628      128993 :         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2629      128991 :         transpose_16x16_avx2(out, in);
    2630      128993 :         write_buffer_16x16(in, coeff);
    2631      128992 :         break;
    2632      128942 :     case ADST_FLIPADST:
    2633      128942 :         load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
    2634      128942 :         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2635      128942 :         col_txfm_16x16_rounding(out, -shift[1]);
    2636      128942 :         transpose_16x16_avx2(out, in);
    2637      128942 :         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2638      128942 :         transpose_16x16_avx2(out, in);
    2639      128941 :         write_buffer_16x16(in, coeff);
    2640      128941 :         break;
    2641      129330 :     case FLIPADST_ADST:
    2642      129330 :         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
    2643      129332 :         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2644      129332 :         col_txfm_16x16_rounding(out, -shift[1]);
    2645      129331 :         transpose_16x16_avx2(out, in);
    2646      129332 :         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2647      129331 :         transpose_16x16_avx2(out, in);
    2648      129332 :         write_buffer_16x16(in, coeff);
    2649      129332 :         break;
    2650      135833 :     case V_DCT:
    2651      135833 :         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
    2652      135833 :         fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2653      135833 :         col_txfm_16x16_rounding(out, -shift[1]);
    2654      135832 :         fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2655      135833 :         write_buffer_16x16(out, coeff);
    2656      135833 :         break;
    2657      132700 :     case H_DCT:
    2658      132700 :         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
    2659      132700 :         fidtx16x16_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2660      132699 :         col_txfm_16x16_rounding(in, -shift[1]);
    2661      132699 :         transpose_16x16_avx2(in, out);
    2662      132700 :         fdct16x16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2663      132700 :         transpose_16x16_avx2(in, out);
    2664      132700 :         write_buffer_16x16(out, coeff);
    2665      132700 :         break;
    2666           0 :     case V_ADST:
    2667           0 :         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
    2668           0 :         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2669           0 :         col_txfm_16x16_rounding(out, -shift[1]);
    2670           0 :         fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2671           0 :         write_buffer_16x16(out, coeff);
    2672           0 :         break;
    2673           0 :     case H_ADST:
    2674           0 :         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
    2675           0 :         fidtx16x16_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2676           0 :         col_txfm_16x16_rounding(in, -shift[1]);
    2677           0 :         transpose_16x16_avx2(in, out);
    2678           0 :         fadst16x16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2679           0 :         transpose_16x16_avx2(in, out);
    2680           0 :         write_buffer_16x16(out, coeff);
    2681           0 :         break;
    2682           0 :     case V_FLIPADST:
    2683           0 :         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
    2684           0 :         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2685           0 :         col_txfm_16x16_rounding(out, -shift[1]);
    2686           0 :         fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2687           0 :         write_buffer_16x16(out, coeff);
    2688           0 :         break;
    2689           0 :     case H_FLIPADST:
    2690           0 :         load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
    2691           0 :         fidtx16x16_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
    2692           0 :         col_txfm_16x16_rounding(in, -shift[1]);
    2693           0 :         transpose_16x16_avx2(in, out);
    2694           0 :         fadst16x16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
    2695           0 :         transpose_16x16_avx2(in, out);
    2696           0 :         write_buffer_16x16(out, coeff);
    2697           0 :         break;
    2698           0 :     default: assert(0);
    2699             :     }
    2700             :     (void)bd;
    2701     7145150 : }
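                     : 
                     : /* Editorial usage sketch (hypothetical caller; the stride, tx_type and
                     : * bd values are assumptions): a 16x16 residual block in, 256 transform
                     : * coefficients out. bd is accepted for prototype compatibility and
                     : * ignored by this kernel. */
                     : static void example_fwd_txfm_16x16(int16_t residual[16 * 16],
                     :     int32_t coeff[16 * 16]) {
                     :     eb_av1_fwd_txfm2d_16x16_avx2(residual, coeff, /*stride=*/16,
                     :         DCT_DCT, /*bd=*/10);
                     : }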
    2702             : 
    2703    13048600 : static void av1_fdct32_new_avx2(const __m256i *input, __m256i *output,
    2704             :     int8_t cos_bit, const int32_t col_num, const int32_t stride) {
    2705    13048600 :     const int32_t *cospi = cospi_arr(cos_bit);
    2706    13048500 :     const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
    2707    13048500 :     const int32_t columns = col_num >> 3;
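                     :     // Unlike fdct16x16_avx2 above, col_num here appears to count
                     :     // individual int32 columns (8 per __m256i), hence the >> 3.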
    2708             : 
    2709    13048500 :     __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
    2710    13048500 :     __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
    2711    13048500 :     __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
    2712    13048500 :     __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
    2713    13048500 :     __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
    2714    13048500 :     __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
    2715    13048500 :     __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
    2716    13048500 :     __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
    2717    13048500 :     __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
    2718    13048500 :     __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
    2719    13048500 :     __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
    2720    13048500 :     __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
    2721    13048500 :     __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
    2722    13048500 :     __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
    2723    13048500 :     __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
    2724    13048500 :     __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
    2725    13048500 :     __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
    2726    13048500 :     __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
    2727    13048500 :     __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
    2728    13048500 :     __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
    2729    13048500 :     __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
    2730    13048500 :     __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
    2731    13048500 :     __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
    2732    13048500 :     __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
    2733    13048500 :     __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
    2734    13048500 :     __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
    2735    13048500 :     __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
    2736    13048500 :     __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
    2737    13048500 :     __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
    2738    13048500 :     __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
    2739    13048500 :     __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
    2740    13048500 :     __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
    2741    13048500 :     __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
    2742    13048500 :     __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
    2743    13048500 :     __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
    2744    13048500 :     __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
    2745    13048500 :     __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
    2746    13048500 :     __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
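                     : 
                     :     /* cospi[] is AV1's fixed-point cosine table for this cos_bit, roughly
                     :      * round(cos(j * PI / 128) * (1 << cos_bit)); broadcasting each twiddle
                     :      * factor once hoists the _mm256_set1_epi32 work out of the column loop. */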
    2747             : 
    2748             :     __m256i buf0[32];
    2749             :     __m256i buf1[32];
    2750             : 
    2751    43311900 :     for (int32_t col = 0; col < columns; col++) {
    2752    30263400 :         const __m256i *in = &input[col];
    2753    30263400 :         __m256i *out = &output[col];
    2754             : 
    2755             :         // stage 0: nothing to compute (pass-through)
    2756             :         // stage 1
    2757    30263400 :         buf1[0] = _mm256_add_epi32(in[0 * stride], in[31 * stride]);
    2758    30263400 :         buf1[31] = _mm256_sub_epi32(in[0 * stride], in[31 * stride]);
    2759    30263400 :         buf1[1] = _mm256_add_epi32(in[1 * stride], in[30 * stride]);
    2760    30263400 :         buf1[30] = _mm256_sub_epi32(in[1 * stride], in[30 * stride]);
    2761    30263400 :         buf1[2] = _mm256_add_epi32(in[2 * stride], in[29 * stride]);
    2762    30263400 :         buf1[29] = _mm256_sub_epi32(in[2 * stride], in[29 * stride]);
    2763    30263400 :         buf1[3] = _mm256_add_epi32(in[3 * stride], in[28 * stride]);
    2764    30263400 :         buf1[28] = _mm256_sub_epi32(in[3 * stride], in[28 * stride]);
    2765    30263400 :         buf1[4] = _mm256_add_epi32(in[4 * stride], in[27 * stride]);
    2766    30263400 :         buf1[27] = _mm256_sub_epi32(in[4 * stride], in[27 * stride]);
    2767    30263400 :         buf1[5] = _mm256_add_epi32(in[5 * stride], in[26 * stride]);
    2768    30263400 :         buf1[26] = _mm256_sub_epi32(in[5 * stride], in[26 * stride]);
    2769    30263400 :         buf1[6] = _mm256_add_epi32(in[6 * stride], in[25 * stride]);
    2770    30263400 :         buf1[25] = _mm256_sub_epi32(in[6 * stride], in[25 * stride]);
    2771    30263400 :         buf1[7] = _mm256_add_epi32(in[7 * stride], in[24 * stride]);
    2772    30263400 :         buf1[24] = _mm256_sub_epi32(in[7 * stride], in[24 * stride]);
    2773    30263400 :         buf1[8] = _mm256_add_epi32(in[8 * stride], in[23 * stride]);
    2774    30263400 :         buf1[23] = _mm256_sub_epi32(in[8 * stride], in[23 * stride]);
    2775    30263400 :         buf1[9] = _mm256_add_epi32(in[9 * stride], in[22 * stride]);
    2776    30263400 :         buf1[22] = _mm256_sub_epi32(in[9 * stride], in[22 * stride]);
    2777    30263400 :         buf1[10] = _mm256_add_epi32(in[10 * stride], in[21 * stride]);
    2778    30263400 :         buf1[21] = _mm256_sub_epi32(in[10 * stride], in[21 * stride]);
    2779    30263400 :         buf1[11] = _mm256_add_epi32(in[11 * stride], in[20 * stride]);
    2780    30263400 :         buf1[20] = _mm256_sub_epi32(in[11 * stride], in[20 * stride]);
    2781    30263400 :         buf1[12] = _mm256_add_epi32(in[12 * stride], in[19 * stride]);
    2782    30263400 :         buf1[19] = _mm256_sub_epi32(in[12 * stride], in[19 * stride]);
    2783    30263400 :         buf1[13] = _mm256_add_epi32(in[13 * stride], in[18 * stride]);
    2784    30263400 :         buf1[18] = _mm256_sub_epi32(in[13 * stride], in[18 * stride]);
    2785    30263400 :         buf1[14] = _mm256_add_epi32(in[14 * stride], in[17 * stride]);
    2786    30263400 :         buf1[17] = _mm256_sub_epi32(in[14 * stride], in[17 * stride]);
    2787    30263400 :         buf1[15] = _mm256_add_epi32(in[15 * stride], in[16 * stride]);
    2788    30263400 :         buf1[16] = _mm256_sub_epi32(in[15 * stride], in[16 * stride]);
    2789             : 
    2790             :         // stage 2
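                     :         // from here buf[0..15] proceeds as a 16-point DCT while buf[16..31]
                     :         // feeds the odd outputs; the -/+cospi32 butterflies rotate the middle
                     :         // half of that odd branch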
    2791    30263400 :         buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]);
    2792    30263400 :         buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
    2793    30263400 :         buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]);
    2794    30263400 :         buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
    2795    30263400 :         buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]);
    2796    30263400 :         buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
    2797    30263400 :         buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]);
    2798    30263400 :         buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
    2799    30263400 :         buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]);
    2800    30263400 :         buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
    2801    30263400 :         buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]);
    2802    30263400 :         buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
    2803    30263400 :         buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]);
    2804    30263400 :         buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]);
    2805    30263400 :         buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]);
    2806    30263400 :         buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]);
    2807    30263400 :         buf0[16] = buf1[16];
    2808    30263400 :         buf0[17] = buf1[17];
    2809    30263400 :         buf0[18] = buf1[18];
    2810    30263400 :         buf0[19] = buf1[19];
    2811   302634000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf1[20], buf1[27],
    2812             :             buf0[20], buf0[27], __rounding, cos_bit);
    2813   302634000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf1[21], buf1[26],
    2814             :             buf0[21], buf0[26], __rounding, cos_bit);
    2815   302634000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf1[22], buf1[25],
    2816             :             buf0[22], buf0[25], __rounding, cos_bit);
    2817   302634000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf1[23], buf1[24],
    2818             :             buf0[23], buf0[24], __rounding, cos_bit);
    2819    30263400 :         buf0[28] = buf1[28];
    2820    30263400 :         buf0[29] = buf1[29];
    2821    30263400 :         buf0[30] = buf1[30];
    2822    30263400 :         buf0[31] = buf1[31];
    2823             : 
    2824             :         // stage 3
    2825    30263400 :         buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
    2826    30263400 :         buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
    2827    30263400 :         buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
    2828    30263400 :         buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
    2829    30263400 :         buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
    2830    30263400 :         buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
    2831    30263400 :         buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
    2832    30263400 :         buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
    2833    30263400 :         buf1[8] = buf0[8];
    2834    30263400 :         buf1[9] = buf0[9];
    2835   302634000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf0[10], buf0[13],
    2836             :             buf1[10], buf1[13], __rounding, cos_bit);
    2837   302634000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf0[11], buf0[12],
    2838             :             buf1[11], buf1[12], __rounding, cos_bit);
    2839    30263400 :         buf1[14] = buf0[14];
    2840    30263400 :         buf1[15] = buf0[15];
    2841    30263400 :         buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
    2842    30263400 :         buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
    2843    30263400 :         buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
    2844    30263400 :         buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
    2845    30263400 :         buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
    2846    30263400 :         buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
    2847    30263400 :         buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
    2848    30263400 :         buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
    2849    30263400 :         buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
    2850    30263400 :         buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
    2851    30263400 :         buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
    2852    30263400 :         buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
    2853    30263400 :         buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
    2854    30263400 :         buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
    2855    30263400 :         buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
    2856    30263400 :         buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);
    2857             : 
    2858             :         // stage 4
    2859    30263400 :         buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
    2860    30263400 :         buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
    2861    30263400 :         buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
    2862    30263400 :         buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
    2863    30263400 :         buf0[4] = buf1[4];
    2864   302634000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, buf1[5], buf1[6],
    2865             :             buf0[5], buf0[6], __rounding, cos_bit);
    2866    30263400 :         buf0[7] = buf1[7];
    2867    30263400 :         buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]);
    2868    30263400 :         buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
    2869    30263400 :         buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]);
    2870    30263400 :         buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
    2871    30263400 :         buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
    2872    30263400 :         buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
    2873    30263400 :         buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
    2874    30263400 :         buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
    2875    30263400 :         buf0[16] = buf1[16];
    2876    30263400 :         buf0[17] = buf1[17];
    2877   302634000 :         btf_32_type0_avx2_new(cospi_m16, cospi_p48, buf1[18], buf1[29],
    2878             :             buf0[18], buf0[29], __rounding, cos_bit);
    2879   302634000 :         btf_32_type0_avx2_new(cospi_m16, cospi_p48, buf1[19], buf1[28],
    2880             :             buf0[19], buf0[28], __rounding, cos_bit);
    2881   302634000 :         btf_32_type0_avx2_new(cospi_m48, cospi_m16, buf1[20], buf1[27],
    2882             :             buf0[20], buf0[27], __rounding, cos_bit);
    2883   302634000 :         btf_32_type0_avx2_new(cospi_m48, cospi_m16, buf1[21], buf1[26],
    2884             :             buf0[21], buf0[26], __rounding, cos_bit);
    2885    30263400 :         buf0[22] = buf1[22];
    2886    30263400 :         buf0[23] = buf1[23];
    2887    30263400 :         buf0[24] = buf1[24];
    2888    30263400 :         buf0[25] = buf1[25];
    2889    30263400 :         buf0[30] = buf1[30];
    2890    30263400 :         buf0[31] = buf1[31];
    2891             : 
    2892             :         // stage 5
    2893   302634000 :         btf_32_type0_avx2_new(cospi_p32, cospi_p32, buf0[0], buf0[1],
    2894             :             buf1[0], buf1[1], __rounding, cos_bit);
    2895   302634000 :         btf_32_type1_avx2_new(cospi_p48, cospi_p16, buf0[2], buf0[3],
    2896             :             buf1[2], buf1[3], __rounding, cos_bit);
    2897    30263400 :         buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
    2898    30263400 :         buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
    2899    30263400 :         buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
    2900    30263400 :         buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
    2901    30263400 :         buf1[8] = buf0[8];
    2902   302634000 :         btf_32_type0_avx2_new(cospi_m16, cospi_p48, buf0[9], buf0[14],
    2903             :             buf1[9], buf1[14], __rounding, cos_bit);
    2904   302634000 :         btf_32_type0_avx2_new(cospi_m48, cospi_m16, buf0[10], buf0[13],
    2905             :             buf1[10], buf1[13], __rounding, cos_bit);
    2906    30263400 :         buf1[11] = buf0[11];
    2907    30263400 :         buf1[12] = buf0[12];
    2908    30263400 :         buf1[15] = buf0[15];
    2909    30263400 :         buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
    2910    30263400 :         buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
    2911    30263400 :         buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
    2912    30263400 :         buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
    2913    30263400 :         buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
    2914    30263400 :         buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
    2915    30263400 :         buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
    2916    30263400 :         buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
    2917    30263400 :         buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
    2918    30263400 :         buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
    2919    30263400 :         buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
    2920    30263400 :         buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
    2921    30263400 :         buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
    2922    30263400 :         buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
    2923    30263400 :         buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
    2924    30263400 :         buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);
    2925             : 
    2926             :         // stage 6
    2927    30263400 :         buf0[0] = buf1[0];
    2928    30263400 :         buf0[1] = buf1[1];
    2929    30263400 :         buf0[2] = buf1[2];
    2930    30263400 :         buf0[3] = buf1[3];
    2931   302634000 :         btf_32_type1_avx2_new(cospi_p56, cospi_p08, buf1[4], buf1[7],
    2932             :             buf0[4], buf0[7], __rounding, cos_bit);
    2933   302634000 :         btf_32_type1_avx2_new(cospi_p24, cospi_p40, buf1[5], buf1[6],
    2934             :             buf0[5], buf0[6], __rounding, cos_bit);
    2935    30263400 :         buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]);
    2936    30263400 :         buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]);
    2937    30263400 :         buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
    2938    30263400 :         buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
    2939    30263400 :         buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
    2940    30263400 :         buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
    2941    30263400 :         buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
    2942    30263400 :         buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
    2943    30263400 :         buf0[16] = buf1[16];
    2944   302634000 :         btf_32_type0_avx2_new(cospi_m08, cospi_p56, buf1[17], buf1[30],
    2945             :             buf0[17], buf0[30], __rounding, cos_bit);
    2946   302634000 :         btf_32_type0_avx2_new(cospi_m56, cospi_m08, buf1[18], buf1[29],
    2947             :             buf0[18], buf0[29], __rounding, cos_bit);
    2949    30263400 :         buf0[19] = buf1[19];
    2950    30263400 :         buf0[20] = buf1[20];
    2951   302634000 :         btf_32_type0_avx2_new(cospi_m40, cospi_p24, buf1[21], buf1[26],
    2952             :             buf0[21], buf0[26], __rounding, cos_bit);
    2953   302634000 :         btf_32_type0_avx2_new(cospi_m24, cospi_m40, buf1[22], buf1[25],
    2954             :             buf0[22], buf0[25], __rounding, cos_bit);
    2955    30263400 :         buf0[23] = buf1[23];
    2956    30263400 :         buf0[24] = buf1[24];
    2957    30263400 :         buf0[27] = buf1[27];
    2958    30263400 :         buf0[28] = buf1[28];
    2959    30263400 :         buf0[31] = buf1[31];
    2960             : 
    2961             :         // stage 7
    2962    30263400 :         buf1[0] = buf0[0];
    2963    30263400 :         buf1[1] = buf0[1];
    2964    30263400 :         buf1[2] = buf0[2];
    2965    30263400 :         buf1[3] = buf0[3];
    2966    30263400 :         buf1[4] = buf0[4];
    2967    30263400 :         buf1[5] = buf0[5];
    2968    30263400 :         buf1[6] = buf0[6];
    2969    30263400 :         buf1[7] = buf0[7];
    2970   302634000 :         btf_32_type1_avx2_new(cospi_p60, cospi_p04, buf0[8], buf0[15],
    2971             :             buf1[8], buf1[15], __rounding, cos_bit);
    2972   302634000 :         btf_32_type1_avx2_new(cospi_p28, cospi_p36, buf0[9], buf0[14],
    2973             :             buf1[9], buf1[14], __rounding, cos_bit);
    2974   302634000 :         btf_32_type1_avx2_new(cospi_p44, cospi_p20, buf0[10], buf0[13],
    2975             :             buf1[10], buf1[13], __rounding, cos_bit);
    2976   302634000 :         btf_32_type1_avx2_new(cospi_p12, cospi_p52, buf0[11], buf0[12],
    2977             :             buf1[11], buf1[12], __rounding, cos_bit);
    2978    30263400 :         buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
    2979    30263400 :         buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
    2980    30263400 :         buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
    2981    30263400 :         buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
    2982    30263400 :         buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
    2983    30263400 :         buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
    2984    30263400 :         buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
    2985    30263400 :         buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
    2986    30263400 :         buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
    2987    30263400 :         buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
    2988    30263400 :         buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
    2989    30263400 :         buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
    2990    30263400 :         buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
    2991    30263400 :         buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
    2992    30263400 :         buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
    2993    30263400 :         buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);
    2994             : 
    2995             :         // stage 8
    2996    30263400 :         buf0[0] = buf1[0];
    2997    30263400 :         buf0[1] = buf1[1];
    2998    30263400 :         buf0[2] = buf1[2];
    2999    30263400 :         buf0[3] = buf1[3];
    3000    30263400 :         buf0[4] = buf1[4];
    3001    30263400 :         buf0[5] = buf1[5];
    3002    30263400 :         buf0[6] = buf1[6];
    3003    30263400 :         buf0[7] = buf1[7];
    3004    30263400 :         buf0[8] = buf1[8];
    3005    30263400 :         buf0[9] = buf1[9];
    3006    30263400 :         buf0[10] = buf1[10];
    3007    30263400 :         buf0[11] = buf1[11];
    3008    30263400 :         buf0[12] = buf1[12];
    3009    30263400 :         buf0[13] = buf1[13];
    3010    30263400 :         buf0[14] = buf1[14];
    3011    30263400 :         buf0[15] = buf1[15];
    3012   302634000 :         btf_32_type1_avx2_new(cospi_p62, cospi_p02, buf1[16], buf1[31],
    3013             :             buf0[16], buf0[31], __rounding, cos_bit);
    3014   302634000 :         btf_32_type1_avx2_new(cospi_p30, cospi_p34, buf1[17], buf1[30],
    3015             :             buf0[17], buf0[30], __rounding, cos_bit);
    3016   302634000 :         btf_32_type1_avx2_new(cospi_p46, cospi_p18, buf1[18], buf1[29],
    3017             :             buf0[18], buf0[29], __rounding, cos_bit);
    3018   302634000 :         btf_32_type1_avx2_new(cospi_p14, cospi_p50, buf1[19], buf1[28],
    3019             :             buf0[19], buf0[28], __rounding, cos_bit);
    3020   302634000 :         btf_32_type1_avx2_new(cospi_p54, cospi_p10, buf1[20], buf1[27],
    3021             :             buf0[20], buf0[27], __rounding, cos_bit);
    3022   302634000 :         btf_32_type1_avx2_new(cospi_p22, cospi_p42, buf1[21], buf1[26],
    3023             :             buf0[21], buf0[26], __rounding, cos_bit);
    3024   302634000 :         btf_32_type1_avx2_new(cospi_p38, cospi_p26, buf1[22], buf1[25],
    3025             :             buf0[22], buf0[25], __rounding, cos_bit);
    3026   302634000 :         btf_32_type1_avx2_new(cospi_p06, cospi_p58, buf1[23], buf1[24],
    3027             :             buf0[23], buf0[24], __rounding, cos_bit);
    3028             : 
    3029             :         // stage 9
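                     :         // the butterfly graph leaves results in bit-reversed positions, so
                     :         // out[k * stride] = buf0[bitrev5(k)] (1 -> 16, 2 -> 8, 3 -> 24, ...)
                     :         // restores natural frequency order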
    3030    30263400 :         out[0 * stride] = buf0[0];
    3031    30263400 :         out[1 * stride] = buf0[16];
    3032    30263400 :         out[2 * stride] = buf0[8];
    3033    30263400 :         out[3 * stride] = buf0[24];
    3034    30263400 :         out[4 * stride] = buf0[4];
    3035    30263400 :         out[5 * stride] = buf0[20];
    3036    30263400 :         out[6 * stride] = buf0[12];
    3037    30263400 :         out[7 * stride] = buf0[28];
    3038    30263400 :         out[8 * stride] = buf0[2];
    3039    30263400 :         out[9 * stride] = buf0[18];
    3040    30263400 :         out[10 * stride] = buf0[10];
    3041    30263400 :         out[11 * stride] = buf0[26];
    3042    30263400 :         out[12 * stride] = buf0[6];
    3043    30263400 :         out[13 * stride] = buf0[22];
    3044    30263400 :         out[14 * stride] = buf0[14];
    3045    30263400 :         out[15 * stride] = buf0[30];
    3046    30263400 :         out[16 * stride] = buf0[1];
    3047    30263400 :         out[17 * stride] = buf0[17];
    3048    30263400 :         out[18 * stride] = buf0[9];
    3049    30263400 :         out[19 * stride] = buf0[25];
    3050    30263400 :         out[20 * stride] = buf0[5];
    3051    30263400 :         out[21 * stride] = buf0[21];
    3052    30263400 :         out[22 * stride] = buf0[13];
    3053    30263400 :         out[23 * stride] = buf0[29];
    3054    30263400 :         out[24 * stride] = buf0[3];
    3055    30263400 :         out[25 * stride] = buf0[19];
    3056    30263400 :         out[26 * stride] = buf0[11];
    3057    30263400 :         out[27 * stride] = buf0[27];
    3058    30263400 :         out[28 * stride] = buf0[7];
    3059    30263400 :         out[29 * stride] = buf0[23];
    3060    30263400 :         out[30 * stride] = buf0[15];
    3061    30263400 :         out[31 * stride] = buf0[31];
    3062             :     }
    3063    13048500 : }
    3064             : 
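                     : /* Pins col_num to 8 (one __m256i column group per call), presumably so the
                     :  * 32-point kernel fits the file's fixed-signature 1-D transform shape. */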
    3065     8737700 : static void av1_fdct32_new_line_wraper_avx2(const __m256i *input,
    3066             :     __m256i *output, int8_t cos_bit, const int32_t stride) {
    3067     8737700 :     av1_fdct32_new_avx2(input, output, cos_bit, 8, stride);
    3068     8737820 : }
    3069             : 
    3070     3344550 : static void av1_fdct64_new_avx2(const __m256i *input, __m256i *output,
    3071             :     int8_t cos_bit, const int32_t col_num, const int32_t stride) {
    3072     3344550 :     const int32_t *cospi = cospi_arr(cos_bit);
    3073     3344570 :     const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
    3074     3344570 :     const int32_t columns = col_num >> 3;
    3075             : 
    3076     3344570 :     __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
    3077     3344570 :     __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
    3078     3344570 :     __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
    3079     3344570 :     __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
    3080     3344570 :     __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
    3081     3344570 :     __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
    3082     3344570 :     __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
    3083     3344570 :     __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
    3084     3344570 :     __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
    3085     3344570 :     __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
    3086     3344570 :     __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
    3087     3344570 :     __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
    3088     3344570 :     __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
    3089     3344570 :     __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
    3090     3344570 :     __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
    3091     3344570 :     __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
    3092     3344570 :     __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
    3093     3344570 :     __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
    3094     3344570 :     __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
    3095     3344570 :     __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
    3096     3344570 :     __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
    3097     3344570 :     __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
    3098     3344570 :     __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
    3099     3344570 :     __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
    3100     3344570 :     __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
    3101     3344570 :     __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
    3102     3344570 :     __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
    3103     3344570 :     __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
    3104     3344570 :     __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
    3105     3344570 :     __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
    3106     3344570 :     __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
    3107     3344570 :     __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
    3108     3344570 :     __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
    3109     3344570 :     __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
    3110     3344570 :     __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
    3111     3344570 :     __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
    3112     3344570 :     __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
    3113     3344570 :     __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
    3114     3344570 :     __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
    3115     3344570 :     __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
    3116     3344570 :     __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
    3117     3344570 :     __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
    3118     3344570 :     __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
    3119     3344570 :     __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
    3120     3344570 :     __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
    3121     3344570 :     __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
    3122     3344570 :     __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
    3123     3344570 :     __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
    3124     3344570 :     __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
    3125     3344570 :     __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
    3126     3344570 :     __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
    3127     3344570 :     __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
    3128     3344570 :     __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
    3129     3344570 :     __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
    3130     3344570 :     __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
    3131     3344570 :     __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
    3132     3344570 :     __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
    3133     3344570 :     __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
    3134     3344570 :     __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
    3135     3344570 :     __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
    3136     3344570 :     __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
    3137     3344570 :     __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
    3138     3344570 :     __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
    3139     3344570 :     __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
    3140     3344570 :     __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
    3141     3344570 :     __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
    3142     3344570 :     __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
    3143     3344570 :     __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
    3144     3344570 :     __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
    3145     3344570 :     __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
    3146     3344570 :     __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
    3147     3344570 :     __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
    3148     3344570 :     __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
    3149     3344570 :     __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
    3150     3344570 :     __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
    3151     3344570 :     __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
    3152     3344570 :     __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
    3153     3344570 :     __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
    3154             : 
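                     :     /* Same stage-by-stage structure as av1_fdct32_new_avx2, one radix-2
                     :      * level deeper; per-stage xN scratch arrays are used here instead of
                     :      * the buf0/buf1 ping-pong of the 32-point kernel. */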
    3155    15943600 :     for (int32_t col = 0; col < columns; col++) {
    3156    12599000 :         const __m256i *in = &input[col];
    3157    12599000 :         __m256i *out = &output[col];
    3158             : 
    3159             :         // stage 1
    3160             :         __m256i x1[64];
    3161    12599000 :         x1[0] = _mm256_add_epi32(in[0 * stride], in[63 * stride]);
    3162    12599000 :         x1[63] = _mm256_sub_epi32(in[0 * stride], in[63 * stride]);
    3163    12599000 :         x1[1] = _mm256_add_epi32(in[1 * stride], in[62 * stride]);
    3164    12599000 :         x1[62] = _mm256_sub_epi32(in[1 * stride], in[62 * stride]);
    3165    12599000 :         x1[2] = _mm256_add_epi32(in[2 * stride], in[61 * stride]);
    3166    12599000 :         x1[61] = _mm256_sub_epi32(in[2 * stride], in[61 * stride]);
    3167    12599000 :         x1[3] = _mm256_add_epi32(in[3 * stride], in[60 * stride]);
    3168    12599000 :         x1[60] = _mm256_sub_epi32(in[3 * stride], in[60 * stride]);
    3169    12599000 :         x1[4] = _mm256_add_epi32(in[4 * stride], in[59 * stride]);
    3170    12599000 :         x1[59] = _mm256_sub_epi32(in[4 * stride], in[59 * stride]);
    3171    12599000 :         x1[5] = _mm256_add_epi32(in[5 * stride], in[58 * stride]);
    3172    12599000 :         x1[58] = _mm256_sub_epi32(in[5 * stride], in[58 * stride]);
    3173    12599000 :         x1[6] = _mm256_add_epi32(in[6 * stride], in[57 * stride]);
    3174    12599000 :         x1[57] = _mm256_sub_epi32(in[6 * stride], in[57 * stride]);
    3175    12599000 :         x1[7] = _mm256_add_epi32(in[7 * stride], in[56 * stride]);
    3176    12599000 :         x1[56] = _mm256_sub_epi32(in[7 * stride], in[56 * stride]);
    3177    12599000 :         x1[8] = _mm256_add_epi32(in[8 * stride], in[55 * stride]);
    3178    12599000 :         x1[55] = _mm256_sub_epi32(in[8 * stride], in[55 * stride]);
    3179    12599000 :         x1[9] = _mm256_add_epi32(in[9 * stride], in[54 * stride]);
    3180    12599000 :         x1[54] = _mm256_sub_epi32(in[9 * stride], in[54 * stride]);
    3181    12599000 :         x1[10] = _mm256_add_epi32(in[10 * stride], in[53 * stride]);
    3182    12599000 :         x1[53] = _mm256_sub_epi32(in[10 * stride], in[53 * stride]);
    3183    12599000 :         x1[11] = _mm256_add_epi32(in[11 * stride], in[52 * stride]);
    3184    12599000 :         x1[52] = _mm256_sub_epi32(in[11 * stride], in[52 * stride]);
    3185    12599000 :         x1[12] = _mm256_add_epi32(in[12 * stride], in[51 * stride]);
    3186    12599000 :         x1[51] = _mm256_sub_epi32(in[12 * stride], in[51 * stride]);
    3187    12599000 :         x1[13] = _mm256_add_epi32(in[13 * stride], in[50 * stride]);
    3188    12599000 :         x1[50] = _mm256_sub_epi32(in[13 * stride], in[50 * stride]);
    3189    12599000 :         x1[14] = _mm256_add_epi32(in[14 * stride], in[49 * stride]);
    3190    12599000 :         x1[49] = _mm256_sub_epi32(in[14 * stride], in[49 * stride]);
    3191    12599000 :         x1[15] = _mm256_add_epi32(in[15 * stride], in[48 * stride]);
    3192    12599000 :         x1[48] = _mm256_sub_epi32(in[15 * stride], in[48 * stride]);
    3193    12599000 :         x1[16] = _mm256_add_epi32(in[16 * stride], in[47 * stride]);
    3194    12599000 :         x1[47] = _mm256_sub_epi32(in[16 * stride], in[47 * stride]);
    3195    12599000 :         x1[17] = _mm256_add_epi32(in[17 * stride], in[46 * stride]);
    3196    12599000 :         x1[46] = _mm256_sub_epi32(in[17 * stride], in[46 * stride]);
    3197    12599000 :         x1[18] = _mm256_add_epi32(in[18 * stride], in[45 * stride]);
    3198    12599000 :         x1[45] = _mm256_sub_epi32(in[18 * stride], in[45 * stride]);
    3199    12599000 :         x1[19] = _mm256_add_epi32(in[19 * stride], in[44 * stride]);
    3200    12599000 :         x1[44] = _mm256_sub_epi32(in[19 * stride], in[44 * stride]);
    3201    12599000 :         x1[20] = _mm256_add_epi32(in[20 * stride], in[43 * stride]);
    3202    12599000 :         x1[43] = _mm256_sub_epi32(in[20 * stride], in[43 * stride]);
    3203    12599000 :         x1[21] = _mm256_add_epi32(in[21 * stride], in[42 * stride]);
    3204    12599000 :         x1[42] = _mm256_sub_epi32(in[21 * stride], in[42 * stride]);
    3205    12599000 :         x1[22] = _mm256_add_epi32(in[22 * stride], in[41 * stride]);
    3206    12599000 :         x1[41] = _mm256_sub_epi32(in[22 * stride], in[41 * stride]);
    3207    12599000 :         x1[23] = _mm256_add_epi32(in[23 * stride], in[40 * stride]);
    3208    12599000 :         x1[40] = _mm256_sub_epi32(in[23 * stride], in[40 * stride]);
    3209    12599000 :         x1[24] = _mm256_add_epi32(in[24 * stride], in[39 * stride]);
    3210    12599000 :         x1[39] = _mm256_sub_epi32(in[24 * stride], in[39 * stride]);
    3211    12599000 :         x1[25] = _mm256_add_epi32(in[25 * stride], in[38 * stride]);
    3212    12599000 :         x1[38] = _mm256_sub_epi32(in[25 * stride], in[38 * stride]);
    3213    12599000 :         x1[26] = _mm256_add_epi32(in[26 * stride], in[37 * stride]);
    3214    12599000 :         x1[37] = _mm256_sub_epi32(in[26 * stride], in[37 * stride]);
    3215    12599000 :         x1[27] = _mm256_add_epi32(in[27 * stride], in[36 * stride]);
    3216    12599000 :         x1[36] = _mm256_sub_epi32(in[27 * stride], in[36 * stride]);
    3217    12599000 :         x1[28] = _mm256_add_epi32(in[28 * stride], in[35 * stride]);
    3218    12599000 :         x1[35] = _mm256_sub_epi32(in[28 * stride], in[35 * stride]);
    3219    12599000 :         x1[29] = _mm256_add_epi32(in[29 * stride], in[34 * stride]);
    3220    12599000 :         x1[34] = _mm256_sub_epi32(in[29 * stride], in[34 * stride]);
    3221    12599000 :         x1[30] = _mm256_add_epi32(in[30 * stride], in[33 * stride]);
    3222    12599000 :         x1[33] = _mm256_sub_epi32(in[30 * stride], in[33 * stride]);
    3223    12599000 :         x1[31] = _mm256_add_epi32(in[31 * stride], in[32 * stride]);
    3224    12599000 :         x1[32] = _mm256_sub_epi32(in[31 * stride], in[32 * stride]);
    3225             : 
    3226             :         // stage 2
    3227             :         __m256i x2[64];
    3228    12599000 :         x2[0] = _mm256_add_epi32(x1[0], x1[31]);
    3229    12599000 :         x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
    3230    12599000 :         x2[1] = _mm256_add_epi32(x1[1], x1[30]);
    3231    12599000 :         x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
    3232    12599000 :         x2[2] = _mm256_add_epi32(x1[2], x1[29]);
    3233    12599000 :         x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
    3234    12599000 :         x2[3] = _mm256_add_epi32(x1[3], x1[28]);
    3235    12599000 :         x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
    3236    12599000 :         x2[4] = _mm256_add_epi32(x1[4], x1[27]);
    3237    12599000 :         x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
    3238    12599000 :         x2[5] = _mm256_add_epi32(x1[5], x1[26]);
    3239    12599000 :         x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
    3240    12599000 :         x2[6] = _mm256_add_epi32(x1[6], x1[25]);
    3241    12599000 :         x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
    3242    12599000 :         x2[7] = _mm256_add_epi32(x1[7], x1[24]);
    3243    12599000 :         x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
    3244    12599000 :         x2[8] = _mm256_add_epi32(x1[8], x1[23]);
    3245    12599000 :         x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
    3246    12599000 :         x2[9] = _mm256_add_epi32(x1[9], x1[22]);
    3247    12599000 :         x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
    3248    12599000 :         x2[10] = _mm256_add_epi32(x1[10], x1[21]);
    3249    12599000 :         x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
    3250    12599000 :         x2[11] = _mm256_add_epi32(x1[11], x1[20]);
    3251    12599000 :         x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
    3252    12599000 :         x2[12] = _mm256_add_epi32(x1[12], x1[19]);
    3253    12599000 :         x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
    3254    12599000 :         x2[13] = _mm256_add_epi32(x1[13], x1[18]);
    3255    12599000 :         x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
    3256    12599000 :         x2[14] = _mm256_add_epi32(x1[14], x1[17]);
    3257    12599000 :         x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
    3258    12599000 :         x2[15] = _mm256_add_epi32(x1[15], x1[16]);
    3259    12599000 :         x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
    3260    12599000 :         x2[32] = x1[32];
    3261    12599000 :         x2[33] = x1[33];
    3262    12599000 :         x2[34] = x1[34];
    3263    12599000 :         x2[35] = x1[35];
    3264    12599000 :         x2[36] = x1[36];
    3265    12599000 :         x2[37] = x1[37];
    3266    12599000 :         x2[38] = x1[38];
    3267    12599000 :         x2[39] = x1[39];
    3268   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[40], x1[55],
    3269             :             x2[40], x2[55], __rounding, cos_bit);
    3270   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[41], x1[54],
    3271             :             x2[41], x2[54], __rounding, cos_bit);
    3272   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[42], x1[53],
    3273             :             x2[42], x2[53], __rounding, cos_bit);
    3274   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[43], x1[52],
    3275             :             x2[43], x2[52], __rounding, cos_bit);
    3276   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[44], x1[51],
    3277             :             x2[44], x2[51], __rounding, cos_bit);
    3278   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[45], x1[50],
    3279             :             x2[45], x2[50], __rounding, cos_bit);
    3280   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[46], x1[49],
    3281             :             x2[46], x2[49], __rounding, cos_bit);
    3282   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x1[47], x1[48],
    3283             :             x2[47], x2[48], __rounding, cos_bit);
    3284    12599000 :         x2[56] = x1[56];
    3285    12599000 :         x2[57] = x1[57];
    3286    12599000 :         x2[58] = x1[58];
    3287    12599000 :         x2[59] = x1[59];
    3288    12599000 :         x2[60] = x1[60];
    3289    12599000 :         x2[61] = x1[61];
    3290    12599000 :         x2[62] = x1[62];
    3291    12599000 :         x2[63] = x1[63];
    3292             : 
    3293             :         // stage 3
    3294             :         __m256i x3[64];
    3295    12599000 :         x3[0] = _mm256_add_epi32(x2[0], x2[15]);
    3296    12599000 :         x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
    3297    12599000 :         x3[1] = _mm256_add_epi32(x2[1], x2[14]);
    3298    12599000 :         x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
    3299    12599000 :         x3[2] = _mm256_add_epi32(x2[2], x2[13]);
    3300    12599000 :         x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
    3301    12599000 :         x3[3] = _mm256_add_epi32(x2[3], x2[12]);
    3302    12599000 :         x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
    3303    12599000 :         x3[4] = _mm256_add_epi32(x2[4], x2[11]);
    3304    12599000 :         x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
    3305    12599000 :         x3[5] = _mm256_add_epi32(x2[5], x2[10]);
    3306    12599000 :         x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
    3307    12599000 :         x3[6] = _mm256_add_epi32(x2[6], x2[9]);
    3308    12599000 :         x3[9] = _mm256_sub_epi32(x2[6], x2[9]);
    3309    12599000 :         x3[7] = _mm256_add_epi32(x2[7], x2[8]);
    3310    12599000 :         x3[8] = _mm256_sub_epi32(x2[7], x2[8]);
    3311    12599000 :         x3[16] = x2[16];
    3312    12599000 :         x3[17] = x2[17];
    3313    12599000 :         x3[18] = x2[18];
    3314    12599000 :         x3[19] = x2[19];
    3315   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x2[20], x2[27],
    3316             :             x3[20], x3[27], __rounding, cos_bit);
    3317   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x2[21], x2[26],
    3318             :             x3[21], x3[26], __rounding, cos_bit);
    3319   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x2[22], x2[25],
    3320             :             x3[22], x3[25], __rounding, cos_bit);
    3321   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x2[23], x2[24],
    3322             :             x3[23], x3[24], __rounding, cos_bit);
    3323    12599000 :         x3[28] = x2[28];
    3324    12599000 :         x3[29] = x2[29];
    3325    12599000 :         x3[30] = x2[30];
    3326    12599000 :         x3[31] = x2[31];
    3327    12599000 :         x3[32] = _mm256_add_epi32(x2[32], x2[47]);
    3328    12599000 :         x3[47] = _mm256_sub_epi32(x2[32], x2[47]);
    3329    12599000 :         x3[33] = _mm256_add_epi32(x2[33], x2[46]);
    3330    12599000 :         x3[46] = _mm256_sub_epi32(x2[33], x2[46]);
    3331    12599000 :         x3[34] = _mm256_add_epi32(x2[34], x2[45]);
    3332    12599000 :         x3[45] = _mm256_sub_epi32(x2[34], x2[45]);
    3333    12599000 :         x3[35] = _mm256_add_epi32(x2[35], x2[44]);
    3334    12599000 :         x3[44] = _mm256_sub_epi32(x2[35], x2[44]);
    3335    12599000 :         x3[36] = _mm256_add_epi32(x2[36], x2[43]);
    3336    12599000 :         x3[43] = _mm256_sub_epi32(x2[36], x2[43]);
    3337    12599000 :         x3[37] = _mm256_add_epi32(x2[37], x2[42]);
    3338    12599000 :         x3[42] = _mm256_sub_epi32(x2[37], x2[42]);
    3339    12599000 :         x3[38] = _mm256_add_epi32(x2[38], x2[41]);
    3340    12599000 :         x3[41] = _mm256_sub_epi32(x2[38], x2[41]);
    3341    12599000 :         x3[39] = _mm256_add_epi32(x2[39], x2[40]);
    3342    12599000 :         x3[40] = _mm256_sub_epi32(x2[39], x2[40]);
    3343    12599000 :         x3[48] = _mm256_sub_epi32(x2[63], x2[48]);
    3344    12599000 :         x3[63] = _mm256_add_epi32(x2[63], x2[48]);
    3345    12599000 :         x3[49] = _mm256_sub_epi32(x2[62], x2[49]);
    3346    12599000 :         x3[62] = _mm256_add_epi32(x2[62], x2[49]);
    3347    12599000 :         x3[50] = _mm256_sub_epi32(x2[61], x2[50]);
    3348    12599000 :         x3[61] = _mm256_add_epi32(x2[61], x2[50]);
    3349    12599000 :         x3[51] = _mm256_sub_epi32(x2[60], x2[51]);
    3350    12599000 :         x3[60] = _mm256_add_epi32(x2[60], x2[51]);
    3351    12599000 :         x3[52] = _mm256_sub_epi32(x2[59], x2[52]);
    3352    12599000 :         x3[59] = _mm256_add_epi32(x2[59], x2[52]);
    3353    12599000 :         x3[53] = _mm256_sub_epi32(x2[58], x2[53]);
    3354    12599000 :         x3[58] = _mm256_add_epi32(x2[58], x2[53]);
    3355    12599000 :         x3[54] = _mm256_sub_epi32(x2[57], x2[54]);
    3356    12599000 :         x3[57] = _mm256_add_epi32(x2[57], x2[54]);
    3357    12599000 :         x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
    3358    12599000 :         x3[56] = _mm256_add_epi32(x2[56], x2[55]);
    3359             : 
    3360             :         // stage 4
    3361             :         __m256i x4[64];
    3362    12599000 :         x4[0] = _mm256_add_epi32(x3[0], x3[7]);
    3363    12599000 :         x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
    3364    12599000 :         x4[1] = _mm256_add_epi32(x3[1], x3[6]);
    3365    12599000 :         x4[6] = _mm256_sub_epi32(x3[1], x3[6]);
    3366    12599000 :         x4[2] = _mm256_add_epi32(x3[2], x3[5]);
    3367    12599000 :         x4[5] = _mm256_sub_epi32(x3[2], x3[5]);
    3368    12599000 :         x4[3] = _mm256_add_epi32(x3[3], x3[4]);
    3369    12599000 :         x4[4] = _mm256_sub_epi32(x3[3], x3[4]);
    3370    12599000 :         x4[8] = x3[8];
    3371    12599000 :         x4[9] = x3[9];
    3372   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x3[10], x3[13],
    3373             :             x4[10], x4[13], __rounding, cos_bit);
    3374   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x3[11], x3[12],
    3375             :             x4[11], x4[12], __rounding, cos_bit);
    3376    12599000 :         x4[14] = x3[14];
    3377    12599000 :         x4[15] = x3[15];
    3378    12599000 :         x4[16] = _mm256_add_epi32(x3[16], x3[23]);
    3379    12599000 :         x4[23] = _mm256_sub_epi32(x3[16], x3[23]);
    3380    12599000 :         x4[17] = _mm256_add_epi32(x3[17], x3[22]);
    3381    12599000 :         x4[22] = _mm256_sub_epi32(x3[17], x3[22]);
    3382    12599000 :         x4[18] = _mm256_add_epi32(x3[18], x3[21]);
    3383    12599000 :         x4[21] = _mm256_sub_epi32(x3[18], x3[21]);
    3384    12599000 :         x4[19] = _mm256_add_epi32(x3[19], x3[20]);
    3385    12599000 :         x4[20] = _mm256_sub_epi32(x3[19], x3[20]);
    3386    12599000 :         x4[24] = _mm256_sub_epi32(x3[31], x3[24]);
    3387    12599000 :         x4[31] = _mm256_add_epi32(x3[31], x3[24]);
    3388    12599000 :         x4[25] = _mm256_sub_epi32(x3[30], x3[25]);
    3389    12599000 :         x4[30] = _mm256_add_epi32(x3[30], x3[25]);
    3390    12599000 :         x4[26] = _mm256_sub_epi32(x3[29], x3[26]);
    3391    12599000 :         x4[29] = _mm256_add_epi32(x3[29], x3[26]);
    3392    12599000 :         x4[27] = _mm256_sub_epi32(x3[28], x3[27]);
    3393    12599000 :         x4[28] = _mm256_add_epi32(x3[28], x3[27]);
    3394    12599000 :         x4[32] = x3[32];
    3395    12599000 :         x4[33] = x3[33];
    3396    12599000 :         x4[34] = x3[34];
    3397    12599000 :         x4[35] = x3[35];
    3398   125990000 :         btf_32_type0_avx2_new(cospi_m16, cospi_p48, x3[36], x3[59],
    3399             :             x4[36], x4[59], __rounding, cos_bit);
    3400   125990000 :         btf_32_type0_avx2_new(cospi_m16, cospi_p48, x3[37], x3[58],
    3401             :             x4[37], x4[58], __rounding, cos_bit);
    3402   125990000 :         btf_32_type0_avx2_new(cospi_m16, cospi_p48, x3[38], x3[57],
    3403             :             x4[38], x4[57], __rounding, cos_bit);
    3404   125990000 :         btf_32_type0_avx2_new(cospi_m16, cospi_p48, x3[39], x3[56],
    3405             :             x4[39], x4[56], __rounding, cos_bit);
    3406   125990000 :         btf_32_type0_avx2_new(cospi_m48, cospi_m16, x3[40], x3[55],
    3407             :             x4[40], x4[55], __rounding, cos_bit);
    3408   125990000 :         btf_32_type0_avx2_new(cospi_m48, cospi_m16, x3[41], x3[54],
    3409             :             x4[41], x4[54], __rounding, cos_bit);
    3410   125990000 :         btf_32_type0_avx2_new(cospi_m48, cospi_m16, x3[42], x3[53],
    3411             :             x4[42], x4[53], __rounding, cos_bit);
    3412   125990000 :         btf_32_type0_avx2_new(cospi_m48, cospi_m16, x3[43], x3[52],
    3413             :             x4[43], x4[52], __rounding, cos_bit);
    3414    12599000 :         x4[44] = x3[44];
    3415    12599000 :         x4[45] = x3[45];
    3416    12599000 :         x4[46] = x3[46];
    3417    12599000 :         x4[47] = x3[47];
    3418    12599000 :         x4[48] = x3[48];
    3419    12599000 :         x4[49] = x3[49];
    3420    12599000 :         x4[50] = x3[50];
    3421    12599000 :         x4[51] = x3[51];
    3422    12599000 :         x4[60] = x3[60];
    3423    12599000 :         x4[61] = x3[61];
    3424    12599000 :         x4[62] = x3[62];
    3425    12599000 :         x4[63] = x3[63];
    3426             : 
    3427             :         // stage 5
    3428             :         __m256i x5[64];
    3429    12599000 :         x5[0] = _mm256_add_epi32(x4[0], x4[3]);
    3430    12599000 :         x5[3] = _mm256_sub_epi32(x4[0], x4[3]);
    3431    12599000 :         x5[1] = _mm256_add_epi32(x4[1], x4[2]);
    3432    12599000 :         x5[2] = _mm256_sub_epi32(x4[1], x4[2]);
    3433    12599000 :         x5[4] = x4[4];
    3434   125990000 :         btf_32_type0_avx2_new(cospi_m32, cospi_p32, x4[5], x4[6],
    3435             :             x5[5], x5[6], __rounding, cos_bit);
    3436    12599000 :         x5[7] = x4[7];
    3437    12599000 :         x5[8] = _mm256_add_epi32(x4[8], x4[11]);
    3438    12599000 :         x5[11] = _mm256_sub_epi32(x4[8], x4[11]);
    3439    12599000 :         x5[9] = _mm256_add_epi32(x4[9], x4[10]);
    3440    12599000 :         x5[10] = _mm256_sub_epi32(x4[9], x4[10]);
    3441    12599000 :         x5[12] = _mm256_sub_epi32(x4[15], x4[12]);
    3442    12599000 :         x5[15] = _mm256_add_epi32(x4[15], x4[12]);
    3443    12599000 :         x5[13] = _mm256_sub_epi32(x4[14], x4[13]);
    3444    12599000 :         x5[14] = _mm256_add_epi32(x4[14], x4[13]);
    3445    12599000 :         x5[16] = x4[16];
    3446    12599000 :         x5[17] = x4[17];
    3447   125990000 :         btf_32_type0_avx2_new(cospi_m16, cospi_p48, x4[18], x4[29],
    3448             :             x5[18], x5[29], __rounding, cos_bit);
    3449   125990000 :         btf_32_type0_avx2_new(cospi_m16, cospi_p48, x4[19], x4[28],
    3450             :             x5[19], x5[28], __rounding, cos_bit);
    3451   125990000 :         btf_32_type0_avx2_new(cospi_m48, cospi_m16, x4[20], x4[27],
    3452             :             x5[20], x5[27], __rounding, cos_bit);
    3453   125990000 :         btf_32_type0_avx2_new(cospi_m48, cospi_m16, x4[21], x4[26],
    3454             :             x5[21], x5[26], __rounding, cos_bit);
    3455    12599000 :         x5[22] = x4[22];
    3456    12599000 :         x5[23] = x4[23];
    3457    12599000 :         x5[24] = x4[24];
    3458    12599000 :         x5[25] = x4[25];
    3459    12599000 :         x5[30] = x4[30];
    3460    12599000 :         x5[31] = x4[31];
    3461    12599000 :         x5[32] = _mm256_add_epi32(x4[32], x4[39]);
    3462    12599000 :         x5[39] = _mm256_sub_epi32(x4[32], x4[39]);
    3463    12599000 :         x5[33] = _mm256_add_epi32(x4[33], x4[38]);
    3464    12599000 :         x5[38] = _mm256_sub_epi32(x4[33], x4[38]);
    3465    12599000 :         x5[34] = _mm256_add_epi32(x4[34], x4[37]);
    3466    12599000 :         x5[37] = _mm256_sub_epi32(x4[34], x4[37]);
    3467    12599000 :         x5[35] = _mm256_add_epi32(x4[35], x4[36]);
    3468    12599000 :         x5[36] = _mm256_sub_epi32(x4[35], x4[36]);
    3469    12599000 :         x5[40] = _mm256_sub_epi32(x4[47], x4[40]);
    3470    12599000 :         x5[47] = _mm256_add_epi32(x4[47], x4[40]);
    3471    12599000 :         x5[41] = _mm256_sub_epi32(x4[46], x4[41]);
    3472    12599000 :         x5[46] = _mm256_add_epi32(x4[46], x4[41]);
    3473    12599000 :         x5[42] = _mm256_sub_epi32(x4[45], x4[42]);
    3474    12599000 :         x5[45] = _mm256_add_epi32(x4[45], x4[42]);
    3475    12599000 :         x5[43] = _mm256_sub_epi32(x4[44], x4[43]);
    3476    12599000 :         x5[44] = _mm256_add_epi32(x4[44], x4[43]);
    3477    12599000 :         x5[48] = _mm256_add_epi32(x4[48], x4[55]);
    3478    12599000 :         x5[55] = _mm256_sub_epi32(x4[48], x4[55]);
    3479    12599000 :         x5[49] = _mm256_add_epi32(x4[49], x4[54]);
    3480    12599000 :         x5[54] = _mm256_sub_epi32(x4[49], x4[54]);
    3481    12599000 :         x5[50] = _mm256_add_epi32(x4[50], x4[53]);
    3482    12599000 :         x5[53] = _mm256_sub_epi32(x4[50], x4[53]);
    3483    12599000 :         x5[51] = _mm256_add_epi32(x4[51], x4[52]);
    3484    12599000 :         x5[52] = _mm256_sub_epi32(x4[51], x4[52]);
    3485    12599000 :         x5[56] = _mm256_sub_epi32(x4[63], x4[56]);
    3486    12599000 :         x5[63] = _mm256_add_epi32(x4[63], x4[56]);
    3487    12599000 :         x5[57] = _mm256_sub_epi32(x4[62], x4[57]);
    3488    12599000 :         x5[62] = _mm256_add_epi32(x4[62], x4[57]);
    3489    12599000 :         x5[58] = _mm256_sub_epi32(x4[61], x4[58]);
    3490    12599000 :         x5[61] = _mm256_add_epi32(x4[61], x4[58]);
    3491    12599000 :         x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
    3492    12599000 :         x5[60] = _mm256_add_epi32(x4[60], x4[59]);
    3493             : 
    3494             :         // stage 6
    3495             :         __m256i x6[64];
    3496   125990000 :         btf_32_type0_avx2_new(cospi_p32, cospi_p32, x5[0], x5[1],
    3497             :             x6[0], x6[1], __rounding, cos_bit);
    3498   125990000 :         btf_32_type1_avx2_new(cospi_p48, cospi_p16, x5[2], x5[3],
    3499             :             x6[2], x6[3], __rounding, cos_bit);
    3500    12599000 :         x6[4] = _mm256_add_epi32(x5[4], x5[5]);
    3501    12599000 :         x6[5] = _mm256_sub_epi32(x5[4], x5[5]);
    3502    12599000 :         x6[6] = _mm256_sub_epi32(x5[7], x5[6]);
    3503    12599000 :         x6[7] = _mm256_add_epi32(x5[7], x5[6]);
    3504    12599000 :         x6[8] = x5[8];
    3505   125990000 :         btf_32_type0_avx2_new(cospi_m16, cospi_p48, x5[9], x5[14],
    3506             :             x6[9], x6[14], __rounding, cos_bit);
    3507   125990000 :         btf_32_type0_avx2_new(cospi_m48, cospi_m16, x5[10], x5[13],
    3508             :             x6[10], x6[13], __rounding, cos_bit);
    3509    12599000 :         x6[11] = x5[11];
    3510    12599000 :         x6[12] = x5[12];
    3511    12599000 :         x6[15] = x5[15];
    3512    12599000 :         x6[16] = _mm256_add_epi32(x5[16], x5[19]);
    3513    12599000 :         x6[19] = _mm256_sub_epi32(x5[16], x5[19]);
    3514    12599000 :         x6[17] = _mm256_add_epi32(x5[17], x5[18]);
    3515    12599000 :         x6[18] = _mm256_sub_epi32(x5[17], x5[18]);
    3516    12599000 :         x6[20] = _mm256_sub_epi32(x5[23], x5[20]);
    3517    12599000 :         x6[23] = _mm256_add_epi32(x5[23], x5[20]);
    3518    12599000 :         x6[21] = _mm256_sub_epi32(x5[22], x5[21]);
    3519    12599000 :         x6[22] = _mm256_add_epi32(x5[22], x5[21]);
    3520    12599000 :         x6[24] = _mm256_add_epi32(x5[24], x5[27]);
    3521    12599000 :         x6[27] = _mm256_sub_epi32(x5[24], x5[27]);
    3522    12599000 :         x6[25] = _mm256_add_epi32(x5[25], x5[26]);
    3523    12599000 :         x6[26] = _mm256_sub_epi32(x5[25], x5[26]);
    3524    12599000 :         x6[28] = _mm256_sub_epi32(x5[31], x5[28]);
    3525    12599000 :         x6[31] = _mm256_add_epi32(x5[31], x5[28]);
    3526    12599000 :         x6[29] = _mm256_sub_epi32(x5[30], x5[29]);
    3527    12599000 :         x6[30] = _mm256_add_epi32(x5[30], x5[29]);
    3528    12599000 :         x6[32] = x5[32];
    3529    12599000 :         x6[33] = x5[33];
    3530   125990000 :         btf_32_type0_avx2_new(cospi_m08, cospi_p56, x5[34], x5[61],
    3531             :             x6[34], x6[61], __rounding, cos_bit);
    3532   125990000 :         btf_32_type0_avx2_new(cospi_m08, cospi_p56, x5[35], x5[60],
    3533             :             x6[35], x6[60], __rounding, cos_bit);
    3534   125990000 :         btf_32_type0_avx2_new(cospi_m56, cospi_m08, x5[36], x5[59],
    3535             :             x6[36], x6[59], __rounding, cos_bit);
    3536   125990000 :         btf_32_type0_avx2_new(cospi_m56, cospi_m08, x5[37], x5[58],
    3537             :             x6[37], x6[58], __rounding, cos_bit);
    3538    12599000 :         x6[38] = x5[38];
    3539    12599000 :         x6[39] = x5[39];
    3540    12599000 :         x6[40] = x5[40];
    3541    12599000 :         x6[41] = x5[41];
    3542   125990000 :         btf_32_type0_avx2_new(cospi_m40, cospi_p24, x5[42], x5[53],
    3543             :             x6[42], x6[53], __rounding, cos_bit);
    3544   125990000 :         btf_32_type0_avx2_new(cospi_m40, cospi_p24, x5[43], x5[52],
    3545             :             x6[43], x6[52], __rounding, cos_bit);
    3546   125990000 :         btf_32_type0_avx2_new(cospi_m24, cospi_m40, x5[44], x5[51],
    3547             :             x6[44], x6[51], __rounding, cos_bit);
    3548   125990000 :         btf_32_type0_avx2_new(cospi_m24, cospi_m40, x5[45], x5[50],
    3549             :             x6[45], x6[50], __rounding, cos_bit);
    3550    12599000 :         x6[46] = x5[46];
    3551    12599000 :         x6[47] = x5[47];
    3552    12599000 :         x6[48] = x5[48];
    3553    12599000 :         x6[49] = x5[49];
    3554    12599000 :         x6[54] = x5[54];
    3555    12599000 :         x6[55] = x5[55];
    3556    12599000 :         x6[56] = x5[56];
    3557    12599000 :         x6[57] = x5[57];
    3558    12599000 :         x6[62] = x5[62];
    3559    12599000 :         x6[63] = x5[63];
    3560             : 
    3561             :         // stage 7
    3562             :         __m256i x7[64];
    3563    12599000 :         x7[0] = x6[0];
    3564    12599000 :         x7[1] = x6[1];
    3565    12599000 :         x7[2] = x6[2];
    3566    12599000 :         x7[3] = x6[3];
    3567   125990000 :         btf_32_type1_avx2_new(cospi_p56, cospi_p08, x6[4], x6[7],
    3568             :             x7[4], x7[7], __rounding, cos_bit);
    3569   125990000 :         btf_32_type1_avx2_new(cospi_p24, cospi_p40, x6[5], x6[6],
    3570             :             x7[5], x7[6], __rounding, cos_bit);
    3571    12599000 :         x7[8] = _mm256_add_epi32(x6[8], x6[9]);
    3572    12599000 :         x7[9] = _mm256_sub_epi32(x6[8], x6[9]);
    3573    12599000 :         x7[10] = _mm256_sub_epi32(x6[11], x6[10]);
    3574    12599000 :         x7[11] = _mm256_add_epi32(x6[11], x6[10]);
    3575    12599000 :         x7[12] = _mm256_add_epi32(x6[12], x6[13]);
    3576    12599000 :         x7[13] = _mm256_sub_epi32(x6[12], x6[13]);
    3577    12599000 :         x7[14] = _mm256_sub_epi32(x6[15], x6[14]);
    3578    12599000 :         x7[15] = _mm256_add_epi32(x6[15], x6[14]);
    3579    12599000 :         x7[16] = x6[16];
    3580   125990000 :         btf_32_type0_avx2_new(cospi_m08, cospi_p56, x6[17], x6[30],
    3581             :             x7[17], x7[30], __rounding, cos_bit);
    3582   125990000 :         btf_32_type0_avx2_new(cospi_m56, cospi_m08, x6[18], x6[29],
    3583             :             x7[18], x7[29], __rounding, cos_bit);
    3584    12599000 :         x7[19] = x6[19];
    3585    12599000 :         x7[20] = x6[20];
    3586   125990000 :         btf_32_type0_avx2_new(cospi_m40, cospi_p24, x6[21], x6[26],
    3587             :             x7[21], x7[26], __rounding, cos_bit);
    3588   125990000 :         btf_32_type0_avx2_new(cospi_m24, cospi_m40, x6[22], x6[25],
    3589             :             x7[22], x7[25], __rounding, cos_bit);
    3590    12599000 :         x7[23] = x6[23];
    3591    12599000 :         x7[24] = x6[24];
    3592    12599000 :         x7[27] = x6[27];
    3593    12599000 :         x7[28] = x6[28];
    3594    12599000 :         x7[31] = x6[31];
    3595    12599000 :         x7[32] = _mm256_add_epi32(x6[32], x6[35]);
    3596    12599000 :         x7[35] = _mm256_sub_epi32(x6[32], x6[35]);
    3597    12599000 :         x7[33] = _mm256_add_epi32(x6[33], x6[34]);
    3598    12599000 :         x7[34] = _mm256_sub_epi32(x6[33], x6[34]);
    3599    12599000 :         x7[36] = _mm256_sub_epi32(x6[39], x6[36]);
    3600    12599000 :         x7[39] = _mm256_add_epi32(x6[39], x6[36]);
    3601    12599000 :         x7[37] = _mm256_sub_epi32(x6[38], x6[37]);
    3602    12599000 :         x7[38] = _mm256_add_epi32(x6[38], x6[37]);
    3603    12599000 :         x7[40] = _mm256_add_epi32(x6[40], x6[43]);
    3604    12599000 :         x7[43] = _mm256_sub_epi32(x6[40], x6[43]);
    3605    12599000 :         x7[41] = _mm256_add_epi32(x6[41], x6[42]);
    3606    12599000 :         x7[42] = _mm256_sub_epi32(x6[41], x6[42]);
    3607    12599000 :         x7[44] = _mm256_sub_epi32(x6[47], x6[44]);
    3608    12599000 :         x7[47] = _mm256_add_epi32(x6[47], x6[44]);
    3609    12599000 :         x7[45] = _mm256_sub_epi32(x6[46], x6[45]);
    3610    12599000 :         x7[46] = _mm256_add_epi32(x6[46], x6[45]);
    3611    12599000 :         x7[48] = _mm256_add_epi32(x6[48], x6[51]);
    3612    12599000 :         x7[51] = _mm256_sub_epi32(x6[48], x6[51]);
    3613    12599000 :         x7[49] = _mm256_add_epi32(x6[49], x6[50]);
    3614    12599000 :         x7[50] = _mm256_sub_epi32(x6[49], x6[50]);
    3615    12599000 :         x7[52] = _mm256_sub_epi32(x6[55], x6[52]);
    3616    12599000 :         x7[55] = _mm256_add_epi32(x6[55], x6[52]);
    3617    12599000 :         x7[53] = _mm256_sub_epi32(x6[54], x6[53]);
    3618    12599000 :         x7[54] = _mm256_add_epi32(x6[54], x6[53]);
    3619    12599000 :         x7[56] = _mm256_add_epi32(x6[56], x6[59]);
    3620    12599000 :         x7[59] = _mm256_sub_epi32(x6[56], x6[59]);
    3621    12599000 :         x7[57] = _mm256_add_epi32(x6[57], x6[58]);
    3622    12599000 :         x7[58] = _mm256_sub_epi32(x6[57], x6[58]);
    3623    12599000 :         x7[60] = _mm256_sub_epi32(x6[63], x6[60]);
    3624    12599000 :         x7[63] = _mm256_add_epi32(x6[63], x6[60]);
    3625    12599000 :         x7[61] = _mm256_sub_epi32(x6[62], x6[61]);
    3626    12599000 :         x7[62] = _mm256_add_epi32(x6[62], x6[61]);
    3627             : 
    3628             :         // stage 8
    3629             :         __m256i x8[64];
    3630    12599000 :         x8[0] = x7[0];
    3631    12599000 :         x8[1] = x7[1];
    3632    12599000 :         x8[2] = x7[2];
    3633    12599000 :         x8[3] = x7[3];
    3634    12599000 :         x8[4] = x7[4];
    3635    12599000 :         x8[5] = x7[5];
    3636    12599000 :         x8[6] = x7[6];
    3637    12599000 :         x8[7] = x7[7];
    3638             : 
    3639   125990000 :         btf_32_type1_avx2_new(cospi_p60, cospi_p04, x7[8], x7[15],
    3640             :             x8[8], x8[15], __rounding, cos_bit);
    3641   125990000 :         btf_32_type1_avx2_new(cospi_p28, cospi_p36, x7[9], x7[14],
    3642             :             x8[9], x8[14], __rounding, cos_bit);
    3643   125990000 :         btf_32_type1_avx2_new(cospi_p44, cospi_p20, x7[10], x7[13],
    3644             :             x8[10], x8[13], __rounding, cos_bit);
    3645   125990000 :         btf_32_type1_avx2_new(cospi_p12, cospi_p52, x7[11], x7[12],
    3646             :             x8[11], x8[12], __rounding, cos_bit);
    3647    12599000 :         x8[16] = _mm256_add_epi32(x7[16], x7[17]);
    3648    12599000 :         x8[17] = _mm256_sub_epi32(x7[16], x7[17]);
    3649    12599000 :         x8[18] = _mm256_sub_epi32(x7[19], x7[18]);
    3650    12599000 :         x8[19] = _mm256_add_epi32(x7[19], x7[18]);
    3651    12599000 :         x8[20] = _mm256_add_epi32(x7[20], x7[21]);
    3652    12599000 :         x8[21] = _mm256_sub_epi32(x7[20], x7[21]);
    3653    12599000 :         x8[22] = _mm256_sub_epi32(x7[23], x7[22]);
    3654    12599000 :         x8[23] = _mm256_add_epi32(x7[23], x7[22]);
    3655    12599000 :         x8[24] = _mm256_add_epi32(x7[24], x7[25]);
    3656    12599000 :         x8[25] = _mm256_sub_epi32(x7[24], x7[25]);
    3657    12599000 :         x8[26] = _mm256_sub_epi32(x7[27], x7[26]);
    3658    12599000 :         x8[27] = _mm256_add_epi32(x7[27], x7[26]);
    3659    12599000 :         x8[28] = _mm256_add_epi32(x7[28], x7[29]);
    3660    12599000 :         x8[29] = _mm256_sub_epi32(x7[28], x7[29]);
    3661    12599000 :         x8[30] = _mm256_sub_epi32(x7[31], x7[30]);
    3662    12599000 :         x8[31] = _mm256_add_epi32(x7[31], x7[30]);
    3663    12599000 :         x8[32] = x7[32];
    3664   125990000 :         btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62],
    3665             :             x8[33], x8[62], __rounding, cos_bit);
    3666   125990000 :         btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61],
    3667             :             x8[34], x8[61], __rounding, cos_bit);
    3668    12599000 :         x8[35] = x7[35];
    3669    12599000 :         x8[36] = x7[36];
    3670   125990000 :         btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58],
    3671             :             x8[37], x8[58], __rounding, cos_bit);
    3672   125990000 :         btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57],
    3673             :             x8[38], x8[57], __rounding, cos_bit);
    3674    12599000 :         x8[39] = x7[39];
    3675    12599000 :         x8[40] = x7[40];
    3676   125990000 :         btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54],
    3677             :             x8[41], x8[54], __rounding, cos_bit);
    3678   125990000 :         btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53],
    3679             :             x8[42], x8[53], __rounding, cos_bit);
    3680    12599000 :         x8[43] = x7[43];
    3681    12599000 :         x8[44] = x7[44];
    3682   125990000 :         btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50],
    3683             :             x8[45], x8[50], __rounding, cos_bit);
    3684   125990000 :         btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49],
    3685             :             x8[46], x8[49], __rounding, cos_bit);
    3686    12599000 :         x8[47] = x7[47];
    3687    12599000 :         x8[48] = x7[48];
    3688    12599000 :         x8[51] = x7[51];
    3689    12599000 :         x8[52] = x7[52];
    3690    12599000 :         x8[55] = x7[55];
    3691    12599000 :         x8[56] = x7[56];
    3692    12599000 :         x8[59] = x7[59];
    3693    12599000 :         x8[60] = x7[60];
    3694    12599000 :         x8[63] = x7[63];
    3695             : 
    3696             :         // stage 9
    3697             :         __m256i x9[64];
    3698    12599000 :         x9[0] = x8[0];
    3699    12599000 :         x9[1] = x8[1];
    3700    12599000 :         x9[2] = x8[2];
    3701    12599000 :         x9[3] = x8[3];
    3702    12599000 :         x9[4] = x8[4];
    3703    12599000 :         x9[5] = x8[5];
    3704    12599000 :         x9[6] = x8[6];
    3705    12599000 :         x9[7] = x8[7];
    3706    12599000 :         x9[8] = x8[8];
    3707    12599000 :         x9[9] = x8[9];
    3708    12599000 :         x9[10] = x8[10];
    3709    12599000 :         x9[11] = x8[11];
    3710    12599000 :         x9[12] = x8[12];
    3711    12599000 :         x9[13] = x8[13];
    3712    12599000 :         x9[14] = x8[14];
    3713    12599000 :         x9[15] = x8[15];
    3714   125990000 :         btf_32_type1_avx2_new(cospi_p62, cospi_p02, x8[16], x8[31],
    3715             :             x9[16], x9[31], __rounding, cos_bit);
    3716   125990000 :         btf_32_type1_avx2_new(cospi_p30, cospi_p34, x8[17], x8[30],
    3717             :             x9[17], x9[30], __rounding, cos_bit);
    3718   125990000 :         btf_32_type1_avx2_new(cospi_p46, cospi_p18, x8[18], x8[29],
    3719             :             x9[18], x9[29], __rounding, cos_bit);
    3720   125990000 :         btf_32_type1_avx2_new(cospi_p14, cospi_p50, x8[19], x8[28],
    3721             :             x9[19], x9[28], __rounding, cos_bit);
    3722   125990000 :         btf_32_type1_avx2_new(cospi_p54, cospi_p10, x8[20], x8[27],
    3723             :             x9[20], x9[27], __rounding, cos_bit);
    3724   125990000 :         btf_32_type1_avx2_new(cospi_p22, cospi_p42, x8[21], x8[26],
    3725             :             x9[21], x9[26], __rounding, cos_bit);
    3726   125990000 :         btf_32_type1_avx2_new(cospi_p38, cospi_p26, x8[22], x8[25],
    3727             :             x9[22], x9[25], __rounding, cos_bit);
    3728   125990000 :         btf_32_type1_avx2_new(cospi_p06, cospi_p58, x8[23], x8[24],
    3729             :             x9[23], x9[24], __rounding, cos_bit);
    3730    12599000 :         x9[32] = _mm256_add_epi32(x8[32], x8[33]);
    3731    12599000 :         x9[33] = _mm256_sub_epi32(x8[32], x8[33]);
    3732    12599000 :         x9[34] = _mm256_sub_epi32(x8[35], x8[34]);
    3733    12599000 :         x9[35] = _mm256_add_epi32(x8[35], x8[34]);
    3734    12599000 :         x9[36] = _mm256_add_epi32(x8[36], x8[37]);
    3735    12599000 :         x9[37] = _mm256_sub_epi32(x8[36], x8[37]);
    3736    12599000 :         x9[38] = _mm256_sub_epi32(x8[39], x8[38]);
    3737    12599000 :         x9[39] = _mm256_add_epi32(x8[39], x8[38]);
    3738    12599000 :         x9[40] = _mm256_add_epi32(x8[40], x8[41]);
    3739    12599000 :         x9[41] = _mm256_sub_epi32(x8[40], x8[41]);
    3740    12599000 :         x9[42] = _mm256_sub_epi32(x8[43], x8[42]);
    3741    12599000 :         x9[43] = _mm256_add_epi32(x8[43], x8[42]);
    3742    12599000 :         x9[44] = _mm256_add_epi32(x8[44], x8[45]);
    3743    12599000 :         x9[45] = _mm256_sub_epi32(x8[44], x8[45]);
    3744    12599000 :         x9[46] = _mm256_sub_epi32(x8[47], x8[46]);
    3745    12599000 :         x9[47] = _mm256_add_epi32(x8[47], x8[46]);
    3746    12599000 :         x9[48] = _mm256_add_epi32(x8[48], x8[49]);
    3747    12599000 :         x9[49] = _mm256_sub_epi32(x8[48], x8[49]);
    3748    12599000 :         x9[50] = _mm256_sub_epi32(x8[51], x8[50]);
    3749    12599000 :         x9[51] = _mm256_add_epi32(x8[51], x8[50]);
    3750    12599000 :         x9[52] = _mm256_add_epi32(x8[52], x8[53]);
    3751    12599000 :         x9[53] = _mm256_sub_epi32(x8[52], x8[53]);
    3752    12599000 :         x9[54] = _mm256_sub_epi32(x8[55], x8[54]);
    3753    12599000 :         x9[55] = _mm256_add_epi32(x8[55], x8[54]);
    3754    12599000 :         x9[56] = _mm256_add_epi32(x8[56], x8[57]);
    3755    12599000 :         x9[57] = _mm256_sub_epi32(x8[56], x8[57]);
    3756    12599000 :         x9[58] = _mm256_sub_epi32(x8[59], x8[58]);
    3757    12599000 :         x9[59] = _mm256_add_epi32(x8[59], x8[58]);
    3758    12599000 :         x9[60] = _mm256_add_epi32(x8[60], x8[61]);
    3759    12599000 :         x9[61] = _mm256_sub_epi32(x8[60], x8[61]);
    3760    12599000 :         x9[62] = _mm256_sub_epi32(x8[63], x8[62]);
    3761    12599000 :         x9[63] = _mm256_add_epi32(x8[63], x8[62]);
    3762             : 
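                     : /* Note: stages 10-11 only finish the odd half. The 32 even-indexed
                     :  * outputs (out[0], out[2], ..., out[62]) are final after stage 9 and
                     :  * are stored directly below; x9[32..63] need one more rotation
                     :  * (stage 10) before being written to the odd rows in stage 11. */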
    3763             :         // stage 10
    3764             :         __m256i x10[64];
    3765    12599000 :         out[0 * stride] = x9[0];
    3766    12599000 :         out[32 * stride] = x9[1];
    3767    12599000 :         out[16 * stride] = x9[2];
    3768    12599000 :         out[48 * stride] = x9[3];
    3769    12599000 :         out[8 * stride] = x9[4];
    3770    12599000 :         out[40 * stride] = x9[5];
    3771    12599000 :         out[24 * stride] = x9[6];
    3772    12599000 :         out[56 * stride] = x9[7];
    3773    12599000 :         out[4 * stride] = x9[8];
    3774    12599000 :         out[36 * stride] = x9[9];
    3775    12599000 :         out[20 * stride] = x9[10];
    3776    12599000 :         out[52 * stride] = x9[11];
    3777    12599000 :         out[12 * stride] = x9[12];
    3778    12599000 :         out[44 * stride] = x9[13];
    3779    12599000 :         out[28 * stride] = x9[14];
    3780    12599000 :         out[60 * stride] = x9[15];
    3781    12599000 :         out[2 * stride] = x9[16];
    3782    12599000 :         out[34 * stride] = x9[17];
    3783    12599000 :         out[18 * stride] = x9[18];
    3784    12599000 :         out[50 * stride] = x9[19];
    3785    12599000 :         out[10 * stride] = x9[20];
    3786    12599000 :         out[42 * stride] = x9[21];
    3787    12599000 :         out[26 * stride] = x9[22];
    3788    12599000 :         out[58 * stride] = x9[23];
    3789    12599000 :         out[6 * stride] = x9[24];
    3790    12599000 :         out[38 * stride] = x9[25];
    3791    12599000 :         out[22 * stride] = x9[26];
    3792    12599000 :         out[54 * stride] = x9[27];
    3793    12599000 :         out[14 * stride] = x9[28];
    3794    12599000 :         out[46 * stride] = x9[29];
    3795    12599000 :         out[30 * stride] = x9[30];
    3796    12599000 :         out[62 * stride] = x9[31];
    3797   125990000 :         btf_32_type1_avx2_new(cospi_p63, cospi_p01, x9[32], x9[63],
    3798             :             x10[32], x10[63], __rounding, cos_bit);
    3799   125990000 :         btf_32_type1_avx2_new(cospi_p31, cospi_p33, x9[33], x9[62],
    3800             :             x10[33], x10[62], __rounding, cos_bit);
    3801   125990000 :         btf_32_type1_avx2_new(cospi_p47, cospi_p17, x9[34], x9[61],
    3802             :             x10[34], x10[61], __rounding, cos_bit);
    3803   125990000 :         btf_32_type1_avx2_new(cospi_p15, cospi_p49, x9[35], x9[60],
    3804             :             x10[35], x10[60], __rounding, cos_bit);
    3805   125990000 :         btf_32_type1_avx2_new(cospi_p55, cospi_p09, x9[36], x9[59],
    3806             :             x10[36], x10[59], __rounding, cos_bit);
    3807   125990000 :         btf_32_type1_avx2_new(cospi_p23, cospi_p41, x9[37], x9[58],
    3808             :             x10[37], x10[58], __rounding, cos_bit);
    3809   125990000 :         btf_32_type1_avx2_new(cospi_p39, cospi_p25, x9[38], x9[57],
    3810             :             x10[38], x10[57], __rounding, cos_bit);
    3811   125990000 :         btf_32_type1_avx2_new(cospi_p07, cospi_p57, x9[39], x9[56],
    3812             :             x10[39], x10[56], __rounding, cos_bit);
    3813   125990000 :         btf_32_type1_avx2_new(cospi_p59, cospi_p05, x9[40], x9[55],
    3814             :             x10[40], x10[55], __rounding, cos_bit);
    3815   125990000 :         btf_32_type1_avx2_new(cospi_p27, cospi_p37, x9[41], x9[54],
    3816             :             x10[41], x10[54], __rounding, cos_bit);
    3817   125990000 :         btf_32_type1_avx2_new(cospi_p43, cospi_p21, x9[42], x9[53],
    3818             :             x10[42], x10[53], __rounding, cos_bit);
    3819   125990000 :         btf_32_type1_avx2_new(cospi_p11, cospi_p53, x9[43], x9[52],
    3820             :             x10[43], x10[52], __rounding, cos_bit);
    3821   125990000 :         btf_32_type1_avx2_new(cospi_p51, cospi_p13, x9[44], x9[51],
    3822             :             x10[44], x10[51], __rounding, cos_bit);
    3823   125990000 :         btf_32_type1_avx2_new(cospi_p19, cospi_p45, x9[45], x9[50],
    3824             :             x10[45], x10[50], __rounding, cos_bit);
    3825   125990000 :         btf_32_type1_avx2_new(cospi_p35, cospi_p29, x9[46], x9[49],
    3826             :             x10[46], x10[49], __rounding, cos_bit);
    3827   125990000 :         btf_32_type1_avx2_new(cospi_p03, cospi_p61, x9[47], x9[48],
    3828             :             x10[47], x10[48], __rounding, cos_bit);
    3829             : 
    3830             :         // stage 11
    3831    12599000 :         out[1 * stride] = x10[32];
    3832    12599000 :         out[3 * stride] = x10[48];
    3833    12599000 :         out[5 * stride] = x10[40];
    3834    12599000 :         out[7 * stride] = x10[56];
    3835    12599000 :         out[9 * stride] = x10[36];
    3836    12599000 :         out[11 * stride] = x10[52];
    3837    12599000 :         out[13 * stride] = x10[44];
    3838    12599000 :         out[15 * stride] = x10[60];
    3839    12599000 :         out[17 * stride] = x10[34];
    3840    12599000 :         out[19 * stride] = x10[50];
    3841    12599000 :         out[21 * stride] = x10[42];
    3842    12599000 :         out[23 * stride] = x10[58];
    3843    12599000 :         out[25 * stride] = x10[38];
    3844    12599000 :         out[27 * stride] = x10[54];
    3845    12599000 :         out[29 * stride] = x10[46];
    3846    12599000 :         out[31 * stride] = x10[62];
    3847    12599000 :         out[33 * stride] = x10[33];
    3848    12599000 :         out[35 * stride] = x10[49];
    3849    12599000 :         out[37 * stride] = x10[41];
    3850    12599000 :         out[39 * stride] = x10[57];
    3851    12599000 :         out[41 * stride] = x10[37];
    3852    12599000 :         out[43 * stride] = x10[53];
    3853    12599000 :         out[45 * stride] = x10[45];
    3854    12599000 :         out[47 * stride] = x10[61];
    3855    12599000 :         out[49 * stride] = x10[35];
    3856    12599000 :         out[51 * stride] = x10[51];
    3857    12599000 :         out[53 * stride] = x10[43];
    3858    12599000 :         out[55 * stride] = x10[59];
    3859    12599000 :         out[57 * stride] = x10[39];
    3860    12599000 :         out[59 * stride] = x10[55];
    3861    12599000 :         out[61 * stride] = x10[47];
    3862    12599000 :         out[63 * stride] = x10[63];
    3863             :     }
    3864     3344570 : }
    3865             : 
    3866             : typedef void(*TxfmFuncAVX2)(const __m256i *input, __m256i *output,
    3867             :     const int8_t cos_bit, const int8_t *stage_range);
    3868             : 
    3869     3245160 : static INLINE void fdct32x32_avx2(const __m256i *input, __m256i *output,
    3870             :     const int8_t cos_bit, const int8_t *stage_range) {
    3871     3245160 :     const int32_t txfm_size = 32;
    3872     3245160 :     const int32_t num_per_256 = 8;
    3873     3245160 :     int32_t col_num = txfm_size / num_per_256;
    3874             :     (void)stage_range;
    3875     3245160 :     av1_fdct32_new_avx2(input, output, cos_bit, txfm_size, col_num);
    3876     3245290 : }
    3877             : 
    3878      628650 : static INLINE void fdct64x64_avx2(const __m256i *input, __m256i *output,
    3879             :     const int8_t cos_bit) {
    3880      628650 :     const int32_t txfm_size = 64;
    3881      628650 :     const int32_t num_per_256 = 8;
    3882      628650 :     int32_t col_num = txfm_size / num_per_256;
    3883      628650 :     av1_fdct64_new_avx2(input, output, cos_bit, txfm_size, col_num);
    3884      628655 : }
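                     : /* Note: both wrappers carry 8 column lanes per __m256i, so
                     :  * col_num = txfm_size / 8 is the number of 256-bit vectors that make
                     :  * up one row of the working buffer. */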
    3885             : 
    3886     1827500 : static INLINE void fidtx4x8_row_avx2(__m256i *input, __m256i *output, int32_t bit, int32_t col_num) {
    3887             :     (void)bit;
    3888             :     __m256i in[4];
    3889             :     __m256i out[4];
    3890     1827500 :     __m256i fact = _mm256_set1_epi32(NewSqrt2);
    3891     1827500 :     __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
    3892             :     __m256i a_low;
    3893             :     __m256i v[4];
    3894             : 
    3895     1827500 :     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
    3896     1827500 :     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
    3897     1827500 :     in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
    3898     1827500 :     in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
    3899             : 
    3900     9137420 :     for (int32_t i = 0; i < 4; i++) {
    3901    14619800 :         a_low = _mm256_mullo_epi32(in[i * col_num], fact);
    3902     7309920 :         a_low = _mm256_add_epi32(a_low, offset);
    3903    14619800 :         out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits);
    3904             :     }
    3905             : 
    3906             :     // Transpose for 4x4
    3907     1827500 :     v[0] = _mm256_unpacklo_epi32(out[0], out[1]);
    3908     1827500 :     v[1] = _mm256_unpackhi_epi32(out[0], out[1]);
    3909     1827500 :     v[2] = _mm256_unpacklo_epi32(out[2], out[3]);
    3910     1827500 :     v[3] = _mm256_unpackhi_epi32(out[2], out[3]);
    3911             : 
    3912     1827500 :     out[0] = _mm256_unpacklo_epi64(v[0], v[2]);
    3913     1827500 :     out[1] = _mm256_unpackhi_epi64(v[0], v[2]);
    3914     1827500 :     out[2] = _mm256_unpacklo_epi64(v[1], v[3]);
    3915     1827500 :     out[3] = _mm256_unpackhi_epi64(v[1], v[3]);
    3916             : 
    3917     1827500 :     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
    3918     1827500 :     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
    3919     1827500 :     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
    3920     1827500 :     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
    3921     1827500 : }
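                     : /* Note: this and the _col variant below are the 4-point identity
                     :  * transform, whose gain is sqrt(2). NewSqrt2 = 5793 = round(sqrt(2) *
                     :  * 2^12), so per 32-bit lane the loop computes, in scalar form:
                     :  *   out = (in * 5793 + (1 << 11)) >> 12;  // in * sqrt(2), rounded
                     :  */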
    3922             : 
    3923     1734870 : static INLINE void fidtx4x8_col_avx2(__m256i *in, __m256i *output, int32_t bit, int32_t col_num) {
    3924             :     (void)bit;
    3925             :     __m256i out[4];
    3926     1734870 :     __m256i fact = _mm256_set1_epi32(NewSqrt2);
    3927     1734870 :     __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
    3928             :     __m256i a_low;
    3929             :     __m256i v[4];
    3930             : 
    3931     8674210 :     for (int32_t i = 0; i < 4; i++) {
    3932    13878700 :         a_low = _mm256_mullo_epi32(in[i * col_num], fact);
    3933     6939340 :         a_low = _mm256_add_epi32(a_low, offset);
    3934    13878700 :         out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits);
    3935             :     }
    3936             : 
    3937             :     // Transpose for 4x4
    3938     1734870 :     v[0] = _mm256_unpacklo_epi32(out[0], out[1]);
    3939     1734870 :     v[1] = _mm256_unpackhi_epi32(out[0], out[1]);
    3940     1734870 :     v[2] = _mm256_unpacklo_epi32(out[2], out[3]);
    3941     1734870 :     v[3] = _mm256_unpackhi_epi32(out[2], out[3]);
    3942             : 
    3943     1734870 :     out[0] = _mm256_unpacklo_epi64(v[0], v[2]);
    3944     1734870 :     out[1] = _mm256_unpackhi_epi64(v[0], v[2]);
    3945     1734870 :     out[2] = _mm256_unpacklo_epi64(v[1], v[3]);
    3946     1734870 :     out[3] = _mm256_unpackhi_epi64(v[1], v[3]);
    3947             : 
    3948     1734870 :     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
    3949     1734870 :     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
    3950     1734870 :     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
    3951     1734870 :     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
    3952     1734870 : }
    3953             : 
    3954     1038570 : static INLINE void fidtx8x4_avx2(__m256i *in, __m256i *out, int32_t bit) {
    3955             :     (void)bit;
    3956             : 
    3957     1038570 :     out[0] = _mm256_add_epi32(in[0], in[0]);
    3958     1038570 :     out[1] = _mm256_add_epi32(in[1], in[1]);
    3959     1038570 :     out[2] = _mm256_add_epi32(in[2], in[2]);
    3960     1038570 :     out[3] = _mm256_add_epi32(in[3], in[3]);
    3961     1038570 : }
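                     : /* Note: the 8-point identity gain is exactly 2, so no fixed-point
                     :  * multiply is needed; in + in doubles each lane. */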
    3962             : 
    3963     1249020 : void av1_idtx32_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
    3964             :     const int32_t col_num) {
    3965             :     (void)cos_bit;
    3966    41214400 :     for (int32_t i = 0; i < 32; i++)
    3967    79930700 :         output[i * col_num] = _mm256_slli_epi32(input[i * col_num], 2);
    3968     1249020 : }
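                     : /* Note: the 32-point identity gain is exactly 4, hence the left shift
                     :  * by 2. In general the N-point identity scales by sqrt(N / 2):
                     :  * sqrt(2), 2, 2*sqrt(2), 4, 4*sqrt(2) for N = 4, 8, 16, 32, 64. */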
    3969             : 
    3970      182030 : static void fidtx32x32_avx2(const __m256i *input, __m256i *output,
    3971             :     const int8_t cos_bit, const int8_t *stage_range) {
    3972             :     (void)stage_range;
    3973             : 
    3974      910129 :     for (int32_t i = 0; i < 4; i++)
    3975      728099 :         av1_idtx32_new_avx2(&input[i * 32], &output[i * 32], cos_bit, 1);
    3976      182030 : }
    3977             : 
    3978      701042 : static void fidtx32x8_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
    3979             :     (void)bit;
    3980             :     (void)col_num;
    3981      701042 :     out[4 * 0] = _mm256_slli_epi32(in[4 * 0], 1);
    3982      701042 :     out[4 * 1] = _mm256_slli_epi32(in[4 * 1], 1);
    3983      701042 :     out[4 * 2] = _mm256_slli_epi32(in[4 * 2], 1);
    3984      701042 :     out[4 * 3] = _mm256_slli_epi32(in[4 * 3], 1);
    3985      701042 :     out[4 * 4] = _mm256_slli_epi32(in[4 * 4], 1);
    3986      701042 :     out[4 * 5] = _mm256_slli_epi32(in[4 * 5], 1);
    3987      701042 :     out[4 * 6] = _mm256_slli_epi32(in[4 * 6], 1);
    3988      701042 :     out[4 * 7] = _mm256_slli_epi32(in[4 * 7], 1);
    3989      701042 : }
    3990             : 
    3991           0 : static void fidtx64x64_avx2(const __m256i *input, __m256i *output) {
    3992           0 :     const int32_t bits = 12;       // NewSqrt2Bits = 12
    3993           0 :     const int32_t sqrt = 4 * 5793; // 4 * NewSqrt2
    3994           0 :     const int32_t col_num = 8;
    3995           0 :     const __m256i newsqrt = _mm256_set1_epi32(sqrt);
    3996           0 :     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
    3997             : 
    3998             :     __m256i temp;
    3999           0 :     int32_t num_iters = 64 * col_num;
    4000           0 :     for (int32_t i = 0; i < num_iters; i++) {
    4001           0 :         temp = _mm256_mullo_epi32(input[i], newsqrt);
    4002           0 :         temp = _mm256_add_epi32(temp, rounding);
    4003           0 :         output[i] = _mm256_srai_epi32(temp, bits);
    4004             :     }
    4005           0 : }
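                     : /* Note: the 64-point identity gain is 4 * sqrt(2), applied as a Q12
                     :  * multiply by 4 * NewSqrt2 with rounding. The zero hit counts show the
                     :  * IDTX 64x64 path was never exercised in this test run. */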
    4006             : 
    4007     3427250 : static INLINE TxfmFuncAVX2 fwd_txfm_type_to_func(TxfmType TxfmType) {
    4008     3427250 :     switch (TxfmType) {
    4009     3245250 :     case TXFM_TYPE_DCT32: return fdct32x32_avx2;
    4010      182027 :     case TXFM_TYPE_IDENTITY32: return fidtx32x32_avx2;
    4011           0 :     default: assert(0);
    4012             :     }
    4013             :     return NULL;
    4014             : }
    4015             : 
    4016     1713650 : static INLINE void load_buffer_32x32_avx2(const int16_t *input,
    4017             :     __m256i *output, int32_t stride) {
    4018             :     __m128i temp[4];
    4019             :     int32_t i;
    4020             : 
    4021    56527200 :     for (i = 0; i < 32; ++i) {
    4022    54813600 :         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
    4023    54813600 :         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * 8));
    4024    54813600 :         temp[2] = _mm_loadu_si128((const __m128i *)(input + 2 * 8));
    4025    54813600 :         temp[3] = _mm_loadu_si128((const __m128i *)(input + 3 * 8));
    4026             : 
    4027    54813600 :         output[0] = _mm256_cvtepi16_epi32(temp[0]);
    4028    54813600 :         output[1] = _mm256_cvtepi16_epi32(temp[1]);
    4029    54813600 :         output[2] = _mm256_cvtepi16_epi32(temp[2]);
    4030    54813600 :         output[3] = _mm256_cvtepi16_epi32(temp[3]);
    4031    54813600 :         input += stride;
    4032    54813600 :         output += 4;
    4033             :     }
    4034     1713650 : }
    4035             : 
    4036     1713640 : static INLINE void fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
    4037             :     const int32_t stride,
    4038             :     const Txfm2DFlipCfg *cfg,
    4039             :     int32_t *txfm_buf) {
    4040     1713640 :     assert(cfg->tx_size < TX_SIZES);
    4041     1713640 :     const int32_t txfm_size = tx_size_wide[cfg->tx_size];
    4042     1713640 :     const int8_t *shift = cfg->shift;
    4043     1713640 :     const int8_t *stage_range_col = cfg->stage_range_col;
    4044     1713640 :     const int8_t *stage_range_row = cfg->stage_range_row;
    4045     1713640 :     const int8_t cos_bit_col = cfg->cos_bit_col;
    4046     1713640 :     const int8_t cos_bit_row = cfg->cos_bit_row;
    4047     1713640 :     const TxfmFuncAVX2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
    4048     1713640 :     const TxfmFuncAVX2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
    4049             :     ASSERT(txfm_func_col);
    4050             :     ASSERT(txfm_func_row);
    4051     1713660 :     __m256i *buf_256 = (__m256i *)txfm_buf;
    4052     1713660 :     __m256i *out_256 = (__m256i *)output;
    4053     1713660 :     int32_t num_per_256 = 8;
    4054     1713660 :     int32_t txfm2d_size_256 = txfm_size * txfm_size / num_per_256;
    4055             : 
    4056     1713660 :     load_buffer_32x32_avx2(input, buf_256, stride);
    4057     1713720 :     av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256, -shift[0]);
    4058     1713690 :     txfm_func_col(out_256, buf_256, cos_bit_col, stage_range_col);
    4059     1713710 :     av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256, -shift[1]);
    4060     1713710 :     transpose_32_avx2(txfm_size, out_256, buf_256);
    4061     1713710 :     txfm_func_row(buf_256, out_256, cos_bit_row, stage_range_row);
    4062     1713700 :     av1_round_shift_array_32_avx2(out_256, buf_256, txfm2d_size_256, -shift[2]);
    4063     1713700 :     transpose_32_avx2(txfm_size, buf_256, out_256);
    4064     1713710 : }
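                     : /* Note: the helper above is the standard separable 2-D form: pre-scale
                     :  * by shift[0], column transform, round-shift by shift[1], transpose,
                     :  * row transform, round-shift by shift[2], transpose back. buf_256 and
                     :  * out_256 ping-pong so no extra scratch buffer is needed. */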
    4065             : 
    4066     1713660 : void eb_av1_fwd_txfm2d_32x32_avx2(int16_t *input, int32_t *output,
    4067             :     uint32_t stride, TxType tx_type, uint8_t  bd)
    4068             : {
    4069             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[1024]);
    4070             :     Txfm2DFlipCfg cfg;
    4071     1713660 :     Av1TransformConfig(tx_type, TX_32X32, &cfg);
    4072             :     (void)bd;
    4073     1713660 :     fwd_txfm2d_32x32_avx2(input, output, stride, &cfg, txfm_buf);
    4074     1713700 : }
    4075             : 
    4076      314330 : static INLINE void load_buffer_64x64_avx2(const int16_t *input,
    4077             :     int32_t stride, __m256i *output) {
    4078             :     __m128i x0, x1, x2, x3, x4, x5, x6, x7;
    4079             :     __m256i v0, v1, v2, v3, v4, v5, v6, v7;
    4080             :     int32_t i;
    4081             : 
    4082    20413100 :     for (i = 0; i < 64; ++i) {
    4083    20098800 :         x0 = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
    4084    20098800 :         x1 = _mm_loadu_si128((const __m128i *)(input + 1 * 8));
    4085    20098800 :         x2 = _mm_loadu_si128((const __m128i *)(input + 2 * 8));
    4086    20098800 :         x3 = _mm_loadu_si128((const __m128i *)(input + 3 * 8));
    4087    20098800 :         x4 = _mm_loadu_si128((const __m128i *)(input + 4 * 8));
    4088    20098800 :         x5 = _mm_loadu_si128((const __m128i *)(input + 5 * 8));
    4089    20098800 :         x6 = _mm_loadu_si128((const __m128i *)(input + 6 * 8));
    4090    40197500 :         x7 = _mm_loadu_si128((const __m128i *)(input + 7 * 8));
    4091             : 
    4092    20098800 :         v0 = _mm256_cvtepi16_epi32(x0);
    4093    20098800 :         v1 = _mm256_cvtepi16_epi32(x1);
    4094    20098800 :         v2 = _mm256_cvtepi16_epi32(x2);
    4095    20098800 :         v3 = _mm256_cvtepi16_epi32(x3);
    4096    20098800 :         v4 = _mm256_cvtepi16_epi32(x4);
    4097    20098800 :         v5 = _mm256_cvtepi16_epi32(x5);
    4098    20098800 :         v6 = _mm256_cvtepi16_epi32(x6);
    4099    20098800 :         v7 = _mm256_cvtepi16_epi32(x7);
    4100             : 
    4101             :         _mm256_storeu_si256(output + 0, v0);
    4102    20098800 :         _mm256_storeu_si256(output + 1, v1);
    4103    20098800 :         _mm256_storeu_si256(output + 2, v2);
    4104    20098800 :         _mm256_storeu_si256(output + 3, v3);
    4105    20098800 :         _mm256_storeu_si256(output + 4, v4);
    4106    20098800 :         _mm256_storeu_si256(output + 5, v5);
    4107    20098800 :         _mm256_storeu_si256(output + 6, v6);
    4108    20098800 :         _mm256_storeu_si256(output + 7, v7);
    4109             : 
    4110    20098800 :         input += stride;
    4111    20098800 :         output += 8;
    4112             :     }
    4113      314330 : }
    4114             : 
    4115      314332 : void eb_av1_fwd_txfm2d_64x64_avx2(int16_t *input, int32_t *output,
    4116             :     uint32_t stride, TxType tx_type, uint8_t  bd) {
    4117             :     (void)bd;
    4118             :     __m256i in[512];
    4119      314332 :     __m256i *out = (__m256i *)output;
    4120      314332 :     const int32_t txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
    4121      314332 :     const int32_t txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
    4122      314332 :     const int8_t *shift = fwd_txfm_shift_ls[TX_64X64];
    4123             : 
    4124      314332 :     switch (tx_type) {
    4125           0 :     case IDTX:
    4126           0 :         load_buffer_64x64_avx2(input, stride, out);
    4127           0 :         fidtx64x64_avx2(out, in);
    4128           0 :         av1_round_shift_array_32_avx2(in, out, 512, -shift[1]);
    4129           0 :         transpose_8nx8n(out, in, 64, 64);
    4130             : 
    4131             :         /*row wise transform*/
    4132           0 :         fidtx64x64_avx2(in, out);
    4133           0 :         av1_round_shift_array_32_avx2(out, in, 512, -shift[2]);
    4134           0 :         transpose_8nx8n(in, out, 64, 64);
    4135           0 :         break;
    4136      314330 :     case DCT_DCT:
    4137      314330 :         load_buffer_64x64_avx2(input, stride, out);
    4138      314333 :         fdct64x64_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx]);
    4139      314330 :         av1_round_shift_array_32_avx2(in, out, 512, -shift[1]);
    4140      314330 :         transpose_8nx8n(out, in, 64, 64);
    4141             : 
    4142             :         /*row wise transform*/
    4143      314333 :         fdct64x64_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
    4144      314332 :         av1_round_shift_array_32_avx2(out, in, 512, -shift[2]);
    4145      314336 :         transpose_8nx8n(in, out, 64, 64);
    4146      314333 :         break;
    4147           2 :     default: assert(0);
    4148             :     }
    4149      314333 : }
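                     : /* Note: same separable scheme as the 32x32 path, except the loader
                     :  * applies no pre-scale (the TX_64X64 shift list starts at 0) and the
                     :  * in[512] scratch array occupies 16 KB of stack. */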
    4150             : 
    4151    90076600 : static INLINE void load_buffer_32_avx2(const int16_t *input, __m256i *in,
    4152             :     int32_t stride, int32_t flipud, int32_t fliplr,
    4153             :     int32_t shift) {
    4154             :     __m128i temp[4];
    4155    90076600 :     if (!flipud) {
    4156    90077400 :         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
    4157    90077400 :         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
    4158    90077400 :         temp[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
    4159   180155000 :         temp[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
    4160             :     }
    4161             :     else {
    4162           0 :         temp[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
    4163           0 :         temp[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
    4164           0 :         temp[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
    4165           0 :         temp[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
    4166             :     }
    4167             : 
    4168    90076600 :     if (fliplr) {
    4169           0 :         temp[0] = mm_reverse_epi16(temp[0]);
    4170           0 :         temp[1] = mm_reverse_epi16(temp[1]);
    4171           0 :         temp[2] = mm_reverse_epi16(temp[2]);
    4172           0 :         temp[3] = mm_reverse_epi16(temp[3]);
    4173             :     }
    4174             : 
    4175    90074900 :     in[0] = _mm256_cvtepi16_epi32(temp[0]);
    4176    90074900 :     in[1] = _mm256_cvtepi16_epi32(temp[1]);
    4177    90074900 :     in[2] = _mm256_cvtepi16_epi32(temp[2]);
    4178    90074900 :     in[3] = _mm256_cvtepi16_epi32(temp[3]);
    4179             : 
    4180    90074900 :     in[0] = _mm256_slli_epi32(in[0], shift);
    4181    90074900 :     in[1] = _mm256_slli_epi32(in[1], shift);
    4182    90074900 :     in[2] = _mm256_slli_epi32(in[2], shift);
    4183    90074900 :     in[3] = _mm256_slli_epi32(in[3], shift);
    4184    90074900 : }
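                     : /* Note: flipud/fliplr implement the FLIPADST input mirroring; the zero
                     :  * hit counts on both flip branches show that only non-flipped transform
                     :  * types reached this loader in this test run. */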
    4185             : 
    4186   105216000 : static INLINE void load_buffer_16_avx2(const int16_t *input, __m256i *in,
    4187             :     int32_t stride, int32_t flipud, int32_t fliplr,
    4188             :     int32_t shift) {
    4189             :     __m128i temp[2];
    4190   105216000 :     if (!flipud) {
    4191   105221000 :         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
    4192   210442000 :         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
    4193             :     }
    4194             :     else {
    4195           0 :         temp[0] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
    4196           0 :         temp[1] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
    4197             :     }
    4198             : 
    4199   105216000 :     if (fliplr) {
    4200           0 :         temp[0] = mm_reverse_epi16(temp[0]);
    4201           0 :         temp[1] = mm_reverse_epi16(temp[1]);
    4202             :     }
    4203             : 
    4204   105226000 :     in[0] = _mm256_cvtepi16_epi32(temp[0]);
    4205   105226000 :     in[1] = _mm256_cvtepi16_epi32(temp[1]);
    4206             : 
    4207   105226000 :     in[0] = _mm256_slli_epi32(in[0], shift);
    4208   105226000 :     in[1] = _mm256_slli_epi32(in[1], shift);
    4209   105226000 : }
    4210             : 
    4211     3177980 : static INLINE void load_buffer_32x8n(const int16_t *input, __m256i *out,
    4212             :     int32_t stride, int32_t flipud, int32_t fliplr,
    4213             :     int32_t shift, const int32_t height) {
    4214     3177980 :     const int16_t *in = input;
    4215     3177980 :     __m256i *output = out;
    4216    60453900 :     for (int32_t col = 0; col < height; col++) {
    4217    57276200 :         in = input + col * stride;
    4218    57276200 :         output = out + col * 4;
    4219    57276200 :         load_buffer_32_avx2(in, output, 8, flipud, fliplr, shift);
    4220             :     }
    4221     3177780 : }
    4222             : 
    4223    13900600 : static INLINE void load_buffer_8x16(const int16_t *input, __m256i *out,
    4224             :     int32_t stride, int32_t flipud, int32_t fliplr,
    4225             :     int32_t shift) {
    4226    13900600 :     const int16_t *topL = input;
    4227    13900600 :     const int16_t *botL = input + 8 * stride;
    4228             : 
    4229             :     const int16_t *tmp;
    4230             : 
    4231    13900600 :     if (flipud) {
    4232      590827 :         tmp = topL;
    4233      590827 :         topL = botL;
    4234      590827 :         botL = tmp;
    4235             :     }
    4236             : 
    4237    13900600 :     load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
    4238    13903100 :     load_buffer_8x8(botL, out + 8, stride, flipud, fliplr, shift);
    4239    13903900 : }
    4240             : 
    4241    15041400 : static INLINE void col_txfm_8x4_rounding(__m256i *in, int32_t shift) {
    4242    15041400 :     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
    4243             : 
    4244    15041400 :     in[0] = _mm256_add_epi32(in[0], rounding);
    4245    15041400 :     in[1] = _mm256_add_epi32(in[1], rounding);
    4246    15041400 :     in[2] = _mm256_add_epi32(in[2], rounding);
    4247    15041400 :     in[3] = _mm256_add_epi32(in[3], rounding);
    4248             : 
    4249    15041400 :     in[0] = _mm256_srai_epi32(in[0], shift);
    4250    15041400 :     in[1] = _mm256_srai_epi32(in[1], shift);
    4251    15041400 :     in[2] = _mm256_srai_epi32(in[2], shift);
    4252    15041400 :     in[3] = _mm256_srai_epi32(in[3], shift);
    4253    15041400 : }
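                     : /* Note: this is the usual round-half-up right shift, per 32-bit lane:
                     :  *   out = (in + (1 << (shift - 1))) >> shift;
                     :  */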
    4254             : 
    4255     8829060 : static INLINE void col_txfm_8x16_rounding(__m256i *in, int32_t shift) {
    4256     8829060 :     col_txfm_8x8_rounding(&in[0], shift);
    4257     8829170 :     col_txfm_8x8_rounding(&in[8], shift);
    4258     8829210 : }
    4259             : 
    4260    14563300 : static INLINE void write_buffer_16x8_avx2(const __m256i *res, int32_t *output,
    4261             :     const int32_t stride) {
    4262    14563300 :     _mm256_storeu_si256((__m256i *)(output), res[0]);
    4263    14563300 :     _mm256_storeu_si256((__m256i *)(output + stride), res[1]);
    4264    14563300 :     _mm256_storeu_si256((__m256i *)(output + (stride * 2)), res[2]);
    4265    14563300 :     _mm256_storeu_si256((__m256i *)(output + (stride * 3)), res[3]);
    4266    14563300 :     _mm256_storeu_si256((__m256i *)(output + (stride * 4)), res[4]);
    4267    14563300 :     _mm256_storeu_si256((__m256i *)(output + (stride * 5)), res[5]);
    4268    14563300 :     _mm256_storeu_si256((__m256i *)(output + (stride * 6)), res[6]);
    4269    14563300 :     _mm256_storeu_si256((__m256i *)(output + (stride * 7)), res[7]);
    4270    14563300 : }
    4271             : 
    4272      557018 : void eb_av1_fwd_txfm2d_32x64_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t  bd)
    4273             : {
    4274             :     (void)tx_type;
    4275             :     __m256i in[256];
    4276      557018 :     __m256i *outcoef256 = (__m256i *)output;
    4277      557018 :     const int8_t *shift = fwd_txfm_shift_ls[TX_32X64];
    4278      557018 :     const int32_t txw_idx = get_txw_idx(TX_32X64);
    4279      557016 :     const int32_t txh_idx = get_txh_idx(TX_32X64);
    4280      557018 :     const int32_t txfm_size_col = tx_size_wide[TX_32X64];
    4281      557018 :     const int32_t txfm_size_row = tx_size_high[TX_32X64];
    4282      557018 :     int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4283      557018 :     int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4284      557018 :     const int32_t num_row = txfm_size_row >> 3;
    4285      557018 :     const int32_t num_col = txfm_size_col >> 3;
    4286             : 
    4287             :     // column transform
    4288      557018 :     load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
    4289      557019 :     av1_fdct64_new_avx2(in, in, bitcol, txfm_size_col, num_col);
    4290             : 
    4291     5012880 :     for (int32_t i = 0; i < num_row; i++)
    4292     4455860 :         col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]);
    4293      557018 :     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
    4294             : 
    4295             :     // row transform
    4296      557016 :     av1_fdct32_new_avx2(outcoef256, in, bitrow, txfm_size_row, num_row);
    4297      557021 :     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
    4298      557019 :     av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 256, -shift[2],
    4299             :         NewSqrt2);
    4300             :     (void)bd;
    4301      557015 : }
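                     : /* Note: a 2:1 rectangle leaves the separable transform gain off by a
                     :  * factor of sqrt(2), so av1_round_shift_rect_array_32_avx2 folds a Q12
                     :  * multiply by NewSqrt2 into the final -shift[2] rounding stage instead
                     :  * of using a plain shift. */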
    4302             : 
    4303      513578 : void eb_av1_fwd_txfm2d_64x32_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t  bd)
    4304             : {
    4305             :     (void)tx_type;
    4306             :     __m256i in[256];
    4307      513578 :     __m256i *outcoef256 = (__m256i *)output;
    4308      513578 :     const int8_t *shift = fwd_txfm_shift_ls[TX_64X32];
    4309      513578 :     const int32_t txw_idx = get_txw_idx(TX_64X32);
    4310      513577 :     const int32_t txh_idx = get_txh_idx(TX_64X32);
    4311      513576 :     const int32_t txfm_size_col = tx_size_wide[TX_64X32];
    4312      513576 :     const int32_t txfm_size_row = tx_size_high[TX_64X32];
    4313      513576 :     int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4314      513576 :     int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4315      513576 :     const int32_t num_row = txfm_size_row >> 3;
    4316      513576 :     const int32_t num_col = txfm_size_col >> 3;
    4317             : 
    4318             :     // column transform
    4319    16943300 :     for (int32_t i = 0; i < 32; i++) {
    4320    16429700 :         load_buffer_32_avx2(input + 0 + i * stride, in + 0 + i * 8, 8, 0, 0, shift[0]);
    4321    16429700 :         load_buffer_32_avx2(input + 32 + i * stride, in + 4 + i * 8, 8, 0, 0, shift[0]);
    4322             :     }
    4323             : 
    4324      513592 :     av1_fdct32_new_avx2(in, in, bitcol, txfm_size_col, num_col);
    4325             : 
    4326     4622060 :     for (int32_t i = 0; i < num_col; i++)
    4327     4108420 :         col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
    4328      513641 :     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
    4329             : 
    4330             :     // row transform
    4331      513592 :     av1_fdct64_new_avx2(outcoef256, in, bitrow, txfm_size_row, num_row);
    4332      513583 :     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
    4333      513591 :     av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 256,
    4334      513591 :         -shift[2], NewSqrt2);
    4335             :     (void)bd;
    4336      513589 : }
    4337             : 
    4338      855371 : void eb_av1_fwd_txfm2d_16x64_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t  bd)
    4339             : {
    4340             :     __m256i in[128];
    4341      855371 :     __m256i *outcoeff256 = (__m256i *)output;
    4342      855371 :     const int8_t *shift = fwd_txfm_shift_ls[TX_16X64];
    4343      855371 :     const int32_t txw_idx = get_txw_idx(TX_16X64);
    4344      855363 :     const int32_t txh_idx = get_txh_idx(TX_16X64);
    4345      855367 :     const int32_t txfm_size_col = tx_size_wide[TX_16X64];
    4346      855367 :     const int32_t txfm_size_row = tx_size_high[TX_16X64];
    4347      855367 :     int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4348      855367 :     int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4349             :     int32_t ud_flip, lr_flip;
    4350      855367 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    4351      855370 :     const int32_t num_row = txfm_size_row >> 3;
    4352      855370 :     const int32_t num_col = txfm_size_col >> 3;
    4353             :     // col transform
    4354    28214300 :     for (int32_t i = 0; i < txfm_size_row; i += num_col) {
    4355    27358900 :         load_buffer_16_avx2(input + (i + 0) * stride, in + (i + 0) * num_col, 8,
    4356    27358900 :             ud_flip, lr_flip, shift[0]);
    4357    27359200 :         load_buffer_16_avx2(input + (i + 1) * stride, in + (i + 1) * num_col, 8,
    4358    27359200 :             ud_flip, lr_flip, shift[0]);
    4359             :     }
    4360             : 
    4361      855375 :     av1_fdct64_new_avx2(in, outcoeff256, bitcol, txfm_size_col, num_col);
    4362             : 
    4363      855374 :     col_txfm_16x16_rounding(outcoeff256, -shift[1]);
    4364      855379 :     col_txfm_16x16_rounding(outcoeff256 + 32, -shift[1]);
    4365      855383 :     col_txfm_16x16_rounding(outcoeff256 + 64, -shift[1]);
    4366      855378 :     col_txfm_16x16_rounding(outcoeff256 + 96, -shift[1]);
    4367      855379 :     transpose_8nx8n(outcoeff256, in, txfm_size_col, txfm_size_row);
    4368             :     // row transform
    4369      855380 :     fdct16x16_avx2(in, in, bitrow, num_row);
    4370      855375 :     transpose_8nx8n(in, outcoeff256, txfm_size_row, txfm_size_col);
    4371             :     (void)bd;
    4372      855376 : }
    4373             : 
    4374      790600 : void eb_av1_fwd_txfm2d_64x16_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t  bd)
    4375             : {
    4376             :     __m256i in[128];
    4377      790600 :     __m256i *outcoeff256 = (__m256i *)output;
    4378      790600 :     const int8_t *shift = fwd_txfm_shift_ls[TX_64X16];
    4379      790600 :     const int32_t txw_idx = get_txw_idx(TX_64X16);
    4380      790597 :     const int32_t txh_idx = get_txh_idx(TX_64X16);
    4381      790600 :     const int32_t txfm_size_col = tx_size_wide[TX_64X16];
    4382      790600 :     const int32_t txfm_size_row = tx_size_high[TX_64X16];
    4383      790600 :     int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4384      790600 :     int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4385             :     int32_t ud_flip, lr_flip;
    4386      790600 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    4387      790599 :     const int32_t num_row = txfm_size_row >> 3;
    4388      790599 :     const int32_t num_col = txfm_size_col >> 3;
    4389             :     // col transform
    4390    13437600 :     for (int32_t i = 0; i < txfm_size_row; i++) {
    4391    12647000 :         load_buffer_16_avx2(input + 0 + i * stride, in + 0 + i * 8, 8,
    4392    12647000 :             ud_flip, lr_flip, shift[0]);
    4393    12647200 :         load_buffer_16_avx2(input + 16 + i * stride, in + 2 + i * 8, 8,
    4394    12647200 :             ud_flip, lr_flip, shift[0]);
    4395    12647100 :         load_buffer_16_avx2(input + 32 + i * stride, in + 4 + i * 8, 8,
    4396    12647100 :             ud_flip, lr_flip, shift[0]);
    4397    12647100 :         load_buffer_16_avx2(input + 48 + i * stride, in + 6 + i * 8, 8,
    4398    12647100 :             ud_flip, lr_flip, shift[0]);
    4399             :     }
    4400             : 
    4401      790619 :     fdct16x16_avx2(in, outcoeff256, bitcol, num_col);
    4402      790614 :     col_txfm_16x16_rounding(outcoeff256, -shift[1]);
    4403      790615 :     col_txfm_16x16_rounding(outcoeff256 + 32, -shift[1]);
    4404      790617 :     col_txfm_16x16_rounding(outcoeff256 + 64, -shift[1]);
    4405      790616 :     col_txfm_16x16_rounding(outcoeff256 + 96, -shift[1]);
    4406      790612 :     transpose_8nx8n(outcoeff256, in, txfm_size_col, txfm_size_row);
    4407             :     // row transform
    4408      790620 :     av1_fdct64_new_avx2(in, in, bitrow, txfm_size_row, num_row);
    4409      790607 :     transpose_8nx8n(in, outcoeff256, txfm_size_row, txfm_size_col);
    4410             :     (void)bd;
    4411      790611 : }
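Note: num_col and num_row above are vector counts rather than sample counts.
Each __m256i packs eight 32-bit coefficients, so a 64-wide row spans
num_col = 64 >> 3 = 8 vectors; that is why the load loop steps its destination
by i * 8 and fills each row with four 16-sample loads (4 x 2 vectors).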
    4412             : 
    4413             : static const fwd_transform_1d_avx2 col_fwdtxfm_8x32_arr[TX_TYPES] = {
    4414             :     av1_fdct32_new_line_wraper_avx2, // DCT_DCT
    4415             :     NULL,                   // ADST_DCT
    4416             :     NULL,                   // DCT_ADST
    4417             :     NULL,                   // ADST_ADST
    4418             :     NULL,                   // FLIPADST_DCT
    4419             :     NULL,                   // DCT_FLIPADST
    4420             :     NULL,                   // FLIPADST_FLIPADST
    4421             :     NULL,                   // ADST_FLIPADST
    4422             :     NULL,                   // FLIPADST_ADST
    4423             :     av1_idtx32_new_avx2,    // IDTX
    4424             :     NULL,                   // V_DCT
    4425             :     NULL,                   // H_DCT
    4426             :     NULL,                   // V_ADST
    4427             :     NULL,                   // H_ADST
    4428             :     NULL,                   // V_FLIPADST
    4429             :     NULL                    // H_FLIPADST
    4430             : };
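Note: the *_arr tables in this block map a TxType to a 1-D kernel; NULL
entries mark types the surrounding 2-D wrappers are never called with (see the
"call this function only for ..." comments). A defensive caller could make
that contract explicit; a hypothetical sketch using the already included
<assert.h>:

    const fwd_transform_1d_avx2 col_txfm = col_fwdtxfm_8x32_arr[tx_type];
    assert(col_txfm != NULL); /* wrapper contract: DCT_DCT or IDTX only */
    col_txfm(in, in, bitcol, num_col);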
    4431             : 
    4432             : static const fwd_transform_1d_avx2 row_fwdtxfm_8x32_arr[TX_TYPES] = {
    4433             :     fdct16x16_avx2,    // DCT_DCT
    4434             :     NULL,              // ADST_DCT
    4435             :     NULL,              // DCT_ADST
    4436             :     NULL,              // ADST_ADST
    4437             :     NULL,              // FLIPADST_DCT
    4438             :     NULL,              // DCT_FLIPADST
    4439             :     NULL,              // FLIPADST_FLIPADST
    4440             :     NULL,              // ADST_FLIPADST
    4441             :     NULL,              // FLIPADST_ADST
    4442             :     fidtx16x16_avx2,   // IDTX
    4443             :     NULL,              // V_DCT
    4444             :     NULL,              // H_DCT
    4445             :     NULL,              // V_ADST
    4446             :     NULL,              // H_ADST
    4447             :     NULL,              // V_FLIPADST
    4448             :     NULL               // H_FLIPADST
    4449             : };
    4450             : 
    4451             : static const fwd_transform_1d_avx2 row_fwdtxfm_32x8_arr[TX_TYPES] = {
    4452             :     fdct8x8_avx2,     // DCT_DCT
    4453             :     NULL,             // ADST_DCT
    4454             :     NULL,             // DCT_ADST
    4455             :     NULL,             // ADST_ADST
    4456             :     NULL,             // FLIPADST_DCT
    4457             :     NULL,             // DCT_FLIPADST
    4458             :     NULL,             // FLIPADST_FLIPADST
    4459             :     NULL,             // ADST_FLIPADST
    4460             :     NULL,             // FLIPADST_ADST
    4461             :     fidtx32x8_avx2,   // IDTX
    4462             :     NULL,             // V_DCT
    4463             :     NULL,             // H_DCT
    4464             :     NULL,             // V_ADST
    4465             :     NULL,             // H_ADST
    4466             :     NULL,             // V_FLIPADST
    4467             :     NULL,             // H_FLIPADST
    4468             : };
    4469             : 
    4470             : static const fwd_transform_1d_avx2 col_fwdtxfm_8x16_arr[TX_TYPES] = {
    4471             :     fdct16x16_avx2,   // DCT_DCT
    4472             :     fadst16x16_avx2,  // ADST_DCT
    4473             :     fdct16x16_avx2,   // DCT_ADST
    4474             :     fadst16x16_avx2,  // ADST_ADST
    4475             :     fadst16x16_avx2,  // FLIPADST_DCT
    4476             :     fdct16x16_avx2,   // DCT_FLIPADST
    4477             :     fadst16x16_avx2,  // FLIPADST_FLIPADST
    4478             :     fadst16x16_avx2,  // ADST_FLIPADST
    4479             :     fadst16x16_avx2,  // FLIPADST_ADST
    4480             :     fidtx16x16_avx2,  // IDTX
    4481             :     fdct16x16_avx2,   // V_DCT
    4482             :     fidtx16x16_avx2,  // H_DCT
    4483             :     fadst16x16_avx2,  // V_ADST
    4484             :     fidtx16x16_avx2,  // H_ADST
    4485             :     fadst16x16_avx2,  // V_FLIPADST
    4486             :     fidtx16x16_avx2   // H_FLIPADST
    4487             : };
    4488             : 
    4489             : static const fwd_transform_1d_avx2 row_fwdtxfm_8x8_arr[TX_TYPES] = {
    4490             :     fdct8x8_avx2,   // DCT_DCT
    4491             :     fdct8x8_avx2,   // ADST_DCT
    4492             :     fadst8x8_avx2,  // DCT_ADST
    4493             :     fadst8x8_avx2,  // ADST_ADST
    4494             :     fdct8x8_avx2,   // FLIPADST_DCT
    4495             :     fadst8x8_avx2,  // DCT_FLIPADST
    4496             :     fadst8x8_avx2,  // FLIPADST_FLIPADST
    4497             :     fadst8x8_avx2,  // ADST_FLIPADST
    4498             :     fadst8x8_avx2,  // FLIPADST_ADST
    4499             :     fidtx8x8_avx2,  // IDTX
    4500             :     fidtx8x8_avx2,  // V_DCT
    4501             :     fdct8x8_avx2,   // H_DCT
    4502             :     fidtx8x8_avx2,  // V_ADST
    4503             :     fadst8x8_avx2,  // H_ADST
    4504             :     fidtx8x8_avx2,  // V_FLIPADST
    4505             :     fadst8x8_avx2   // H_FLIPADST
    4506             : };
    4507             : 
    4508             : static const fwd_transform_1d_avx2 col_fwdtxfm_8x8_arr[TX_TYPES] = {
    4509             :     fdct8x8_avx2,   // DCT_DCT
    4510             :     fadst8x8_avx2,  // ADST_DCT
    4511             :     fdct8x8_avx2,   // DCT_ADST
    4512             :     fadst8x8_avx2,  // ADST_ADST
    4513             :     fadst8x8_avx2,  // FLIPADST_DCT
    4514             :     fdct8x8_avx2,   // DCT_FLIPADST
    4515             :     fadst8x8_avx2,  // FLIPADST_FLIPADST
    4516             :     fadst8x8_avx2,  // ADST_FLIPADST
    4517             :     fadst8x8_avx2,  // FLIPADST_ADST
    4518             :     fidtx8x8_avx2,  // IDTX
    4519             :     fdct8x8_avx2,   // V_DCT
    4520             :     fidtx8x8_avx2,  // H_DCT
    4521             :     fadst8x8_avx2,  // V_ADST
    4522             :     fidtx8x8_avx2,  // H_ADST
    4523             :     fadst8x8_avx2,  // V_FLIPADST
    4524             :     fidtx8x8_avx2   // H_FLIPADST
    4525             : };
    4526             : 
    4527             : static const fwd_transform_1d_avx2 row_fwdtxfm_8x16_arr[TX_TYPES] = {
    4528             :     fdct16x16_avx2,   // DCT_DCT
    4529             :     fdct16x16_avx2,   // ADST_DCT
    4530             :     fadst16x16_avx2,  // DCT_ADST
    4531             :     fadst16x16_avx2,  // ADST_ADST
    4532             :     fdct16x16_avx2,   // FLIPADST_DCT
    4533             :     fadst16x16_avx2,  // DCT_FLIPADST
    4534             :     fadst16x16_avx2,  // FLIPADST_FLIPADST
    4535             :     fadst16x16_avx2,  // ADST_FLIPADST
    4536             :     fadst16x16_avx2,  // FLIPADST_ADST
    4537             :     fidtx16x16_avx2,  // IDTX
    4538             :     fidtx16x16_avx2,  // V_DCT
    4539             :     fdct16x16_avx2,   // H_DCT
    4540             :     fidtx16x16_avx2,  // V_ADST
    4541             :     fadst16x16_avx2,  // H_ADST
    4542             :     fidtx16x16_avx2,  // V_FLIPADST
    4543             :     fadst16x16_avx2   // H_FLIPADST
    4544             : };
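Note: the col and row tables for the same shape are mirror images on the
V_/H_ entries. In col_fwdtxfm_8x16_arr, V_DCT selects fdct16x16_avx2 and
H_DCT selects fidtx16x16_avx2, while row_fwdtxfm_8x16_arr swaps them: a V_DCT
applies the DCT along columns and the identity along rows, and H_DCT does the
reverse.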
    4545             : 
    4546             : /* call this function only for DCT_DCT, IDTX */
    4547     2006840 : void eb_av1_fwd_txfm2d_16x32_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t  bd)
    4548             : {
    4549             :     __m256i in[64];
    4550     2006840 :     __m256i *outcoef256 = (__m256i *)output;
    4551     2006840 :     const int8_t *shift = fwd_txfm_shift_ls[TX_16X32];
    4552     2006840 :     const int32_t txw_idx = get_txw_idx(TX_16X32);
    4553     2006820 :     const int32_t txh_idx = get_txh_idx(TX_16X32);
    4554     2006850 :     const fwd_transform_1d_avx2 col_txfm = col_fwdtxfm_8x32_arr[tx_type];
    4555     2006850 :     const fwd_transform_1d_avx2 row_txfm = row_fwdtxfm_8x32_arr[tx_type];
    4556     2006850 :     int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4557     2006850 :     int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4558     2006850 :     const int32_t txfm_size_col = tx_size_wide[TX_16X32];
    4559     2006850 :     const int32_t txfm_size_row = tx_size_high[TX_16X32];
    4560     2006850 :     const int32_t num_row = txfm_size_row >> 3;
    4561     2006850 :     const int32_t num_col = txfm_size_col >> 3;
    4562             : 
    4563             :     // column transform
    4564     2006850 :     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
    4565     2006910 :     load_buffer_16x16(input + 16 * stride, in + 32, stride, 0, 0, shift[0]);
    4566             : 
    4567     6020630 :     for (int32_t i = 0; i < num_col; i++)
    4568     4013700 :         col_txfm((in + i), (in + i), bitcol, num_col);
    4569     2006930 :     col_txfm_16x16_rounding(&in[0], -shift[1]);
    4570     2006930 :     col_txfm_16x16_rounding(&in[32], -shift[1]);
    4571     2006940 :     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
    4572             : 
    4573             :     // row transform
    4574     2006950 :     row_txfm(outcoef256, in, bitrow, num_row);
    4575     2006910 :     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
    4576     2006940 :     av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 64, -shift[2],
    4577             :         NewSqrt2);
    4578             :     (void)bd;
    4579     2006880 : }
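Note: av1_round_shift_rect_array_32_avx2 folds the rectangular-size scale
correction into the final rounding. NewSqrt2 is the Q12 fixed-point constant
5793 ~= sqrt(2) * 2^12 (NewSqrt2Bits == 12), so up to rounding each
coefficient is multiplied by sqrt(2) and shifted back down, keeping the
overall 2-D scale a power of two for 2:1 aspect ratios. A scalar sketch of
the assumed per-coefficient semantics:

    /* rescale for 2:1 rectangular sizes, assuming shift[2] == 0 here */
    out[i] = (int32_t)(((int64_t)in[i] * NewSqrt2 + (1 << (NewSqrt2Bits - 1)))
                       >> NewSqrt2Bits);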
    4580             : 
    4581             : /* call this function only for IDTX */
    4582       87136 : void eb_av1_fwd_txfm2d_32x16_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t  bd)
    4583             : {
    4584             :     __m256i in[64];
    4585       87136 :     __m256i *outcoef256 = (__m256i *)output;
    4586       87136 :     const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
    4587       87136 :     const int32_t txw_idx = get_txw_idx(TX_32X16);
    4588       87135 :     const int32_t txh_idx = get_txh_idx(TX_32X16);
    4589       87135 :     const fwd_transform_1d_avx2 col_txfm = row_fwdtxfm_8x32_arr[tx_type];
    4590       87135 :     const fwd_transform_1d_avx2 row_txfm = col_fwdtxfm_8x32_arr[tx_type];
    4591       87135 :     int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4592       87135 :     int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4593       87135 :     const int32_t txfm_size_col = tx_size_wide[TX_32X16];
    4594       87135 :     const int32_t txfm_size_row = tx_size_high[TX_32X16];
    4595       87135 :     const int32_t num_row = txfm_size_row >> 3;
    4596       87135 :     const int32_t num_col = txfm_size_col >> 3;
    4597             : 
    4598             :     // column transform
    4599       87135 :     load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
    4600       87138 :     col_txfm(in, in, bitcol, num_col);
    4601       87139 :     col_txfm_16x16_rounding(&in[0], -shift[1]);
    4602       87139 :     col_txfm_16x16_rounding(&in[32], -shift[1]);
    4603       87139 :     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
    4604             : 
    4605             :     // row transform
    4606      261417 :     for (int32_t i = 0; i < num_row; i++)
    4607      174278 :         row_txfm((outcoef256 + i), (in + i), bitrow, num_row);
    4608       87139 :     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
    4609       87139 :     av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 64, -shift[2],
    4610             :         NewSqrt2);
    4611             :     (void)bd;
    4612       87138 : }
    4613             : 
    4614             : /* call this function only for DCT_DCT, IDTX */
    4615     2537690 : void eb_av1_fwd_txfm2d_8x32_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t  bd)
    4616             : {
    4617             :     __m256i in[32];
    4618     2537690 :     __m256i *outcoef256 = (__m256i *)output;
    4619     2537690 :     const int8_t *shift = fwd_txfm_shift_ls[TX_8X32];
    4620     2537690 :     const int32_t txw_idx = get_txw_idx(TX_8X32);
    4621     2537660 :     const int32_t txh_idx = get_txh_idx(TX_8X32);
    4622     2537710 :     const fwd_transform_1d_avx2 col_txfm = col_fwdtxfm_8x32_arr[tx_type];
    4623     2537710 :     const fwd_transform_1d_avx2 row_txfm = row_fwdtxfm_32x8_arr[tx_type];
    4624     2537710 :     int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4625     2537710 :     int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4626             : 
    4627     2537710 :     const int32_t txfm_size_col = tx_size_wide[TX_8X32];
    4628     2537710 :     const int32_t txfm_size_row = tx_size_high[TX_8X32];
    4629     2537710 :     const int32_t num_row = txfm_size_row >> 3;
    4630     2537710 :     const int32_t num_col = txfm_size_col >> 3;
    4631             : 
    4632             :     // column transform
    4633     2537710 :     load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
    4634     2537830 :     load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + 16,
    4635     2537830 :         stride, 0, 0, shift[0]);
    4636             : 
    4637     2537810 :     col_txfm(in, in, bitcol, num_col);
    4638     2537770 :     col_txfm_16x16_rounding(in, -shift[1]);
    4639     2537820 :     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
    4640             : 
    4641             :     // row transform
    4642    12688000 :     for (int32_t i = 0; i < num_row; i++)
    4643    10150200 :         row_txfm((outcoef256 + i), (in + i), bitrow, num_row);
    4644     2537840 :     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
    4645             :     (void)bd;
    4646     2537860 : }
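Note: after transpose_8nx8n the block is stored with its rows interleaved in
vector units, so the row-transform loop above launches the 8-point kernel
num_row times, each pass starting one vector later (outcoef256 + i) while the
kernel itself strides by num_row vectors between successive rows (its final
argument).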
    4647             : 
    4648             : /* call this function only for DCT_DCT, IDTX */
    4649     2533910 : void eb_av1_fwd_txfm2d_32x8_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t  bd)
    4650             : {
    4651             :     __m256i in[32];
    4652     2533910 :     __m256i *outcoef256 = (__m256i *)output;
    4653     2533910 :     const int8_t *shift = fwd_txfm_shift_ls[TX_32X8];
    4654     2533910 :     const int32_t txw_idx = get_txw_idx(TX_32X8);
    4655     2533860 :     const int32_t txh_idx = get_txh_idx(TX_32X8);
    4656     2533920 :     const fwd_transform_1d_avx2 col_txfm = row_fwdtxfm_32x8_arr[tx_type];
    4657     2533920 :     const fwd_transform_1d_avx2 row_txfm = col_fwdtxfm_8x32_arr[tx_type];
    4658     2533920 :     int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4659     2533920 :     int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4660             : 
    4661     2533920 :     const int32_t txfm_size_col = tx_size_wide[TX_32X8];
    4662     2533920 :     const int32_t txfm_size_row = tx_size_high[TX_32X8];
    4663     2533920 :     const int32_t num_row = txfm_size_row >> 3;
    4664     2533920 :     const int32_t num_col = txfm_size_col >> 3;
    4665             : 
    4666             :     // column transform
    4667     2533920 :     load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
    4668    12669100 :     for (int32_t i = 0; i < num_col; i++)
    4669    10135100 :         col_txfm((in + i), (in + i), bitcol, num_col);
    4670     2534070 :     col_txfm_16x16_rounding(&in[0], -shift[1]);
    4671     2534070 :     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
    4672             : 
    4673             :     // row transform
    4674     2534090 :     row_txfm(outcoef256, in, bitrow, num_row);
    4675             : 
    4676     2534040 :     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
    4677             :     (void)bd;
    4678     2534100 : }
    4679             : 
    4680             : /* call this function for all 16 transform types */
    4681     8828150 : void eb_av1_fwd_txfm2d_8x16_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t  bd)
    4682             : {
    4683             :     __m256i in[16], out[16];
    4684     8828150 :     const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
    4685     8828150 :     const int32_t txw_idx = get_txw_idx(TX_8X16);
    4686     8827910 :     const int32_t txh_idx = get_txh_idx(TX_8X16);
    4687     8828000 :     const fwd_transform_1d_avx2 col_txfm = col_fwdtxfm_8x16_arr[tx_type];
    4688     8828000 :     const fwd_transform_1d_avx2 row_txfm = row_fwdtxfm_8x8_arr[tx_type];
    4689     8828000 :     int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4690     8828000 :     int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4691             :     int32_t ud_flip, lr_flip;
    4692     8828000 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    4693     8828040 :     const int32_t txfm_size_col = tx_size_wide[TX_8X16];
    4694     8828040 :     const int32_t txfm_size_row = tx_size_high[TX_8X16];
    4695     8828040 :     const int32_t num_row = txfm_size_row >> 3;
    4696     8828040 :     const int32_t num_col = txfm_size_col >> 3;
    4697             : 
    4698     8828040 :     load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
    4699             :     // column transform
    4700     8829600 :     col_txfm(in, in, bitcol, num_col);
    4701     8829190 :     col_txfm_8x16_rounding(in, -shift[1]);
    4702     8829310 :     transpose_8x8_avx2(in, out);
    4703     8829390 :     transpose_8x8_avx2(in + 8, out + 8);
    4704             : 
    4705             :     // row transform
    4706    26485300 :     for (int32_t i = 0; i < num_row; i++) {
    4707    17655700 :         row_txfm(out + i * 8, out, bitrow, 1);
    4708    17656400 :         transpose_8x8_avx2(out, in);
    4709    17657100 :         av1_round_shift_rect_array_32_avx2(in, in, 8, -shift[2], NewSqrt2);
    4710    17656700 :         write_buffer_8x8(in, output + i * 64);
    4711             :     }
    4712             :     (void)bd;
    4713     8829630 : }
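Note: each loop iteration above finishes one 8x8 half of the block and writes
64 coefficients, so output + i * 64 is the i-th 8x8 block of the 8x16 result.
A hypothetical call, with buffer shapes inferred from the signature (stride in
samples; bd is unused by this kernel):

    int16_t residual[16 * 8];   /* 8 wide x 16 tall, stride 8 */
    int32_t coeff[8 * 16];      /* 128 output coefficients    */
    eb_av1_fwd_txfm2d_8x16_avx2(residual, coeff, 8 /*stride*/, DCT_DCT,
                                10 /*bd*/);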
    4714             : 
    4715             : /* call this function for all 16 transform types */
    4716     7281340 : void eb_av1_fwd_txfm2d_16x8_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type, uint8_t  bd)
    4717             : {
    4718     7281340 :     __m256i in[16], out[16] = {0};
    4719     7281340 :     const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
    4720     7281340 :     const int32_t txw_idx = get_txw_idx(TX_16X8);
    4721     7281320 :     const int32_t txh_idx = get_txh_idx(TX_16X8);
    4722     7281520 :     const fwd_transform_1d_avx2 col_txfm = col_fwdtxfm_8x8_arr[tx_type];
    4723     7281520 :     const fwd_transform_1d_avx2 row_txfm = row_fwdtxfm_8x16_arr[tx_type];
    4724     7281520 :     int8_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4725     7281520 :     int8_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4726             :     int32_t ud_flip, lr_flip;
    4727     7281520 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    4728     7281460 :     const int32_t txfm_size_col = tx_size_wide[TX_16X8];
    4729     7281460 :     const int32_t txfm_size_row = tx_size_high[TX_16X8];
    4730     7281460 :     const int32_t num_row = txfm_size_row >> 3;
    4731     7281460 :     const int32_t num_col = txfm_size_col >> 3;
    4732             : 
    4733             :     // column transform
    4734    21843500 :     for (int32_t i = 0; i < num_col; i++) {
    4735    14561100 :         load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
    4736    14562800 :         col_txfm(in, in, bitcol, 1);
    4737    14563300 :         col_txfm_8x8_rounding(in, -shift[1]);
    4738    14563200 :         transpose_8x8_avx2(in, out + i * 8);
    4739             :     }
    4740             : 
    4741             :     // row transform
    4742     7282450 :     if (lr_flip) {
    4743     8039470 :         for (int32_t i = 0; i < 16; i++)
    4744     7566550 :             in[16 - i - 1] = out[i];
    4745      472917 :         row_txfm(in, out, bitrow, num_row);
    4746             :     }
    4747             :     else
    4748     6809530 :         row_txfm(out, out, bitrow, num_row);
    4749             : 
    4750    21844100 :     for (int32_t i = 0; i < num_col; i++) {
    4751    14561700 :         transpose_8x8_avx2(out + i * 8, in);
    4752    14563100 :         av1_round_shift_rect_array_32_avx2(in, in, 8, -shift[2], NewSqrt2);
    4753    14562900 :         write_buffer_16x8_avx2(in, output + i * 8, 16);
    4754             :     }
    4755             :     (void)bd;
    4756     7282440 : }
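Note: the lr_flip branch above implements the horizontal flip by reversing
the sixteen transposed vectors (in[15 - i] = out[i]). After the column pass
and per-8x8 transpose, each vector holds one column of the 16-wide block, so
reversing vector order reverses the columns, which is exactly what a
horizontal FLIPADST requires before the row kernel runs.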
    4757             : 
    4758     7683900 : void eb_av1_fwd_txfm2d_4x8_avx2(int16_t *input, int32_t *output, uint32_t stride,
    4759             :     TxType tx_type, uint8_t  bd)
    4760             : {
    4761             :     __m256i in[4];
    4762             :     __m256i outcoeff256[4];
    4763             : 
    4764     7683900 :     const int8_t *shift = fwd_txfm_shift_ls[TX_4X8];
    4765     7683900 :     const int32_t txw_idx = get_txw_idx(TX_4X8);
    4766     7683730 :     const int32_t txh_idx = get_txh_idx(TX_4X8);
    4767     7685200 :     int32_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4768     7685200 :     int32_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4769             : 
    4770     7685200 :     switch (tx_type) {
    4771     4507820 :     case DCT_DCT:
    4772     4507820 :         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
    4773     4508390 :         fdct4x8_avx2(in, in, bitcol);
    4774     4508500 :         col_txfm_8x4_rounding(in, -shift[1]);
    4775     4508440 :         transpose_4x8_avx2(in, outcoeff256);
    4776     4508160 :         fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4777     4508310 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4778     4508280 :         write_buffer_4x8(outcoeff256, output);
    4779     4508180 :         break;
    4780      729726 :     case ADST_DCT:
    4781      729726 :         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
    4782      729749 :         fadst8x4_avx2(in, in, bitcol, 1);
    4783      729752 :         col_txfm_8x4_rounding(in, -shift[1]);
    4784      729751 :         transpose_4x8_avx2(in, outcoeff256);
    4785      729754 :         fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4786      729753 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4787      729747 :         write_buffer_4x8(outcoeff256, output);
    4788      729746 :         break;
    4789      746998 :     case DCT_ADST:
    4790      746998 :         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
    4791      747019 :         fdct4x8_avx2(in, in, bitcol);
    4792      747024 :         col_txfm_8x4_rounding(in, -shift[1]);
    4793      747024 :         transpose_4x8_avx2(in, outcoeff256);
    4794      747013 :         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4795      747024 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4796      747025 :         write_buffer_4x8(outcoeff256, output);
    4797      747023 :         break;
    4798      430724 :     case ADST_ADST:
    4799      430724 :         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
    4800      430735 :         fadst8x4_avx2(in, in, bitcol, 1);
    4801      430734 :         col_txfm_8x4_rounding(in, -shift[1]);
    4802      430731 :         transpose_4x8_avx2(in, outcoeff256);
    4803      430729 :         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4804      430728 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4805      430727 :         write_buffer_4x8(outcoeff256, output);
    4806      430726 :         break;
    4807       81813 :     case FLIPADST_DCT:
    4808       81813 :         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
    4809       81813 :         fadst8x4_avx2(in, in, bitcol, 1);
    4810       81813 :         col_txfm_8x4_rounding(in, -shift[1]);
    4811       81813 :         transpose_4x8_avx2(in, outcoeff256);
    4812       81813 :         fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4813       81813 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4814       81813 :         write_buffer_4x8(outcoeff256, output);
    4815       81813 :         break;
    4816       81707 :     case DCT_FLIPADST:
    4817       81707 :         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
    4818       81707 :         fdct4x8_avx2(in, in, bitcol);
    4819       81707 :         col_txfm_8x4_rounding(in, -shift[1]);
    4820       81707 :         transpose_4x8_avx2(in, outcoeff256);
    4821       81707 :         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4822       81707 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4823       81707 :         write_buffer_4x8(outcoeff256, output);
    4824       81707 :         break;
    4825       82687 :     case FLIPADST_FLIPADST:
    4826       82687 :         load_buffer_4x8_avx2(input, in, stride, 1, 1, shift[0]);
    4827       82687 :         fadst8x4_avx2(in, in, bitcol, 1);
    4828       82688 :         col_txfm_8x4_rounding(in, -shift[1]);
    4829       82688 :         transpose_4x8_avx2(in, outcoeff256);
    4830       82688 :         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4831       82687 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4832       82688 :         write_buffer_4x8(outcoeff256, output);
    4833       82688 :         break;
    4834       83243 :     case ADST_FLIPADST:
    4835       83243 :         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
    4836       83244 :         fadst8x4_avx2(in, in, bitcol, 1);
    4837       83244 :         col_txfm_8x4_rounding(in, -shift[1]);
    4838       83244 :         transpose_4x8_avx2(in, outcoeff256);
    4839       83244 :         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4840       83244 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4841       83243 :         write_buffer_4x8(outcoeff256, output);
    4842       83243 :         break;
    4843       84355 :     case FLIPADST_ADST:
    4844       84355 :         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
    4845       84356 :         fadst8x4_avx2(in, in, bitcol, 1);
    4846       84356 :         col_txfm_8x4_rounding(in, -shift[1]);
    4847       84356 :         transpose_4x8_avx2(in, outcoeff256);
    4848       84356 :         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4849       84356 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4850       84356 :         write_buffer_4x8(outcoeff256, output);
    4851       84356 :         break;
    4852      168663 :     case IDTX:
    4853      168663 :         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
    4854      168663 :         fidtx8x4_avx2(in, in, bitcol);
    4855      168663 :         col_txfm_8x4_rounding(in, -shift[1]);
    4856      168663 :         transpose_4x8_avx2(in, outcoeff256);
    4857      168663 :         fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4858      168665 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4859      168664 :         write_buffer_4x8(outcoeff256, output);
    4860      168664 :         break;
    4861      169658 :     case V_DCT:
    4862      169658 :         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
    4863      169661 :         fdct4x8_avx2(in, in, bitcol);
    4864      169660 :         col_txfm_8x4_rounding(in, -shift[1]);
    4865      169658 :         transpose_4x8_avx2(in, outcoeff256);
    4866      169658 :         fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4867      169659 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4868      169659 :         write_buffer_4x8(outcoeff256, output);
    4869      169659 :         break;
    4870      180298 :     case H_DCT:
    4871      180298 :         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
    4872      180300 :         fidtx8x4_avx2(in, in, bitcol);
    4873      180300 :         col_txfm_8x4_rounding(in, -shift[1]);
    4874      180300 :         transpose_4x8_avx2(in, outcoeff256);
    4875      180299 :         fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4876      180298 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4877      180299 :         write_buffer_4x8(outcoeff256, output);
    4878      180299 :         break;
    4879       82169 :     case V_ADST:
    4880       82169 :         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
    4881       82169 :         fadst8x4_avx2(in, in, bitcol, 1);
    4882       82169 :         col_txfm_8x4_rounding(in, -shift[1]);
    4883       82169 :         transpose_4x8_avx2(in, outcoeff256);
    4884       82169 :         fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4885       82169 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4886       82169 :         write_buffer_4x8(outcoeff256, output);
    4887       82169 :         break;
    4888       88118 :     case H_ADST:
    4889       88118 :         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
    4890       88118 :         fidtx8x4_avx2(in, in, bitcol);
    4891       88118 :         col_txfm_8x4_rounding(in, -shift[1]);
    4892       88118 :         transpose_4x8_avx2(in, outcoeff256);
    4893       88118 :         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4894       88118 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4895       88118 :         write_buffer_4x8(outcoeff256, output);
    4896       88117 :         break;
    4897       80870 :     case V_FLIPADST:
    4898       80870 :         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
    4899       80870 :         fadst8x4_avx2(in, in, bitcol, 1);
    4900       80870 :         col_txfm_8x4_rounding(in, -shift[1]);
    4901       80870 :         transpose_4x8_avx2(in, outcoeff256);
    4902       80870 :         fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4903       80869 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4904       80870 :         write_buffer_4x8(outcoeff256, output);
    4905       80870 :         break;
    4906       86352 :     case H_FLIPADST:
    4907       86352 :         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
    4908       86352 :         fidtx8x4_avx2(in, in, bitcol);
    4909       86352 :         col_txfm_8x4_rounding(in, -shift[1]);
    4910       86352 :         transpose_4x8_avx2(in, outcoeff256);
    4911       86352 :         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
    4912       86352 :         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], NewSqrt2);
    4913       86352 :         write_buffer_4x8(outcoeff256, output);
    4914       86352 :         break;
    4915           0 :     default: assert(0);
    4916             :     }
    4917             :     (void)bd;
    4918     7685610 : }
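Note: all sixteen cases above share one skeleton (load with flip flags,
column kernel, rounding, transpose, row kernel, rescale, store) and differ
only in the flip flags and the two 1-D kernels. A table-driven rewrite would
need uniform kernel signatures; since fdct4x8_avx2 takes three arguments
while fadst8x4_avx2 and the *_col_avx2 kernels take four, the switch avoids a
layer of wrapper thunks at the cost of repetition.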
    4919             : 
    4920     7357290 : void eb_av1_fwd_txfm2d_8x4_avx2(int16_t *input, int32_t *output, uint32_t stride,
    4921             :     TxType tx_type, uint8_t  bd)
    4922             : {
    4923             :     __m256i in[4];
    4924     7357290 :     __m256i *outcoeff256 = (__m256i *)output;
    4925     7357290 :     const int8_t *shift = fwd_txfm_shift_ls[TX_8X4];
    4926     7357290 :     const int32_t txw_idx = get_txw_idx(TX_8X4);
    4927     7357170 :     const int32_t txh_idx = get_txh_idx(TX_8X4);
    4928     7358570 :     int32_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    4929     7358570 :     int32_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    4930             : 
    4931     7358570 :     switch (tx_type) {
    4932     4264560 :     case DCT_DCT:
    4933     4264560 :         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
    4934     4265090 :         fdct4x8_row_avx2(in, in, bitcol, 1);
    4935     4265120 :         col_txfm_8x4_rounding(in, -shift[1]);
    4936     4265080 :         fdct4x8_avx2(in, outcoeff256, bitrow);
    4937     4265130 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    4938             :             NewSqrt2);
    4939     4264980 :         transpose_4x8_avx2(in, outcoeff256);
    4940     4264940 :         break;
    4941      697618 :     case ADST_DCT:
    4942      697618 :         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
    4943      697645 :         fadst4x8_row_avx2(in, in, bitcol, 1);
    4944      697642 :         col_txfm_8x4_rounding(in, -shift[1]);
    4945      697638 :         fdct4x8_avx2(in, outcoeff256, bitrow);
    4946      697648 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    4947             :             NewSqrt2);
    4948      697641 :         transpose_4x8_avx2(in, outcoeff256);
    4949      697639 :         break;
    4950      695527 :     case DCT_ADST:
    4951      695527 :         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
    4952      695550 :         fdct4x8_row_avx2(in, in, bitcol, 1);
    4953      695545 :         col_txfm_8x4_rounding(in, -shift[1]);
    4954      695544 :         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
    4955      695547 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    4956             :             NewSqrt2);
    4957      695544 :         transpose_4x8_avx2(in, outcoeff256);
    4958      695545 :         break;
    4959      415524 :     case ADST_ADST:
    4960      415524 :         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
    4961      415531 :         fadst4x8_row_avx2(in, in, bitcol, 1);
    4962      415535 :         col_txfm_8x4_rounding(in, -shift[1]);
    4963      415533 :         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
    4964      415536 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    4965             :             NewSqrt2);
    4966      415533 :         transpose_4x8_avx2(in, outcoeff256);
    4967      415532 :         break;
    4968       82951 :     case FLIPADST_DCT:
    4969       82951 :         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
    4970       82951 :         fadst4x8_row_avx2(in, in, bitcol, 1);
    4971       82951 :         col_txfm_8x4_rounding(in, -shift[1]);
    4972       82951 :         fdct4x8_avx2(in, outcoeff256, bitrow);
    4973       82951 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    4974             :             NewSqrt2);
    4975       82951 :         transpose_4x8_avx2(in, outcoeff256);
    4976       82951 :         break;
    4977       83444 :     case DCT_FLIPADST:
    4978       83444 :         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
    4979       83443 :         fdct4x8_row_avx2(in, in, bitcol, 1);
    4980       83444 :         col_txfm_8x4_rounding(in, -shift[1]);
    4981       83444 :         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
    4982       83444 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    4983             :             NewSqrt2);
    4984       83444 :         transpose_4x8_avx2(in, outcoeff256);
    4985       83444 :         break;
    4986       84209 :     case FLIPADST_FLIPADST:
    4987       84209 :         load_buffer_8x4_avx2(input, in, stride, 1, 1, shift[0]);
    4988       84210 :         fadst4x8_row_avx2(in, in, bitcol, 1);
    4989       84210 :         col_txfm_8x4_rounding(in, -shift[1]);
    4990       84210 :         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
    4991       84209 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    4992             :             NewSqrt2);
    4993       84210 :         transpose_4x8_avx2(in, outcoeff256);
    4994       84210 :         break;
    4995       85842 :     case ADST_FLIPADST:
    4996       85842 :         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
    4997       85842 :         fadst4x8_row_avx2(in, in, bitcol, 1);
    4998       85842 :         col_txfm_8x4_rounding(in, -shift[1]);
    4999       85842 :         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
    5000       85842 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    5001             :             NewSqrt2);
    5002       85842 :         transpose_4x8_avx2(in, outcoeff256);
    5003       85842 :         break;
    5004       84843 :     case FLIPADST_ADST:
    5005       84843 :         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
    5006       84842 :         fadst4x8_row_avx2(in, in, bitcol, 1);
    5007       84843 :         col_txfm_8x4_rounding(in, -shift[1]);
    5008       84843 :         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
    5009       84843 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    5010             :             NewSqrt2);
    5011       84843 :         transpose_4x8_avx2(in, outcoeff256);
    5012       84843 :         break;
    5013      171650 :     case IDTX:
    5014      171650 :         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
    5015      171653 :         fidtx4x8_row_avx2(in, in, bitcol, 1);
    5016      171653 :         col_txfm_8x4_rounding(in, -shift[1]);
    5017      171653 :         fidtx8x4_avx2(in, outcoeff256, bitrow);
    5018      171653 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    5019             :             NewSqrt2);
    5020      171652 :         transpose_4x8_avx2(in, outcoeff256);
    5021      171652 :         break;
    5022      174260 :     case V_DCT:
    5023      174260 :         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
    5024      174262 :         fdct4x8_row_avx2(in, in, bitcol, 1);
    5025      174262 :         col_txfm_8x4_rounding(in, -shift[1]);
    5026      174261 :         fidtx8x4_avx2(in, outcoeff256, bitrow);
    5027      174261 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    5028             :             NewSqrt2);
    5029      174260 :         transpose_4x8_avx2(in, outcoeff256);
    5030      174260 :         break;
    5031      178111 :     case H_DCT:
    5032      178111 :         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
    5033      178112 :         fidtx4x8_row_avx2(in, in, bitcol, 1);
    5034      178112 :         col_txfm_8x4_rounding(in, -shift[1]);
    5035      178113 :         fdct4x8_avx2(in, outcoeff256, bitrow);
    5036      178113 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    5037             :             NewSqrt2);
    5038      178112 :         transpose_4x8_avx2(in, outcoeff256);
    5039      178112 :         break;
    5040       85755 :     case V_ADST:
    5041       85755 :         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
    5042       85755 :         fadst4x8_row_avx2(in, in, bitcol, 1);
    5043       85755 :         col_txfm_8x4_rounding(in, -shift[1]);
    5044       85755 :         fidtx8x4_avx2(in, outcoeff256, bitrow);
    5045       85755 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    5046             :             NewSqrt2);
    5047       85755 :         transpose_4x8_avx2(in, outcoeff256);
    5048       85755 :         break;
    5049       85567 :     case H_ADST:
    5050       85567 :         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
    5051       85567 :         fidtx4x8_row_avx2(in, in, bitcol, 1);
    5052       85567 :         col_txfm_8x4_rounding(in, -shift[1]);
    5053       85567 :         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
    5054       85567 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    5055             :             NewSqrt2);
    5056       85567 :         transpose_4x8_avx2(in, outcoeff256);
    5057       85567 :         break;
    5058       83485 :     case V_FLIPADST:
    5059       83485 :         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
    5060       83485 :         fadst4x8_row_avx2(in, in, bitcol, 1);
    5061       83485 :         col_txfm_8x4_rounding(in, -shift[1]);
    5062       83485 :         fidtx8x4_avx2(in, outcoeff256, bitrow);
    5063       83485 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    5064             :             NewSqrt2);
    5065       83485 :         transpose_4x8_avx2(in, outcoeff256);
    5066       83485 :         break;
    5067       85222 :     case H_FLIPADST:
    5068       85222 :         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
    5069       85222 :         fidtx4x8_row_avx2(in, in, bitcol, 1);
    5070       85222 :         col_txfm_8x4_rounding(in, -shift[1]);
    5071       85222 :         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
    5072       85222 :         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2],
    5073             :             NewSqrt2);
    5074       85222 :         transpose_4x8_avx2(in, outcoeff256);
    5075       85222 :         break;
    5076           0 :     default: assert(0);
    5077             :     }
    5078             :     (void)bd;
    5079     7359000 : }
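Note: unlike eb_av1_fwd_txfm2d_4x8_avx2 above, which accumulates into a local
outcoeff256 array and copies out with write_buffer_4x8, this 8x4 variant
casts output to __m256i and finishes with a transpose directly into the
destination, saving the explicit store pass.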
    5080             : 
    5081     5564310 : void eb_av1_fwd_txfm2d_4x16_avx2(int16_t *input, int32_t *output, uint32_t stride,
    5082             :     TxType tx_type, uint8_t  bd)
    5083             : {
    5084             :     __m256i in[8];
    5085             :     __m256i outcoeff256[8];
    5086     5564310 :     const int8_t *shift = fwd_txfm_shift_ls[TX_4X16];
    5087     5564310 :     const int32_t txw_idx = get_txw_idx(TX_4X16);
    5088     5564260 :     const int32_t txh_idx = get_txh_idx(TX_4X16);
    5089     5564950 :     int32_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    5090     5564950 :     int32_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    5091             : 
    5092     5564950 :     switch (tx_type) {
    5093     2951090 :     case DCT_DCT:
    5094     2951090 :         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
    5095     2951320 :         fdct16x4_avx2(in, outcoeff256, bitcol);
    5096     2951280 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5097     2951270 :         transpose_4x16_avx2(outcoeff256, in);
    5098     8853730 :         for (int32_t i = 0; i < 2; i++)
    5099     5902410 :             fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5100     2951320 :         write_buffer_8x8(outcoeff256, output);
    5101     2951250 :         break;
    5102      342812 :     case ADST_DCT:
    5103      342812 :         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
    5104      342821 :         fadst16x4_avx2(in, outcoeff256, bitcol);
    5105      342817 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5106      342819 :         transpose_4x16_avx2(outcoeff256, in);
    5107     1028460 :         for (int32_t i = 0; i < 2; i++)
    5108      685636 :             fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5109      342821 :         write_buffer_8x8(outcoeff256, output);
    5110      342820 :         break;
    5111      354921 :     case DCT_ADST:
    5112      354921 :         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
    5113      354925 :         fdct16x4_avx2(in, outcoeff256, bitcol);
    5114      354926 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5115      354927 :         transpose_4x16_avx2(outcoeff256, in);
    5116     1064760 :         for (int32_t i = 0; i < 2; i++)
    5117      709839 :             fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5118      354926 :         write_buffer_8x8(outcoeff256, output);
    5119      354924 :         break;
    5120      267241 :     case ADST_ADST:
    5121      267241 :         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
    5122      267244 :         fadst16x4_avx2(in, outcoeff256, bitcol);
    5123      267242 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5124      267244 :         transpose_4x16_avx2(outcoeff256, in);
    5125      801731 :         for (int32_t i = 0; i < 2; i++)
    5126      534487 :             fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5127      267244 :         write_buffer_8x8(outcoeff256, output);
    5128      267242 :         break;
    5129      118347 :     case FLIPADST_DCT:
    5130      118347 :         load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
    5131      118347 :         fadst16x4_avx2(in, outcoeff256, bitcol);
    5132      118348 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5133      118347 :         transpose_4x16_avx2(outcoeff256, in);
    5134      355043 :         for (int32_t i = 0; i < 2; i++)
    5135      236695 :             fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5136      118348 :         write_buffer_8x8(outcoeff256, output);
    5137      118348 :         break;
    5138      118830 :     case DCT_FLIPADST:
    5139      118830 :         load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
    5140      118830 :         fdct16x4_avx2(in, outcoeff256, bitcol);
    5141      118830 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5142      118830 :         transpose_4x16_avx2(outcoeff256, in);
    5143      356490 :         for (int32_t i = 0; i < 2; i++)
    5144      237660 :             fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5145      118830 :         write_buffer_8x8(outcoeff256, output);
    5146      118830 :         break;
    5147      118952 :     case FLIPADST_FLIPADST:
    5148      118952 :         load_buffer_4x16_avx2(input, in, stride, 1, 1, shift[0]);
    5149      118952 :         fadst16x4_avx2(in, outcoeff256, bitcol);
    5150      118952 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5151      118952 :         transpose_4x16_avx2(outcoeff256, in);
    5152      356852 :         for (int32_t i = 0; i < 2; i++)
    5153      237902 :             fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5154      118950 :         write_buffer_8x8(outcoeff256, output);
    5155      118951 :         break;
    5156      119179 :     case ADST_FLIPADST:
    5157      119179 :         load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
    5158      119179 :         fadst16x4_avx2(in, outcoeff256, bitcol);
    5159      119179 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5160      119179 :         transpose_4x16_avx2(outcoeff256, in);
    5161      357537 :         for (int32_t i = 0; i < 2; i++)
    5162      238358 :             fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5163      119179 :         write_buffer_8x8(outcoeff256, output);
    5164      119179 :         break;
    5165      119410 :     case FLIPADST_ADST:
    5166      119410 :         load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
    5167      119410 :         fadst16x4_avx2(in, outcoeff256, bitcol);
    5168      119410 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5169      119410 :         transpose_4x16_avx2(outcoeff256, in);
    5170      358230 :         for (int32_t i = 0; i < 2; i++)
    5171      238820 :             fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5172      119410 :         write_buffer_8x8(outcoeff256, output);
    5173      119410 :         break;
    5174      193807 :     case IDTX:
    5175      193807 :         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
    5176      193809 :         fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
    5177      193809 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5178      193809 :         transpose_4x16_avx2(outcoeff256, in);
    5179      581427 :         for (int32_t i = 0; i < 2; i++)
    5180      387618 :             fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5181      193809 :         write_buffer_8x8(outcoeff256, output);
    5182      193809 :         break;
    5183      185717 :     case V_DCT:
    5184      185717 :         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
    5185      185718 :         fdct16x4_avx2(in, outcoeff256, bitcol);
    5186      185717 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5187      185717 :         transpose_4x16_avx2(outcoeff256, in);
    5188      557149 :         for (int32_t i = 0; i < 2; i++)
    5189      371433 :             fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5190      185716 :         write_buffer_8x8(outcoeff256, output);
    5191      185718 :         break;
    5192      197130 :     case H_DCT:
    5193      197130 :         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
    5194      197131 :         fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
    5195      197130 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5196      197130 :         transpose_4x16_avx2(outcoeff256, in);
    5197      591390 :         for (int32_t i = 0; i < 2; i++)
    5198      394259 :             fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5199      197131 :         write_buffer_8x8(outcoeff256, output);
    5200      197131 :         break;
    5201      118780 :     case V_ADST:
    5202      118780 :         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
    5203      118780 :         fadst16x4_avx2(in, outcoeff256, bitcol);
    5204      118780 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5205      118780 :         transpose_4x16_avx2(outcoeff256, in);
    5206      356340 :         for (int32_t i = 0; i < 2; i++)
    5207      237560 :             fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5208      118780 :         write_buffer_8x8(outcoeff256, output);
    5209      118780 :         break;
    5210      120282 :     case H_ADST:
    5211      120282 :         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
    5212      120282 :         fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
    5213      120282 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5214      120282 :         transpose_4x16_avx2(outcoeff256, in);
    5215      360846 :         for (int32_t i = 0; i < 2; i++)
    5216      240564 :             fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5217      120282 :         write_buffer_8x8(outcoeff256, output);
    5218      120282 :         break;
    5219      118469 :     case V_FLIPADST:
    5220      118469 :         load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
    5221      118469 :         fadst16x4_avx2(in, outcoeff256, bitcol);
    5222      118468 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5223      118468 :         transpose_4x16_avx2(outcoeff256, in);
    5224      355406 :         for (int32_t i = 0; i < 2; i++)
    5225      236937 :             fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5226      118469 :         write_buffer_8x8(outcoeff256, output);
    5227      118469 :         break;
    5228      119980 :     case H_FLIPADST:
    5229      119980 :         load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
    5230      119981 :         fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
    5231      119981 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5232      119981 :         transpose_4x16_avx2(outcoeff256, in);
    5233      359943 :         for (int32_t i = 0; i < 2; i++)
    5234      239962 :             fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
    5235      119981 :         write_buffer_8x8(outcoeff256, output);
    5236      119981 :         break;
    5237           0 :     default: assert(0);
    5238             :     }
    5239             :     (void)bd;
    5240     5565120 : }
    5241             : 
    5242     5777380 : void eb_av1_fwd_txfm2d_16x4_avx2(int16_t *input, int32_t *output, uint32_t stride,
    5243             :     TxType tx_type, uint8_t  bd) {
    5244             :     __m256i in[8];
    5245     5777380 :     __m256i *outcoeff256 = (__m256i *)output;
    5246     5777380 :     const int8_t *shift = fwd_shift_16x4;
    5247     5777380 :     const int32_t txw_idx = get_txw_idx(TX_16X4);
    5248     5777320 :     const int32_t txh_idx = get_txh_idx(TX_16X4);
    5249     5778010 :     int32_t bitcol = fwd_cos_bit_col[txw_idx][txh_idx];
    5250     5778010 :     int32_t bitrow = fwd_cos_bit_row[txw_idx][txh_idx];
    5251             : 
    5252     5778010 :     switch (tx_type) {
    5253     3049470 :     case DCT_DCT:
    5254     3049470 :         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
    5255     9148650 :         for (int32_t i = 0; i < 2; i++)
    5256     6098900 :             fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5257     3049750 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5258     3049630 :         fdct16x4_avx2(outcoeff256, in, bitrow);
    5259     3049600 :         transpose_4x16_avx2(in, outcoeff256);
    5260     3049610 :         break;
    5261      363028 :     case ADST_DCT:
    5262      363028 :         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
    5263     1089090 :         for (int32_t i = 0; i < 2; i++)
    5264      726053 :             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5265      363035 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5266      363027 :         fdct16x4_avx2(outcoeff256, in, bitrow);
    5267      363028 :         transpose_4x16_avx2(in, outcoeff256);
    5268      363028 :         break;
    5269      356091 :     case DCT_ADST:
    5270      356091 :         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
    5271     1068270 :         for (int32_t i = 0; i < 2; i++)
    5272      712181 :             fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5273      356091 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5274      356090 :         fadst16x4_avx2(outcoeff256, in, bitrow);
    5275      356092 :         transpose_4x16_avx2(in, outcoeff256);
    5276      356091 :         break;
    5277      277087 :     case ADST_ADST:
    5278      277087 :         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
    5279      831259 :         for (int32_t i = 0; i < 2; i++)
    5280      554174 :             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5281      277085 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5282      277088 :         fadst16x4_avx2(outcoeff256, in, bitrow);
    5283      277087 :         transpose_4x16_avx2(in, outcoeff256);
    5284      277087 :         break;
    5285      125337 :     case FLIPADST_DCT:
    5286      125337 :         load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
    5287      376010 :         for (int32_t i = 0; i < 2; i++)
    5288      250670 :             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5289      125340 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5290      125337 :         fdct16x4_avx2(outcoeff256, in, bitrow);
    5291      125337 :         transpose_4x16_avx2(in, outcoeff256);
    5292      125337 :         break;
    5293      125096 :     case DCT_FLIPADST:
    5294      125096 :         load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
    5295      375288 :         for (int32_t i = 0; i < 2; i++)
    5296      250192 :             fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5297      125096 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5298      125096 :         fadst16x4_avx2(outcoeff256, in, bitrow);
    5299      125095 :         transpose_4x16_avx2(in, outcoeff256);
    5300      125096 :         break;
    5301      125734 :     case FLIPADST_FLIPADST:
    5302      125734 :         load_buffer_16x4_avx2(input, in, stride, 1, 1, shift[0]);
    5303      377206 :         for (int32_t i = 0; i < 2; i++)
    5304      251471 :             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5305      125735 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5306      125734 :         fadst16x4_avx2(outcoeff256, in, bitrow);
    5307      125736 :         transpose_4x16_avx2(in, outcoeff256);
    5308      125737 :         break;
    5309      125952 :     case ADST_FLIPADST:
    5310      125952 :         load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
    5311      377857 :         for (int32_t i = 0; i < 2; i++)
    5312      251905 :             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5313      125952 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5314      125953 :         fadst16x4_avx2(outcoeff256, in, bitrow);
    5315      125952 :         transpose_4x16_avx2(in, outcoeff256);
    5316      125952 :         break;
    5317      125992 :     case FLIPADST_ADST:
    5318      125992 :         load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
    5319      377976 :         for (int32_t i = 0; i < 2; i++)
    5320      251984 :             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5321      125992 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5322      125992 :         fadst16x4_avx2(outcoeff256, in, bitrow);
    5323      125992 :         transpose_4x16_avx2(in, outcoeff256);
    5324      125992 :         break;
    5325      204819 :     case IDTX:
    5326      204819 :         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
    5327      614459 :         for (int32_t i = 0; i < 2; i++)
    5328      409638 :             fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5329      204821 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5330      204819 :         fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
    5331      204820 :         transpose_4x16_avx2(in, outcoeff256);
    5332      204820 :         break;
    5333      199009 :     case V_DCT:
    5334      199009 :         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
    5335      597032 :         for (int32_t i = 0; i < 2; i++)
    5336      398021 :             fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5337      199011 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5338      199011 :         fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
    5339      199011 :         transpose_4x16_avx2(in, outcoeff256);
    5340      199011 :         break;
    5341      197798 :     case H_DCT:
    5342      197798 :         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
    5343      593396 :         for (int32_t i = 0; i < 2; i++)
    5344      395597 :             fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5345      197799 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5346      197799 :         fdct16x4_avx2(outcoeff256, in, bitrow);
    5347      197798 :         transpose_4x16_avx2(in, outcoeff256);
    5348      197798 :         break;
    5349      126143 :     case V_ADST:
    5350      126143 :         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
    5351      378429 :         for (int32_t i = 0; i < 2; i++)
    5352      252286 :             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5353      126143 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5354      126143 :         fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
    5355      126143 :         transpose_4x16_avx2(in, outcoeff256);
    5356      126143 :         break;
    5357      125542 :     case H_ADST:
    5358      125542 :         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
    5359      376626 :         for (int32_t i = 0; i < 2; i++)
    5360      251084 :             fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5361      125542 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5362      125542 :         fadst16x4_avx2(outcoeff256, in, bitrow);
    5363      125542 :         transpose_4x16_avx2(in, outcoeff256);
    5364      125542 :         break;
    5365      125586 :     case V_FLIPADST:
    5366      125586 :         load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
    5367      376758 :         for (int32_t i = 0; i < 2; i++)
    5368      251172 :             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5369      125586 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5370      125586 :         fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
    5371      125586 :         transpose_4x16_avx2(in, outcoeff256);
    5372      125586 :         break;
    5373      125330 :     case H_FLIPADST:
    5374      125330 :         load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
    5375      375992 :         for (int32_t i = 0; i < 2; i++)
    5376      250662 :             fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
    5377      125330 :         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
    5378      125331 :         fadst16x4_avx2(outcoeff256, in, bitrow);
    5379      125331 :         transpose_4x16_avx2(in, outcoeff256);
    5380      125331 :         break;
    5381           0 :     default: assert(0);
    5382             :     }
    5383             :     (void)bd;
    5384     5778160 : }
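
The sixteen tx_type cases in eb_av1_fwd_txfm2d_16x4_avx2 differ in only three inputs: the flip flags handed to load_buffer_16x4_avx2, which 4x8 column kernel is run twice over the block halves, and which row kernel follows the rounding. A table-driven sketch of that dispatch is below; the typedef, struct, and prototypes are assumptions inferred from the call sites, not definitions taken from this file.

    #include <immintrin.h>
    #include <stdint.h>

    /* Assumed prototypes, inferred from the call sites in the switch
     * above: (input vectors, output vectors, cos-bit precision,
     * column count). */
    void fdct4x8_row_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t num_cols);
    void fadst4x8_row_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t num_cols);
    void fidtx4x8_row_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t num_cols);

    /* Hypothetical per-tx_type configuration holding the only values
     * that vary from case to case in the switch. */
    typedef struct {
        int32_t ud_flip;  /* vertical flip (flipud) in the loader   */
        int32_t lr_flip;  /* horizontal flip (fliplr) in the loader */
        void (*col_fn)(const __m256i *, __m256i *, int8_t, int32_t);
    } Tx16x4Cfg;

    /* Two example entries mirroring the DCT_DCT and FLIPADST_DCT
     * cases above: */
    static const Tx16x4Cfg cfg_dct_dct      = { 0, 0, fdct4x8_row_avx2 };
    static const Tx16x4Cfg cfg_flipadst_dct = { 1, 0, fadst4x8_row_avx2 };

The file keeps the explicit switch instead, which gives the compiler direct calls it can inline; at the hit counts recorded in the left column, avoiding an indirect call per block is a defensible trade.
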

Generated by: LCOV version 1.14