LCOV - code coverage report
Current view: top level - ASM_AVX2 - highbd_inv_txfm_avx2.c (source / functions)
Test: coverage.info                    Date: 2019-11-25 17:38:06
Coverage:   Lines: 0 / 4911 (0.0 %)    Functions: 0 / 86 (0.0 %)

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : #include <assert.h>
      12             : #include <immintrin.h>
      13             : #include "EbDefinitions.h"
      14             : #include "aom_dsp_rtcd.h"
      15             : #include "EbTransforms.h"
      16             : #include "av1_inv_txfm_ssse3.h"
      17             : #include "txfm_common_avx2.h"
      18             : 
      19           0 : static INLINE void highbd_clamp_epi32(__m256i *x, int32_t bd) {
      20           0 :     const __m256i zero = _mm256_setzero_si256();
      21           0 :     const __m256i max = _mm256_set1_epi32((1 << bd) - 1);
      22             : 
      23           0 :     *x = _mm256_min_epi32(*x, max);
      24           0 :     *x = _mm256_max_epi32(*x, zero);
      25           0 : }
      26             : 
      27           0 : static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int32_t bd) {
      28           0 :     const __m256i zero = _mm256_setzero_si256();
      29           0 :     const __m256i one = _mm256_set1_epi16(1);
      30           0 :     const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
      31             :     __m256i clamped, mask;
      32             : 
      33           0 :     mask = _mm256_cmpgt_epi16(u, max);
      34           0 :     clamped = _mm256_andnot_si256(mask, u);
      35           0 :     mask = _mm256_and_si256(mask, max);
      36           0 :     clamped = _mm256_or_si256(mask, clamped);
      37           0 :     mask = _mm256_cmpgt_epi16(clamped, zero);
      38           0 :     clamped = _mm256_and_si256(clamped, mask);
      39             : 
      40           0 :     return clamped;
      41             : }
      42             : 
      43           0 : static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
      44             :     const __m256i *w1, const __m256i *n1,
      45             :     const __m256i *rounding, int32_t bit) {
      46             :     __m256i x, y;
      47             : 
      48           0 :     x = _mm256_mullo_epi32(*w0, *n0);
      49           0 :     y = _mm256_mullo_epi32(*w1, *n1);
      50           0 :     x = _mm256_add_epi32(x, y);
      51           0 :     x = _mm256_add_epi32(x, *rounding);
      52           0 :     x = _mm256_srai_epi32(x, bit);
      53           0 :     return x;
      54             : }
      55             : 
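/*
 * Illustrative note (not part of the measured source): per 32-bit lane,
 * half_btf_avx2() computes round_shift(w0 * n0 + w1 * n1, bit), the half
 * butterfly used by every inverse-transform stage in this file. A minimal
 * scalar sketch of that lane-wise operation, reusing the int32_t/int64_t
 * types from the headers included above; the 64-bit intermediate is only
 * for clarity, the AVX2 code assumes the products fit in 32 bits.
 */
static int32_t half_btf_scalar(int32_t w0, int32_t n0,
                               int32_t w1, int32_t n1, int32_t bit) {
    const int64_t sum = (int64_t)w0 * n0 + (int64_t)w1 * n1;
    /* The rounding offset is 1 << (bit - 1), the value callers pass as *rounding. */
    return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
}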
      56           0 : static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
      57             :     __m256i *out1, const __m256i *clamp_lo,
      58             :     const __m256i *clamp_hi) {
      59           0 :     __m256i a0 = _mm256_add_epi32(in0, in1);
      60           0 :     __m256i a1 = _mm256_sub_epi32(in0, in1);
      61             : 
      62           0 :     a0 = _mm256_max_epi32(a0, *clamp_lo);
      63           0 :     a0 = _mm256_min_epi32(a0, *clamp_hi);
      64           0 :     a1 = _mm256_max_epi32(a1, *clamp_lo);
      65           0 :     a1 = _mm256_min_epi32(a1, *clamp_hi);
      66             : 
      67           0 :     *out0 = a0;
      68           0 :     *out1 = a1;
      69           0 : }
      70             : 
      71           0 : static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1,
      72             :     __m256i *out0, __m256i *out1) {
      73           0 :     __m256i a0 = _mm256_add_epi32(in0, in1);
      74           0 :     __m256i a1 = _mm256_sub_epi32(in0, in1);
      75             : 
      76           0 :     *out0 = a0;
      77           0 :     *out1 = a1;
      78           0 : }
      79             : 
      80           0 : static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
      81             :     __m256i *out0, __m256i *out1,
      82             :     const __m256i *clamp_lo, const __m256i *clamp_hi,
      83             :     int32_t shift) {
      84           0 :     __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
      85           0 :     __m256i in0_w_offset = _mm256_add_epi32(in0, offset);
      86           0 :     __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
      87           0 :     __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
      88             : 
      89           0 :     a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
      90           0 :     a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
      91             : 
      92           0 :     a0 = _mm256_max_epi32(a0, *clamp_lo);
      93           0 :     a0 = _mm256_min_epi32(a0, *clamp_hi);
      94           0 :     a1 = _mm256_max_epi32(a1, *clamp_lo);
      95           0 :     a1 = _mm256_min_epi32(a1, *clamp_hi);
      96             : 
      97           0 :     *out0 = a0;
      98           0 :     *out1 = a1;
      99           0 : }
     100             : 
     101           0 : static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
     102             :     __m256i *out1, const __m256i *clamp_lo,
     103             :     const __m256i *clamp_hi, int32_t shift) {
     104           0 :     __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
     105           0 :     __m256i a0 = _mm256_add_epi32(offset, in0);
     106           0 :     __m256i a1 = _mm256_sub_epi32(offset, in1);
     107             : 
     108           0 :     a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
     109           0 :     a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
     110             : 
     111           0 :     a0 = _mm256_max_epi32(a0, *clamp_lo);
     112           0 :     a0 = _mm256_min_epi32(a0, *clamp_hi);
     113           0 :     a1 = _mm256_max_epi32(a1, *clamp_lo);
     114           0 :     a1 = _mm256_min_epi32(a1, *clamp_hi);
     115             : 
     116           0 :     *out0 = a0;
     117           0 :     *out1 = a1;
     118           0 : }
     119             : 
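/*
 * Illustrative note (not part of the measured source): the addsub_* helpers
 * above are lane-wise butterflies. addsub_avx2() returns (in0 + in1, in0 - in1)
 * clamped to [clamp_lo, clamp_hi]; addsub_shift_avx2() additionally applies a
 * rounding right shift before clamping. A minimal scalar sketch of one lane of
 * addsub_shift_avx2():
 */
static void addsub_shift_scalar(int32_t in0, int32_t in1,
                                int32_t *out0, int32_t *out1,
                                int32_t clamp_lo, int32_t clamp_hi,
                                int32_t shift) {
    /* The rounding offset is folded into in0, exactly as in the AVX2 version. */
    const int32_t offset = (1 << shift) >> 1;
    int32_t a0 = (in0 + offset + in1) >> shift;
    int32_t a1 = (in0 + offset - in1) >> shift;
    /* Clamp to the intermediate-precision range of the current stage. */
    a0 = a0 < clamp_lo ? clamp_lo : (a0 > clamp_hi ? clamp_hi : a0);
    a1 = a1 < clamp_lo ? clamp_lo : (a1 > clamp_hi ? clamp_hi : a1);
    *out0 = a0;
    *out1 = a1;
}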
     120           0 : static INLINE void idct32_stage4_avx2(
     121             :     __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
     122             :     const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
     123             :     const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
     124             :     const __m256i *rounding, int32_t bit) {
     125             :     __m256i temp1, temp2;
     126           0 :     temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
     127           0 :     bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
     128           0 :     bf1[17] = temp1;
     129             : 
     130           0 :     temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
     131           0 :     bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
     132           0 :     bf1[18] = temp2;
     133             : 
     134           0 :     temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
     135           0 :     bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
     136           0 :     bf1[21] = temp1;
     137             : 
     138           0 :     temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
     139           0 :     bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
     140           0 :     bf1[22] = temp2;
     141           0 : }
     142             : 
     143           0 : static INLINE void idct32_stage5_avx2(
     144             :     __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
     145             :     const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
     146             :     const __m256i *clamp_hi, const __m256i *rounding, int32_t bit) {
     147             :     __m256i temp1, temp2;
     148           0 :     temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
     149           0 :     bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
     150           0 :     bf1[9] = temp1;
     151             : 
     152           0 :     temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
     153           0 :     bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
     154           0 :     bf1[10] = temp2;
     155             : 
     156           0 :     addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
     157           0 :     addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
     158           0 :     addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
     159           0 :     addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
     160           0 :     addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
     161           0 :     addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
     162           0 :     addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
     163           0 :     addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
     164           0 : }
     165             : 
     166           0 : static INLINE void idct32_stage6_avx2(
     167             :     __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
     168             :     const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
     169             :     const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
     170             :     const __m256i *rounding, int32_t bit) {
     171             :     __m256i temp1, temp2;
     172           0 :     temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
     173           0 :     bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
     174           0 :     bf1[5] = temp1;
     175             : 
     176           0 :     addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
     177           0 :     addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
     178           0 :     addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
     179           0 :     addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
     180             : 
     181           0 :     temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
     182           0 :     bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
     183           0 :     bf1[18] = temp1;
     184           0 :     temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
     185           0 :     bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
     186           0 :     bf1[19] = temp2;
     187           0 :     temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
     188           0 :     bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
     189           0 :     bf1[20] = temp1;
     190           0 :     temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
     191           0 :     bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
     192           0 :     bf1[21] = temp2;
     193           0 : }
     194             : 
     195           0 : static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
     196             :     const __m256i *cospi32,
     197             :     const __m256i *clamp_lo,
     198             :     const __m256i *clamp_hi,
     199             :     const __m256i *rounding, int32_t bit) {
     200             :     __m256i temp1, temp2;
     201           0 :     addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
     202           0 :     addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
     203           0 :     addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
     204           0 :     addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
     205             : 
     206           0 :     temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
     207           0 :     bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
     208           0 :     bf1[10] = temp1;
     209           0 :     temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
     210           0 :     bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
     211           0 :     bf1[11] = temp2;
     212             : 
     213           0 :     addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
     214           0 :     addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
     215           0 :     addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
     216           0 :     addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
     217           0 :     addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
     218           0 :     addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
     219           0 :     addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
     220           0 :     addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
     221           0 : }
     222             : 
     223           0 : static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
     224             :     const __m256i *cospi32,
     225             :     const __m256i *clamp_lo,
     226             :     const __m256i *clamp_hi,
     227             :     const __m256i *rounding, int32_t bit) {
     228             :     __m256i temp1, temp2;
     229           0 :     addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
     230           0 :     addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
     231           0 :     addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
     232           0 :     addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
     233           0 :     addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
     234           0 :     addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
     235           0 :     addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
     236           0 :     addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
     237             : 
     238           0 :     temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
     239           0 :     bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
     240           0 :     bf1[20] = temp1;
     241           0 :     temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
     242           0 :     bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
     243           0 :     bf1[21] = temp2;
     244           0 :     temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
     245           0 :     bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
     246           0 :     bf1[22] = temp1;
     247           0 :     temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
     248           0 :     bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
     249           0 :     bf1[23] = temp2;
     250           0 : }
     251             : 
     252           0 : static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
     253             :     const int32_t do_cols, const int32_t bd,
     254             :     const int32_t out_shift,
     255             :     const int32_t log_range) {
     256           0 :     if (do_cols) {
     257           0 :         addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31);
     258           0 :         addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30);
     259           0 :         addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29);
     260           0 :         addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28);
     261           0 :         addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27);
     262           0 :         addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26);
     263           0 :         addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25);
     264           0 :         addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24);
     265           0 :         addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23);
     266           0 :         addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22);
     267           0 :         addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21);
     268           0 :         addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20);
     269           0 :         addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19);
     270           0 :         addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18);
     271           0 :         addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17);
     272           0 :         addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16);
     273             :     }
     274             :     else {
     275           0 :         const int32_t log_range_out = AOMMAX(16, bd + 6);
     276           0 :         const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
     277             :             -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
     278           0 :         const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
     279             :             (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
     280             : 
     281           0 :         addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
     282             :             &clamp_hi_out, out_shift);
     283           0 :         addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
     284             :             &clamp_hi_out, out_shift);
     285           0 :         addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
     286             :             &clamp_hi_out, out_shift);
     287           0 :         addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
     288             :             &clamp_hi_out, out_shift);
     289           0 :         addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
     290             :             &clamp_hi_out, out_shift);
     291           0 :         addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
     292             :             &clamp_hi_out, out_shift);
     293           0 :         addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
     294             :             &clamp_hi_out, out_shift);
     295           0 :         addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
     296             :             &clamp_hi_out, out_shift);
     297           0 :         addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
     298             :             &clamp_hi_out, out_shift);
     299           0 :         addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
     300             :             &clamp_hi_out, out_shift);
     301           0 :         addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
     302             :             &clamp_hi_out, out_shift);
     303           0 :         addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
     304             :             &clamp_hi_out, out_shift);
     305           0 :         addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
     306             :             &clamp_hi_out, out_shift);
     307           0 :         addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
     308             :             &clamp_hi_out, out_shift);
     309           0 :         addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
     310             :             &clamp_hi_out, out_shift);
     311           0 :         addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
     312             :             &clamp_hi_out, out_shift);
     313             :     }
     314           0 : }
     315             : 
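/*
 * Illustrative note (not part of the measured source): when do_cols is zero,
 * the final stage shifts the results down by out_shift, so the clamp interval
 * is the tighter of the post-shift intermediate range and the output range of
 * AOMMAX(16, bd + 6) bits. A scalar sketch of how the clamp_lo_out/clamp_hi_out
 * bounds are formed, mirroring the expressions in idct32_stage9_avx2() above:
 */
static void idct_out_clamp_bounds_scalar(int32_t bd, int32_t log_range,
                                         int32_t out_shift,
                                         int32_t *clamp_lo, int32_t *clamp_hi) {
    const int32_t log_range_out = (bd + 6 > 16) ? (bd + 6) : 16; /* AOMMAX(16, bd + 6) */
    const int32_t lo_out = -(1 << (log_range_out - 1));
    const int32_t lo_mid = -(1 << (log_range - 1 - out_shift));
    const int32_t hi_out = (1 << (log_range_out - 1)) - 1;
    const int32_t hi_mid = (1 << (log_range - 1 - out_shift));
    *clamp_lo = (lo_out > lo_mid) ? lo_out : lo_mid; /* AOMMAX of the lower bounds */
    *clamp_hi = (hi_out < hi_mid) ? hi_out : hi_mid; /* AOMMIN of the upper bounds */
}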
     316           0 : static INLINE void idct64_stage8_avx2(
     317             :     __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
     318             :     const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
     319             :     const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
     320             :     const __m256i *rnding, int32_t bit) {
     321             :     int32_t i;
     322             :     __m256i temp1, temp2, temp3, temp4;
     323           0 :     temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
     324           0 :     u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
     325           0 :     u[10] = temp1;
     326           0 :     temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
     327           0 :     u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
     328           0 :     u[11] = temp2;
     329             : 
     330           0 :     for (i = 16; i < 20; ++i) {
     331           0 :         addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
     332           0 :         addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
     333             :     }
     334             : 
     335           0 :     temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
     336           0 :     temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
     337           0 :     temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
     338           0 :     temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
     339           0 :     u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
     340           0 :     u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
     341           0 :     u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
     342           0 :     u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
     343           0 :     u[36] = temp1;
     344           0 :     u[37] = temp2;
     345           0 :     u[38] = temp3;
     346           0 :     u[39] = temp4;
     347             : 
     348           0 :     temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
     349           0 :     temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
     350           0 :     temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
     351           0 :     temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
     352           0 :     u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
     353           0 :     u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
     354           0 :     u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
     355           0 :     u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
     356           0 :     u[40] = temp1;
     357           0 :     u[41] = temp2;
     358           0 :     u[42] = temp3;
     359           0 :     u[43] = temp4;
     360           0 : }
     361             : 
     362           0 : static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
     363             :     const __m256i *cospi32,
     364             :     const __m256i *clamp_lo,
     365             :     const __m256i *clamp_hi,
     366             :     const __m256i *rnding, int32_t bit) {
     367             :     int32_t i;
     368             :     __m256i temp1, temp2, temp3, temp4;
     369           0 :     for (i = 0; i < 8; ++i)
     370           0 :         addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
     371           0 :     temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
     372           0 :     temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
     373           0 :     temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
     374           0 :     temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
     375           0 :     u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
     376           0 :     u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
     377           0 :     u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
     378           0 :     u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
     379           0 :     u[20] = temp1;
     380           0 :     u[21] = temp2;
     381           0 :     u[22] = temp3;
     382           0 :     u[23] = temp4;
     383           0 :     for (i = 32; i < 40; i++)
     384           0 :         addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
     385           0 :     for (i = 48; i < 56; i++)
     386           0 :         addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
     387           0 : }
     388             : 
     389           0 : static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
     390             :     const __m256i *cospi32,
     391             :     const __m256i *clamp_lo,
     392             :     const __m256i *clamp_hi,
     393             :     const __m256i *rnding, int32_t bit) {
     394             :     __m256i temp1, temp2, temp3, temp4;
     395           0 :     for (int32_t i = 0; i < 16; i++)
     396           0 :         addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
     397           0 :     temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
     398           0 :     temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
     399           0 :     temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
     400           0 :     temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
     401           0 :     u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
     402           0 :     u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
     403           0 :     u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
     404           0 :     u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
     405           0 :     u[40] = temp1;
     406           0 :     u[41] = temp2;
     407           0 :     u[42] = temp3;
     408           0 :     u[43] = temp4;
     409             : 
     410           0 :     temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
     411           0 :     temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
     412           0 :     temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
     413           0 :     temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
     414           0 :     u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
     415           0 :     u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
     416           0 :     u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
     417           0 :     u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
     418           0 :     u[44] = temp1;
     419           0 :     u[45] = temp2;
     420           0 :     u[46] = temp3;
     421           0 :     u[47] = temp4;
     422           0 : }
     423             : 
     424           0 : static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int32_t do_cols,
     425             :     int32_t bd, int32_t out_shift,
     426             :     const int32_t log_range) {
     427           0 :     if (do_cols) {
     428           0 :         for (int32_t i = 0; i < 32; i++)
     429           0 :             addsub_no_clamp_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
     430             :     }
     431             :     else {
     432           0 :         const int32_t log_range_out = AOMMAX(16, bd + 6);
     433           0 :         const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
     434             :             -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
     435           0 :         const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
     436             :             (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
     437             : 
     438           0 :         for (int32_t i = 0; i < 32; i++) {
     439           0 :             addsub_shift_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
     440             :                 &clamp_lo_out, &clamp_hi_out, out_shift);
     441             :         }
     442             :     }
     443           0 : }
     444             : 
     445           0 : static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
     446             :     __m256i u0, u1, u2, u3, u4, u5, u6, u7;
     447             :     __m256i x0, x1;
     448             : 
     449           0 :     u0 = _mm256_unpacklo_epi32(in[7], in[6]);
     450           0 :     u1 = _mm256_unpackhi_epi32(in[7], in[6]);
     451             : 
     452           0 :     u2 = _mm256_unpacklo_epi32(in[5], in[4]);
     453           0 :     u3 = _mm256_unpackhi_epi32(in[5], in[4]);
     454             : 
     455           0 :     u4 = _mm256_unpacklo_epi32(in[3], in[2]);
     456           0 :     u5 = _mm256_unpackhi_epi32(in[3], in[2]);
     457             : 
     458           0 :     u6 = _mm256_unpacklo_epi32(in[1], in[0]);
     459           0 :     u7 = _mm256_unpackhi_epi32(in[1], in[0]);
     460             : 
     461           0 :     x0 = _mm256_unpacklo_epi64(u0, u2);
     462           0 :     x1 = _mm256_unpacklo_epi64(u4, u6);
     463           0 :     out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
     464           0 :     out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
     465             : 
     466           0 :     x0 = _mm256_unpackhi_epi64(u0, u2);
     467           0 :     x1 = _mm256_unpackhi_epi64(u4, u6);
     468           0 :     out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
     469           0 :     out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
     470             : 
     471           0 :     x0 = _mm256_unpacklo_epi64(u1, u3);
     472           0 :     x1 = _mm256_unpacklo_epi64(u5, u7);
     473           0 :     out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
     474           0 :     out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
     475             : 
     476           0 :     x0 = _mm256_unpackhi_epi64(u1, u3);
     477           0 :     x1 = _mm256_unpackhi_epi64(u5, u7);
     478           0 :     out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
     479           0 :     out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
     480           0 : }
     481             : 
     482           0 : static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
     483             :     const int32_t bd) {
     484           0 :     __m256i x0 = pred;
     485           0 :     x0 = _mm256_add_epi32(res, x0);
     486           0 :     x0 = _mm256_packus_epi32(x0, x0);
     487           0 :     x0 = _mm256_permute4x64_epi64(x0, 0xd8);
     488           0 :     x0 = highbd_clamp_epi16_avx2(x0, bd);
     489           0 :     return x0;
     490             : }
     491             : 
     492           0 : static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in,
     493             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
     494             :     int32_t flipud, int32_t height, const int32_t bd) {
     495           0 :     int32_t j = flipud ? (height - 1) : 0;
     496             :     __m128i temp;
     497           0 :     const int32_t step = flipud ? -1 : 1;
     498           0 :     for (int32_t i = 0; i < height; ++i, j += step) {
     499           0 :         temp = _mm_loadu_si128((__m128i const *)(output_r + i * stride_r));
     500           0 :         __m256i v = _mm256_cvtepi16_epi32(temp);
     501           0 :         __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
     502           0 :         __m128i u1 = _mm256_castsi256_si128(u);
     503           0 :         _mm_storeu_si128((__m128i *)(output_w + i * stride_w), u1);
     504             :     }
     505           0 : }
     506             : 
     507           0 : static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
     508             :     __m256i res0, __m256i res1,
     509             :     const int32_t bd) {
     510           0 :     __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
     511           0 :     __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
     512             : 
     513           0 :     x0 = _mm256_add_epi32(res0, x0);
     514           0 :     x1 = _mm256_add_epi32(res1, x1);
     515           0 :     x0 = _mm256_packus_epi32(x0, x1);
     516           0 :     x0 = _mm256_permute4x64_epi64(x0, 0xd8);
     517           0 :     x0 = highbd_clamp_epi16_avx2(x0, bd);
     518           0 :     return x0;
     519             : }
     520             : 
     521           0 : static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in,
     522             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
     523             :     int32_t flipud, int32_t height, const int32_t bd) {
     524           0 :     int32_t j = flipud ? (height - 1) : 0;
     525           0 :     const int32_t step = flipud ? -1 : 1;
     526           0 :     for (int32_t i = 0; i < height; ++i, j += step) {
     527           0 :         __m256i v = _mm256_loadu_si256((__m256i const *)(output_r + i * stride_r));
     528           0 :         __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
     529             : 
     530           0 :         _mm256_storeu_si256((__m256i *)(output_w + i * stride_w), u);
     531             :     }
     532           0 : }
     533             : 
     534           0 : static INLINE void load_buffer_4x4(const int32_t *coeff, __m256i *in) {
     535           0 :     in[0] = _mm256_loadu_si256((const __m256i *)coeff);
     536           0 :     in[1] = _mm256_loadu_si256((const __m256i *)(coeff + 8));
     537           0 : }
     538             : 
     539           0 : static INLINE void write_buffer_4x4(__m256i *in,
     540             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
     541             :     int32_t fliplr, int32_t flipud, int32_t bd) {
     542             :     __m256i u0, x0, x1, v0, v1;
     543           0 :     const __m256i zero = _mm256_setzero_si256();
     544             : 
     545           0 :     if (fliplr) {
     546           0 :         in[0] = _mm256_shuffle_epi32(in[0], 0x1B);
     547           0 :         in[1] = _mm256_shuffle_epi32(in[1], 0x1B);
     548             :     }
     549             : 
     550           0 :     if (flipud) {
     551           0 :         u0 = _mm256_set_epi64x(*(uint64_t*)(output_r + 0 * stride_r),
     552           0 :             *(uint64_t*)(output_r + 2 * stride_r),
     553           0 :             *(uint64_t*)(output_r + 1 * stride_r),
     554           0 :             *(uint64_t*)(output_r + 3 * stride_r));
     555             :     }
     556             :     else {
      557             :         // Load 64-bit rows in ACBD order
     558           0 :         u0 = _mm256_set_epi64x(*(uint64_t*)(output_r + 3 * stride_r),
     559           0 :             *(uint64_t*)(output_r + 1 * stride_r),
     560           0 :             *(uint64_t*)(output_r + 2 * stride_r),
     561           0 :             *(uint64_t*)(output_r + 0 * stride_r));
     562             :     }
     563             : 
      564             :     // Unpack and swap 128-bit lanes from ACBD to ABCD
     565           0 :     x0 = _mm256_unpacklo_epi16(u0, zero);
     566           0 :     x1 = _mm256_unpackhi_epi16(u0, zero);
     567             : 
     568           0 :     v0 = _mm256_add_epi32(in[0], x0);
     569           0 :     v1 = _mm256_add_epi32(in[1], x1);
     570             : 
     571           0 :     highbd_clamp_epi32(&v0, bd);
     572           0 :     highbd_clamp_epi32(&v1, bd);
     573             : 
      574             :     // Pack and swap 128-bit lanes from ABCD back to ACBD
     575           0 :     v0 = _mm256_packus_epi32(v0, v1);
     576             : 
     577           0 :     if (flipud) {
     578           0 :         _mm_storel_epi64((__m128i *)(output_w + 3 * stride_w),
     579             :             _mm256_castsi256_si128(v0));
     580           0 :         _mm_storel_epi64((__m128i *)(output_w + 2 * stride_w),
     581           0 :             _mm256_extractf128_si256(v0, 0x1));
      582             :         // Move the next 64-bit rows into store position
     583           0 :         v0 = _mm256_permute4x64_epi64(v0, 1 + (3 << 4));
     584           0 :         _mm_storel_epi64((__m128i *)(output_w + 1 * stride_w),
     585             :             _mm256_castsi256_si128(v0));
     586           0 :         _mm_storel_epi64((__m128i *)(output_w + 0 * stride_w),
     587           0 :             _mm256_extractf128_si256(v0, 0x1));
     588             :     }
     589             :     else {
     590             :         // Store in order from ACBD to ABCD
     591           0 :         _mm_storel_epi64((__m128i *)(output_w + 0 * stride_w),
     592             :             _mm256_castsi256_si128(v0));
     593           0 :         _mm_storel_epi64((__m128i *)(output_w + 1 * stride_w),
     594           0 :             _mm256_extractf128_si256(v0, 0x1));
      595             :         // Move the next 64-bit rows into store position
     596           0 :         v0 = _mm256_permute4x64_epi64(v0, 1 + (3 << 4));
     597           0 :         _mm_storel_epi64((__m128i *)(output_w + 2 * stride_w),
     598             :             _mm256_castsi256_si128(v0));
     599           0 :         _mm_storel_epi64((__m128i *)(output_w + 3 * stride_w),
     600           0 :             _mm256_extractf128_si256(v0, 0x1));
     601             :     }
     602           0 : }
     603             : 
     604           0 : static INLINE void round_shift_4x4(__m256i *in, int32_t shift) {
     605           0 :     __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
     606             : 
     607           0 :     in[0] = _mm256_add_epi32(in[0], rnding);
     608           0 :     in[0] = _mm256_srai_epi32(in[0], shift);
     609           0 :     in[1] = _mm256_add_epi32(in[1], rnding);
     610           0 :     in[1] = _mm256_srai_epi32(in[1], shift);
     611           0 : }
     612             : 
     613           0 : static INLINE void iidentity4_and_round_shift_avx2(__m256i *input, int32_t shift)
     614             : {
      615             :     // The input fits in 18 bits, so it can be multiplied by NewSqrt2 in 32-bit space.
      616             :     // Fold round_shift(NewSqrt2Bits) and the following round_shift(shift) into one pass.
     617           0 :     const __m256i scalar = _mm256_set1_epi32(NewSqrt2);
     618           0 :     const __m256i rnding = _mm256_set1_epi32((1 << (NewSqrt2Bits - 1)) +
     619           0 :         (!!(shift) << (shift + NewSqrt2Bits - 1)));
     620             : 
     621           0 :     for (int32_t i = 0; i < 2; i++) {
     622           0 :         input[i] = _mm256_mullo_epi32(input[i], scalar);
     623           0 :         input[i] = _mm256_add_epi32(input[i], rnding);
     624           0 :         input[i] = _mm256_srai_epi32(input[i], NewSqrt2Bits + shift);
     625             :     }
     626           0 : }
     627             : 
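/*
 * Illustrative note (not part of the measured source): the identity transform
 * scales each coefficient by NewSqrt2, a fixed-point sqrt(2) with NewSqrt2Bits
 * fractional bits, and the rnding constant above merges the two rounding
 * offsets so only a single shift is needed (the !!(shift) factor drops the
 * second offset when shift is zero). A scalar sketch of one lane:
 */
static int32_t iidentity4_scalar(int32_t x, int32_t shift) {
    const int32_t rnding = (1 << (NewSqrt2Bits - 1)) +
                           (!!shift << (shift + NewSqrt2Bits - 1));
    /* One multiply, one add, one shift instead of two separate round_shifts. */
    return (x * NewSqrt2 + rnding) >> (NewSqrt2Bits + shift);
}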
     628           0 : static INLINE void idct4_row_avx2(__m256i *in, int8_t cos_bit) {
     629           0 :     const int32_t *cospi = cospi_arr(cos_bit);
     630           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
     631           0 :     const __m256i cospi32x16 = _mm256_blend_epi32(cospi32,
     632             :         _mm256_set1_epi32(cospi[16]), 0xAA);
     633           0 :     const __m256i cospi32x48 = _mm256_blend_epi32(cospi32,
     634             :         _mm256_set1_epi32(cospi[48]), 0xAA);
     635           0 :     const __m256i rnding = _mm256_set1_epi32((1 << (cos_bit - 1)));
     636           0 :     const __m256i minplus = _mm256_blend_epi32(_mm256_set1_epi32(-1),
     637             :         _mm256_set1_epi32(1), 0xAA);
     638             :     __m256i v0, v1, x, y;
     639             :     __m256i step[2];
     640             : 
     641           0 :     v0 = _mm256_unpacklo_epi64(in[0], in[1]);
     642           0 :     v1 = _mm256_unpackhi_epi64(in[0], in[1]);
     643             : 
     644           0 :     x = _mm256_mullo_epi32(cospi32x16, v0);
     645           0 :     y = _mm256_mullo_epi32(cospi32x48, v1);
     646           0 :     step[0] = _mm256_add_epi32(x, y);
     647             : 
     648           0 :     x = _mm256_mullo_epi32(cospi32x48, v0);
     649           0 :     y = _mm256_mullo_epi32(cospi32x16, v1);
     650           0 :     step[1] = _mm256_sub_epi32(x, y);
     651             : 
     652           0 :     step[0] = _mm256_add_epi32(step[0], rnding);
     653           0 :     step[0] = _mm256_srai_epi32(step[0], cos_bit);
     654           0 :     step[1] = _mm256_add_epi32(step[1], rnding);
     655           0 :     step[1] = _mm256_srai_epi32(step[1], cos_bit);
     656             : 
     657           0 :     v0 = _mm256_shuffle_epi32(step[0], 0xB1);
     658           0 :     v1 = _mm256_shuffle_epi32(step[1], 0xB1);
     659             : 
     660           0 :     v0 = _mm256_mullo_epi32(minplus, v0);
     661           0 :     v1 = _mm256_mullo_epi32(minplus, v1);
     662             : 
     663           0 :     v0 = _mm256_add_epi32(v0, step[0]);
     664           0 :     v1 = _mm256_add_epi32(v1, step[1]);
     665             : 
     666           0 :     v0 = _mm256_shuffle_epi32(v0, 0x2D);
     667           0 :     v1 = _mm256_shuffle_epi32(v1, 0x87);
     668             : 
     669           0 :     in[0] = _mm256_blend_epi32(v0, v1, 0x66);
     670             : 
     671           0 :     v0 = _mm256_blend_epi32(v0, v1, 0x99);
     672           0 :     in[1] = _mm256_shuffle_epi32(v0, 0xB1);
     673           0 : }
     674             : 
     675           0 : static INLINE void idct4_col_avx2(__m256i *in, int8_t cos_bit) {
     676           0 :     const int32_t *cospi = cospi_arr(cos_bit);
     677           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
     678           0 :     const __m256i cospi32x16 =
     679           0 :         _mm256_blend_epi32(_mm256_set1_epi32(cospi[16]), cospi32, 0x0F);
     680           0 :     const __m256i cospi32x48 =
     681           0 :         _mm256_blend_epi32(_mm256_set1_epi32(cospi[48]), cospi32, 0x0F);
     682           0 :     const __m256i rnding = _mm256_set1_epi32((1 << (cos_bit - 1)));
     683             :     __m256i x, y;
     684             :     __m256i step[2];
     685             : 
     686           0 :     x = _mm256_mullo_epi32(cospi32x16, in[0]);
     687           0 :     y = _mm256_mullo_epi32(cospi32x48, in[1]);
     688           0 :     step[0] = _mm256_add_epi32(x, y);
     689             : 
     690           0 :     x = _mm256_mullo_epi32(cospi32x48, in[0]);
     691           0 :     y = _mm256_mullo_epi32(cospi32x16, in[1]);
     692           0 :     step[1] = _mm256_sub_epi32(x, y);
     693             : 
     694           0 :     step[0] = _mm256_add_epi32(step[0], rnding);
     695           0 :     step[0] = _mm256_srai_epi32(step[0], cos_bit);
     696           0 :     step[1] = _mm256_add_epi32(step[1], rnding);
     697           0 :     step[1] = _mm256_srai_epi32(step[1], cos_bit);
     698             : 
     699           0 :     x = _mm256_permute2x128_si256(step[0], step[1], 0x20);
     700           0 :     y = _mm256_permute2x128_si256(step[0], step[1], 0x31);
     701           0 :     in[0] = _mm256_add_epi32(x, y);
     702             : 
     703           0 :     x = _mm256_permute2x128_si256(step[0], step[1], 0x02);
     704           0 :     y = _mm256_permute2x128_si256(step[0], step[1], 0x13);
     705           0 :     in[1] = _mm256_sub_epi32(x, y);
     706           0 : }
     707             : 
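/*
 * Illustrative note (not part of the measured source): idct4_row_avx2() and
 * idct4_col_avx2() are packed forms of the 4-point inverse DCT. A scalar
 * reference of that transform, using cospi_arr() from the headers above and
 * the hypothetical half_btf_scalar() sketch shown earlier:
 */
static void idct4_scalar(const int32_t *in, int32_t *out, int8_t cos_bit) {
    const int32_t *cospi = cospi_arr(cos_bit);
    /* Stage 1: even part from in[0]/in[2], odd part from in[1]/in[3]. */
    const int32_t s0 = half_btf_scalar(cospi[32], in[0],  cospi[32], in[2], cos_bit);
    const int32_t s1 = half_btf_scalar(cospi[32], in[0], -cospi[32], in[2], cos_bit);
    const int32_t s2 = half_btf_scalar(cospi[48], in[1], -cospi[16], in[3], cos_bit);
    const int32_t s3 = half_btf_scalar(cospi[16], in[1],  cospi[48], in[3], cos_bit);
    /* Stage 2: butterfly recombination. */
    out[0] = s0 + s3;
    out[1] = s1 + s2;
    out[2] = s1 - s2;
    out[3] = s0 - s3;
}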
     708           0 : static INLINE void iadst4_row_avx2(__m256i *in, int8_t cos_bit) {
     709           0 :     const int32_t bit = cos_bit;
     710           0 :     const int32_t *sinpi = sinpi_arr(bit);
     711           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
     712           0 :     const __m128i sinpi1 = _mm_set1_epi32((int32_t)sinpi[1]);
     713           0 :     const __m128i sinpi2 = _mm_set1_epi32((int32_t)sinpi[2]);
     714           0 :     const __m128i sinpi3 = _mm_set1_epi32((int32_t)sinpi[3]);
     715           0 :     const __m128i sinpi4 = _mm_set1_epi32((int32_t)sinpi[4]);
     716             :     __m128i t;
     717             :     __m128i s0, s1, s2, s3, s4, s5, s6, s7;
     718             :     __m128i x0, x1, x2, x3;
     719             :     __m128i u0, u1, u2, u3;
     720             :     __m256i y0;
     721             : 
     722           0 :     u0 = _mm256_extractf128_si256(in[0], 0x1);
     723           0 :     u1 = _mm256_extractf128_si256(in[1], 0x1);
     724             : 
     725           0 :     s0 = _mm_unpacklo_epi32(_mm256_castsi256_si128(in[0]), u0);
     726           0 :     s1 = _mm_unpackhi_epi32(_mm256_castsi256_si128(in[0]), u0);
     727           0 :     s2 = _mm_unpacklo_epi32(_mm256_castsi256_si128(in[1]), u1);
     728           0 :     s3 = _mm_unpackhi_epi32(_mm256_castsi256_si128(in[1]), u1);
     729             : 
     730           0 :     x0 = _mm_unpacklo_epi64(s0, s2);
     731           0 :     x1 = _mm_unpackhi_epi64(s0, s2);
     732           0 :     x2 = _mm_unpacklo_epi64(s1, s3);
     733           0 :     x3 = _mm_unpackhi_epi64(s1, s3);
     734             : 
     735           0 :     s0 = _mm_mullo_epi32(x0, sinpi1);
     736           0 :     s1 = _mm_mullo_epi32(x0, sinpi2);
     737           0 :     s2 = _mm_mullo_epi32(x1, sinpi3);
     738           0 :     s3 = _mm_mullo_epi32(x2, sinpi4);
     739           0 :     s4 = _mm_mullo_epi32(x2, sinpi1);
     740           0 :     s5 = _mm_mullo_epi32(x3, sinpi2);
     741           0 :     s6 = _mm_mullo_epi32(x3, sinpi4);
     742           0 :     t = _mm_sub_epi32(x0, x2);
     743           0 :     s7 = _mm_add_epi32(t, x3);
     744             : 
     745           0 :     t = _mm_add_epi32(s0, s3);
     746           0 :     s0 = _mm_add_epi32(t, s5);
     747           0 :     t = _mm_sub_epi32(s1, s4);
     748           0 :     s1 = _mm_sub_epi32(t, s6);
     749           0 :     u2 = _mm_mullo_epi32(s7, sinpi3);
     750             : 
     751           0 :     u0 = _mm_add_epi32(s0, s2);
     752           0 :     u1 = _mm_add_epi32(s1, s2);
     753           0 :     t = _mm_add_epi32(s0, s1);
     754           0 :     u3 = _mm_sub_epi32(t, s2);
     755             : 
     756           0 :     s0 = _mm_unpacklo_epi32(u0, u1);
     757           0 :     s1 = _mm_unpackhi_epi32(u0, u1);
     758           0 :     s2 = _mm_unpacklo_epi32(u2, u3);
     759           0 :     s3 = _mm_unpackhi_epi32(u2, u3);
     760             : 
     761           0 :     u0 = _mm_unpacklo_epi64(s0, s2);
     762           0 :     u1 = _mm_unpackhi_epi64(s0, s2);
     763           0 :     u2 = _mm_unpacklo_epi64(s1, s3);
     764           0 :     u3 = _mm_unpackhi_epi64(s1, s3);
     765             : 
     766           0 :     y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(u0), (u1), 0x1);
     767           0 :     y0 = _mm256_add_epi32(y0, rnding);
     768           0 :     in[0] = _mm256_srai_epi32(y0, bit);
     769             : 
     770           0 :     y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(u2), (u3), 0x1);
     771           0 :     y0 = _mm256_add_epi32(y0, rnding);
     772           0 :     in[1] = _mm256_srai_epi32(y0, bit);
     773           0 : }
     774             : 
     775           0 : static INLINE void iadst4_col_avx2(__m256i *in, int8_t cos_bit) {
     776           0 :     const int32_t bit = cos_bit;
     777           0 :     const int32_t *sinpi = sinpi_arr(bit);
     778           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
     779           0 :     const __m128i sinpi1 = _mm_set1_epi32((int32_t)sinpi[1]);
     780           0 :     const __m128i sinpi2 = _mm_set1_epi32((int32_t)sinpi[2]);
     781           0 :     const __m128i sinpi3 = _mm_set1_epi32((int32_t)sinpi[3]);
     782           0 :     const __m128i sinpi4 = _mm_set1_epi32((int32_t)sinpi[4]);
     783             :     __m128i t;
     784             :     __m128i s0, s1, s2, s3, s4, s5, s6, s7;
     785             :     __m128i x0, x1;
     786             :     __m128i u0, u1, u3;
     787             :     __m256i y0;
     788             : 
     789           0 :     x0 = _mm256_extractf128_si256(in[0], 0x1);
     790           0 :     x1 = _mm256_extractf128_si256(in[1], 0x1);
     791             : 
     792           0 :     s0 = _mm_mullo_epi32(_mm256_castsi256_si128(in[0]), sinpi1);
     793           0 :     s1 = _mm_mullo_epi32(_mm256_castsi256_si128(in[0]), sinpi2);
     794           0 :     s2 = _mm_mullo_epi32(x0, sinpi3);
     795           0 :     s3 = _mm_mullo_epi32(_mm256_castsi256_si128(in[1]), sinpi4);
     796           0 :     s4 = _mm_mullo_epi32(_mm256_castsi256_si128(in[1]), sinpi1);
     797           0 :     s5 = _mm_mullo_epi32(x1, sinpi2);
     798           0 :     s6 = _mm_mullo_epi32(x1, sinpi4);
     799           0 :     t = _mm_sub_epi32(_mm256_castsi256_si128(in[0]),
     800           0 :         _mm256_castsi256_si128(in[1]));
     801           0 :     s7 = _mm_add_epi32(t, x1);
     802             : 
     803           0 :     t = _mm_add_epi32(s0, s3);
     804           0 :     s0 = _mm_add_epi32(t, s5);
     805           0 :     t = _mm_sub_epi32(s1, s4);
     806           0 :     s1 = _mm_sub_epi32(t, s6);
     807           0 :     s3 = _mm_mullo_epi32(s7, sinpi3);
     808             : 
     809           0 :     u0 = _mm_add_epi32(s0, s2);
     810           0 :     u1 = _mm_add_epi32(s1, s2);
     811             : 
     812           0 :     t = _mm_add_epi32(s0, s1);
     813           0 :     u3 = _mm_sub_epi32(t, s2);
     814             : 
     815           0 :     y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(u0), (u1), 0x1);
     816           0 :     y0 = _mm256_add_epi32(y0, rnding);
     817           0 :     in[0] = _mm256_srai_epi32(y0, bit);
     818             : 
     819           0 :     y0 = _mm256_insertf128_si256(_mm256_castsi128_si256(s3), (u3), 0x1);
     820           0 :     y0 = _mm256_add_epi32(y0, rnding);
     821           0 :     in[1] = _mm256_srai_epi32(y0, bit);
     822           0 : }
     823             : 
     824           0 : void eb_av1_inv_txfm2d_add_4x4_avx2(const int32_t *input,
     825             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
     826             :     TxType tx_type, int32_t bd) {
     827             :     __m256i in[2];
     828           0 :     const int8_t *shift = eb_inv_txfm_shift_ls[TX_4X4];
     829           0 :     const int32_t txw_idx = get_txw_idx(TX_4X4);
     830           0 :     const int32_t txh_idx = get_txh_idx(TX_4X4);
     831             : 
     832           0 :     switch (tx_type) {
     833           0 :     case IDTX:
     834           0 :         load_buffer_4x4(input, in);
     835           0 :         iidentity4_and_round_shift_avx2(in, -shift[0]);
     836           0 :         iidentity4_and_round_shift_avx2(in, -shift[1]);
     837           0 :         write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
     838           0 :         break;
     839           0 :     case V_DCT:
     840           0 :         load_buffer_4x4(input, in);
     841           0 :         iidentity4_and_round_shift_avx2(in, -shift[0]);
     842           0 :         idct4_col_avx2(in, inv_cos_bit_col[txw_idx][txh_idx]);
     843           0 :         round_shift_4x4(in, -shift[1]);
     844           0 :         write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
     845           0 :         break;
     846           0 :     case H_DCT:
     847           0 :         load_buffer_4x4(input, in);
     848           0 :         idct4_row_avx2(in, inv_cos_bit_row[txw_idx][txh_idx]);
     849           0 :         iidentity4_and_round_shift_avx2(in, -shift[1]);
     850           0 :         write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
     851           0 :         break;
     852           0 :     case V_ADST:
     853           0 :         load_buffer_4x4(input, in);
     854           0 :         iidentity4_and_round_shift_avx2(in, -shift[0]);
     855           0 :         iadst4_col_avx2(in, inv_cos_bit_col[txw_idx][txh_idx]);
     856           0 :         round_shift_4x4(in, -shift[1]);
     857           0 :         write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
     858           0 :         break;
     859           0 :     case H_ADST:
     860           0 :         load_buffer_4x4(input, in);
     861           0 :         iadst4_row_avx2(in, inv_cos_bit_row[txw_idx][txh_idx]);
     862           0 :         iidentity4_and_round_shift_avx2(in, -shift[1]);
     863           0 :         write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
     864           0 :         break;
     865           0 :     case V_FLIPADST:
     866           0 :         load_buffer_4x4(input, in);
     867           0 :         iidentity4_and_round_shift_avx2(in, -shift[0]);
     868           0 :         iadst4_col_avx2(in, inv_cos_bit_col[txw_idx][txh_idx]);
     869           0 :         round_shift_4x4(in, -shift[1]);
     870           0 :         write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 0, 1, bd);
     871           0 :         break;
     872           0 :     case H_FLIPADST:
     873           0 :         load_buffer_4x4(input, in);
     874           0 :         iadst4_row_avx2(in, inv_cos_bit_row[txw_idx][txh_idx]);
     875           0 :         iidentity4_and_round_shift_avx2(in, -shift[1]);
     876           0 :         write_buffer_4x4(in, output_r, stride_r, output_w, stride_w, 1, 0, bd);
     877           0 :         break;
     878           0 :     default:
     879           0 :         eb_av1_inv_txfm2d_add_4x4_sse4_1(input,
     880             :             output_r, stride_r, output_w, stride_w, tx_type, bd);
     881           0 :         break;
     882             :     }
     883           0 : }
     884             : 
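/*
 * A hedged summary (not from the library documentation): the
 * eb_av1_inv_txfm2d_add_*_avx2() dispatchers in this file only handle the
 * transform types in which at least one direction is an identity transform
 * (IDTX, V_DCT/H_DCT, V_ADST/H_ADST, V_FLIPADST/H_FLIPADST); every other
 * tx_type falls through to the SSE4.1 kernel in the default case.  The cosine
 * bit depths come from inv_cos_bit_row[]/inv_cos_bit_col[], and the two
 * entries of eb_inv_txfm_shift_ls[] are the round shifts applied after the
 * row stage and after the column stage, respectively.
 */
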
     885             : #define TRANSPOSE_4X4_AVX2(x0, x1, x2, x3, y0, y1, y2, y3) \
     886             :   do {                                                \
     887             :     __m256i u0, u1, u2, u3;                           \
     888             :     u0 = _mm256_unpacklo_epi32(x0, x1);                  \
     889             :     u1 = _mm256_unpackhi_epi32(x0, x1);                  \
     890             :     u2 = _mm256_unpacklo_epi32(x2, x3);                  \
     891             :     u3 = _mm256_unpackhi_epi32(x2, x3);                  \
     892             :     y0 = _mm256_unpacklo_epi64(u0, u2);                  \
     893             :     y1 = _mm256_unpackhi_epi64(u0, u2);                  \
     894             :     y2 = _mm256_unpacklo_epi64(u1, u3);                  \
     895             :     y3 = _mm256_unpackhi_epi64(u1, u3);                  \
     896             :     } while (0)
     897             : 
     898           0 : static INLINE void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
     899             :     __m256i out1[8];
     900           0 :     TRANSPOSE_4X4_AVX2(
     901             :         in[0], in[1], in[2], in[3], out1[0], out1[1], out1[4], out1[5]);
     902           0 :     TRANSPOSE_4X4_AVX2(
     903             :         in[4], in[5], in[6], in[7], out1[2], out1[3], out1[6], out1[7]);
     904           0 :     out[0] = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
     905           0 :     out[1] = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
     906           0 :     out[2] = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
     907           0 :     out[3] = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
     908           0 :     out[4] = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
     909           0 :     out[5] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
     910           0 :     out[6] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
     911           0 :     out[7] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
     912           0 : }
     913             : 
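/*
 * A hedged reference sketch (an addition, not part of the original file) of
 * the behaviour TRANSPOSE_4X4_AVX2 and transpose_8x8_avx2() implement.  Each
 * ymm register holds one 8-element row of 32-bit values; the unpack steps
 * transpose 4x4 tiles within each 128-bit lane, and
 * _mm256_permute2x128_si256() (imm 0x20 = both low lanes, 0x31 = both high
 * lanes) stitches the tiles back into full transposed rows.
 */
static void transpose_8x8_ref(const int32_t src[8][8], int32_t dst[8][8]) {
    for (int32_t r = 0; r < 8; ++r)
        for (int32_t c = 0; c < 8; ++c)
            dst[c][r] = src[r][c]; /* plain scalar transpose for comparison */
}
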
     914           0 : static INLINE void transpose_16x16_avx2(const __m256i *in, __m256i *out) {
     915             :     __m256i temp[32];
     916           0 :     TRANSPOSE_4X4_AVX2(
     917             :         in[0], in[2], in[4], in[6], temp[0], temp[2], temp[4], temp[6]);
     918           0 :     TRANSPOSE_4X4_AVX2(
     919             :         in[8], in[10], in[12], in[14], temp[17], temp[19], temp[21], temp[23]);
     920           0 :     TRANSPOSE_4X4_AVX2(
     921             :         in[1], in[3], in[5], in[7], temp[16], temp[18], temp[20], temp[22]);
     922           0 :     TRANSPOSE_4X4_AVX2(
     923             :         in[9], in[11], in[13], in[15], temp[25], temp[27], temp[29], temp[31]);
     924           0 :     TRANSPOSE_4X4_AVX2(
     925             :         in[16], in[18], in[20], in[22], temp[1], temp[3], temp[5], temp[7]);
     926           0 :     TRANSPOSE_4X4_AVX2(
     927             :         in[24], in[26], in[28], in[30], temp[9], temp[11], temp[13], temp[15]);
     928           0 :     TRANSPOSE_4X4_AVX2(
     929             :         in[17], in[19], in[21], in[23], temp[8], temp[10], temp[12], temp[14]);
     930           0 :     TRANSPOSE_4X4_AVX2(
     931             :         in[25], in[27], in[29], in[31], temp[24], temp[26], temp[28], temp[30]);
     932             : 
     933           0 :     out[0] = _mm256_permute2x128_si256(temp[0], temp[17], 0x20);
     934           0 :     out[1] = _mm256_permute2x128_si256(temp[1], temp[9], 0x20);
     935           0 :     out[2] = _mm256_permute2x128_si256(temp[2], temp[19], 0x20);
     936           0 :     out[3] = _mm256_permute2x128_si256(temp[3], temp[11], 0x20);
     937           0 :     out[4] = _mm256_permute2x128_si256(temp[4], temp[21], 0x20);
     938           0 :     out[5] = _mm256_permute2x128_si256(temp[5], temp[13], 0x20);
     939           0 :     out[6] = _mm256_permute2x128_si256(temp[6], temp[23], 0x20);
     940           0 :     out[7] = _mm256_permute2x128_si256(temp[7], temp[15], 0x20);
     941           0 :     out[8] = _mm256_permute2x128_si256(temp[0], temp[17], 0x31);
     942           0 :     out[9] = _mm256_permute2x128_si256(temp[1], temp[9], 0x31);
     943           0 :     out[10] = _mm256_permute2x128_si256(temp[2], temp[19], 0x31);
     944           0 :     out[11] = _mm256_permute2x128_si256(temp[3], temp[11], 0x31);
     945           0 :     out[12] = _mm256_permute2x128_si256(temp[4], temp[21], 0x31);
     946           0 :     out[13] = _mm256_permute2x128_si256(temp[5], temp[13], 0x31);
     947           0 :     out[14] = _mm256_permute2x128_si256(temp[6], temp[23], 0x31);
     948           0 :     out[15] = _mm256_permute2x128_si256(temp[7], temp[15], 0x31);
     949           0 :     out[16] = _mm256_permute2x128_si256(temp[16], temp[25], 0x20);
     950           0 :     out[17] = _mm256_permute2x128_si256(temp[8], temp[24], 0x20);
     951           0 :     out[18] = _mm256_permute2x128_si256(temp[18], temp[27], 0x20);
     952           0 :     out[19] = _mm256_permute2x128_si256(temp[10], temp[26], 0x20);
     953           0 :     out[20] = _mm256_permute2x128_si256(temp[20], temp[29], 0x20);
     954           0 :     out[21] = _mm256_permute2x128_si256(temp[12], temp[28], 0x20);
     955           0 :     out[22] = _mm256_permute2x128_si256(temp[22], temp[31], 0x20);
     956           0 :     out[23] = _mm256_permute2x128_si256(temp[14], temp[30], 0x20);
     957           0 :     out[24] = _mm256_permute2x128_si256(temp[16], temp[25], 0x31);
     958           0 :     out[25] = _mm256_permute2x128_si256(temp[8], temp[24], 0x31);
     959           0 :     out[26] = _mm256_permute2x128_si256(temp[18], temp[27], 0x31);
     960           0 :     out[27] = _mm256_permute2x128_si256(temp[10], temp[26], 0x31);
     961           0 :     out[28] = _mm256_permute2x128_si256(temp[20], temp[29], 0x31);
     962           0 :     out[29] = _mm256_permute2x128_si256(temp[12], temp[28], 0x31);
     963           0 :     out[30] = _mm256_permute2x128_si256(temp[22], temp[31], 0x31);
     964           0 :     out[31] = _mm256_permute2x128_si256(temp[14], temp[30], 0x31);
     965           0 : }
     966             : 
     967           0 : static void load_buffer_8x8(const int32_t *coeff, __m256i *in) {
     968             :     int32_t i;
     969           0 :     for (i = 0; i < 8; ++i) {
     970           0 :         in[i] = _mm256_loadu_si256((const __m256i *)coeff);
     971           0 :         coeff += 8;
     972             :     }
     973           0 : }
     974             : 
     975           0 : static INLINE void write_buffer_8x8(__m256i *in,
     976             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
     977             :     int32_t fliplr, int32_t flipud, int32_t bd) {
     978             :     __m256i u0, x0, x1, v0, v1;
     979           0 :     const __m256i zero = _mm256_setzero_si256();
     980           0 :     int32_t i = 0;
     981           0 :     int32_t step = 1;
     982             : 
     983           0 :     if (flipud) {
     984           0 :         i = 7;
     985           0 :         step = -1;
     986             :     }
     987             : 
     988           0 :     while (i < 8 && i > -1) {
     989           0 :         u0 = _mm256_inserti128_si256(_mm256_castsi128_si256(
     990             :             _mm_loadu_si128((__m128i*)(output_r))),
     991             :             _mm_loadu_si128((__m128i*)(output_r + stride_r)), 1);
     992             : 
      993             :         // Swap 64-bit words from ABCD to ACBD
     994           0 :         u0 = _mm256_permute4x64_epi64(u0, 0xD8);
     995             : 
      996             :         // Unpack and swap 128-bit lanes from ACBD to ABCD
     997           0 :         x0 = _mm256_unpacklo_epi16(u0, zero);
     998           0 :         x1 = _mm256_unpackhi_epi16(u0, zero);
     999             : 
    1000           0 :         if (fliplr) {
    1001           0 :             v0 = _mm256_permute4x64_epi64(in[i], 0x1B);
    1002           0 :             v0 = _mm256_shuffle_epi32(v0, 0xB1);
    1003           0 :             v0 = _mm256_add_epi32(v0, x0);
    1004           0 :             i += step;
    1005           0 :             v1 = _mm256_permute4x64_epi64(in[i], 0x1B);
    1006           0 :             v1 = _mm256_shuffle_epi32(v1, 0xB1);
    1007           0 :             v1 = _mm256_add_epi32(v1, x1);
    1008           0 :             i += step;
    1009             :         }
    1010             :         else {
    1011           0 :             v0 = _mm256_add_epi32(in[i], x0);
    1012           0 :             i += step;
    1013           0 :             v1 = _mm256_add_epi32(in[i], x1);
    1014           0 :             i += step;
    1015             :         }
    1016             : 
    1017           0 :         highbd_clamp_epi32(&v0, bd);
    1018           0 :         highbd_clamp_epi32(&v1, bd);
    1019             : 
     1020             :         // Pack and swap 128-bit lanes from ABCD to ACBD
    1021           0 :         v0 = _mm256_packus_epi32(v0, v1);
     1022             :         // Swap 64-bit words from ACBD to ABCD
    1023           0 :         v0 = _mm256_permute4x64_epi64(v0, 0xD8);
    1024             : 
    1025           0 :         _mm_storeu_si128((__m128i *)output_w, _mm256_castsi256_si128(v0));
    1026           0 :         _mm_storeu_si128((__m128i *)(output_w + stride_w),
    1027           0 :             _mm256_extractf128_si256(v0, 0x1));
    1028             : 
    1029           0 :         output_r += 2 * stride_r;
    1030           0 :         output_w += 2 * stride_w;
    1031             :     }
    1032           0 : }
    1033             : 
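/*
 * A hedged scalar sketch of the reconstruction step performed by
 * write_buffer_8x8() (the flip paths are omitted here): read the prediction
 * from output_r, add the inverse-transform residual, clamp to the bd-bit
 * range, and store the result to output_w.
 */
static void write_buffer_8x8_ref(const int32_t *res, const uint16_t *output_r,
    int32_t stride_r, uint16_t *output_w, int32_t stride_w, int32_t bd) {
    const int32_t peak = (1 << bd) - 1;
    for (int32_t r = 0; r < 8; ++r) {
        for (int32_t c = 0; c < 8; ++c) {
            int32_t v = (int32_t)output_r[r * stride_r + c] + res[r * 8 + c];
            v = v < 0 ? 0 : (v > peak ? peak : v); /* high bit-depth clamp */
            output_w[r * stride_w + c] = (uint16_t)v;
        }
    }
}
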
    1034           0 : static INLINE void round_shift_8x8(__m256i *in, int32_t shift) {
    1035           0 :     __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
    1036           0 :     int32_t i = 0;
    1037             : 
    1038           0 :     while (i < 8) {
    1039           0 :         in[i] = _mm256_add_epi32(in[i], rnding);
    1040           0 :         in[i] = _mm256_srai_epi32(in[i], shift);
    1041           0 :         i++;
    1042             :     }
    1043           0 : }
    1044             : 
    1045           0 : static INLINE void round_shift_8x8_double(__m256i *in, int32_t first, int32_t second) {
    1046           0 :     __m256i rnding = _mm256_set1_epi32(
    1047           0 :         (1 << (first - 1)) + (1 << (first + second - 1)));
    1048           0 :     int32_t i = 0;
    1049             : 
    1050           0 :     while (i < 8) {
    1051           0 :         in[i] = _mm256_add_epi32(in[i], rnding);
    1052           0 :         in[i] = _mm256_srai_epi32(in[i], first + second);
    1053           0 :         i++;
    1054             :     }
    1055           0 : }
    1056             : 
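/*
 * A hedged worked example for round_shift_8x8_double(): two successive round
 * shifts, first by `first` bits and then by `second` bits, fold into a single
 * shift because
 *   round_shift(round_shift(x, a), b)
 *     == (x + (1 << (a - 1)) + (1 << (a + b - 1))) >> (a + b);
 * adding an integer k before a floor division by 2^b equals adding k << b
 * beforehand, and nested floor divisions by powers of two compose.
 */
static int32_t round_shift_twice_ref(int32_t x, int32_t a, int32_t b) {
    const int32_t offset = (1 << (a - 1)) + (1 << (a + b - 1));
    return (x + offset) >> (a + b); /* same arithmetic as the loop above */
}
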
    1057           0 : static INLINE void idct8_col_avx2(__m256i *in, __m256i *out, int32_t bit) {
    1058           0 :     const int32_t *cospi = cospi_arr(bit);
    1059           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    1060           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    1061           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    1062           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    1063           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    1064           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    1065           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    1066           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    1067           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    1068           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    1069           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    1070           0 :     const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
    1071             :     __m256i tmp[8], tmp2[8];
    1072             : 
    1073             :     //stage 1
    1074             : 
    1075             :     //stage 2
    1076           0 :     tmp[4] = half_btf_avx2(&cospi56, &in[1], &cospim8, &in[7], &rounding, bit);
    1077           0 :     tmp[5] = half_btf_avx2(&cospi24, &in[5], &cospim40, &in[3], &rounding, bit);
    1078           0 :     tmp[6] = half_btf_avx2(&cospi40, &in[5], &cospi24, &in[3], &rounding, bit);
    1079           0 :     tmp[7] = half_btf_avx2(&cospi8, &in[1], &cospi56, &in[7], &rounding, bit);
    1080             : 
    1081             :     //stage 3
    1082           0 :     tmp2[0] = half_btf_avx2(&cospi32, &in[0], &cospi32, &in[4], &rounding, bit);
    1083           0 :     tmp2[1] = half_btf_avx2(&cospi32, &in[0], &cospim32, &in[4], &rounding, bit);
    1084           0 :     tmp2[2] = half_btf_avx2(&cospi48, &in[2], &cospim16, &in[6], &rounding, bit);
    1085           0 :     tmp2[3] = half_btf_avx2(&cospi16, &in[2], &cospi48, &in[6], &rounding, bit);
    1086           0 :     tmp2[4] = _mm256_add_epi32(tmp[4], tmp[5]);
    1087           0 :     tmp2[5] = _mm256_sub_epi32(tmp[4], tmp[5]);
    1088           0 :     tmp2[6] = _mm256_sub_epi32(tmp[7], tmp[6]);
    1089           0 :     tmp2[7] = _mm256_add_epi32(tmp[6], tmp[7]);
    1090             : 
    1091             :     //stage 4
    1092           0 :     tmp[0] = _mm256_add_epi32(tmp2[0], tmp2[3]);
    1093           0 :     tmp[1] = _mm256_add_epi32(tmp2[1], tmp2[2]);
    1094           0 :     tmp[2] = _mm256_sub_epi32(tmp2[1], tmp2[2]);
    1095           0 :     tmp[3] = _mm256_sub_epi32(tmp2[0], tmp2[3]);
    1096           0 :     tmp[5] = half_btf_avx2(&cospim32, &tmp2[5], &cospi32, &tmp2[6],
    1097             :         &rounding, bit);
    1098           0 :     tmp[6] = half_btf_avx2(&cospi32, &tmp2[5], &cospi32, &tmp2[6],
    1099             :         &rounding, bit);
    1100             : 
    1101             :     //stage 5
    1102           0 :     out[0] = _mm256_add_epi32(tmp[0], tmp2[7]);
    1103           0 :     out[1] = _mm256_add_epi32(tmp[1], tmp[6]);
    1104           0 :     out[2] = _mm256_add_epi32(tmp[2], tmp[5]);
    1105           0 :     out[3] = _mm256_add_epi32(tmp[3], tmp2[4]);
    1106           0 :     out[4] = _mm256_sub_epi32(tmp[3], tmp2[4]);
    1107           0 :     out[5] = _mm256_sub_epi32(tmp[2], tmp[5]);
    1108           0 :     out[6] = _mm256_sub_epi32(tmp[1], tmp[6]);
    1109           0 :     out[7] = _mm256_sub_epi32(tmp[0], tmp2[7]);
    1110           0 : }
    1111             : 
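/*
 * A hedged scalar counterpart of half_btf_avx2(), the butterfly used by the
 * idct/iadst kernels in this file: out = round((w0 * in0 + w1 * in1) / 2^bit),
 * with weights taken from the cospi table.  The product is widened to 64 bits
 * here for clarity; the vector code keeps it in 32 bits and relies on the AV1
 * intermediate value ranges to avoid overflow.
 */
static int32_t half_btf_ref(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
    int32_t bit) {
    const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
    return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
}
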
    1112           0 : static INLINE void iadst8_col_avx2(__m256i *in, __m256i *out, int8_t cos_bit) {
    1113           0 :     const int32_t *cospi = cospi_arr(cos_bit);
    1114           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    1115           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    1116           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    1117           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    1118           0 :     const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
    1119           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    1120           0 :     const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
    1121           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    1122           0 :     const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
    1123           0 :     const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
    1124           0 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
    1125           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    1126           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    1127           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    1128           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    1129           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    1130           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    1131           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    1132           0 :     const __m256i negative = _mm256_set1_epi32(-1);
    1133           0 :     const __m256i rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
    1134             :     __m256i tmp[8], tmp2[4];
    1135             : 
    1136             :     //stage 1
    1137             :     //stage 2
    1138           0 :     tmp[0] =
    1139           0 :         half_btf_avx2(&cospi4, &in[7], &cospi60, &in[0], &rounding, cos_bit);
    1140           0 :     tmp[1] =
    1141           0 :         half_btf_avx2(&cospi60, &in[7], &cospim4, &in[0], &rounding, cos_bit);
    1142           0 :     tmp[2] =
    1143           0 :         half_btf_avx2(&cospi20, &in[5], &cospi44, &in[2], &rounding, cos_bit);
    1144           0 :     tmp[3] =
    1145           0 :         half_btf_avx2(&cospi44, &in[5], &cospim20, &in[2], &rounding, cos_bit);
    1146           0 :     tmp[4] =
    1147           0 :         half_btf_avx2(&cospi36, &in[3], &cospi28, &in[4], &rounding, cos_bit);
    1148           0 :     tmp[5] =
    1149           0 :         half_btf_avx2(&cospi28, &in[3], &cospim36, &in[4], &rounding, cos_bit);
    1150           0 :     tmp[6] =
    1151           0 :         half_btf_avx2(&cospi52, &in[1], &cospi12, &in[6], &rounding, cos_bit);
    1152           0 :     tmp[7] =
    1153           0 :         half_btf_avx2(&cospi12, &in[1], &cospim52, &in[6], &rounding, cos_bit);
    1154             : 
    1155             :     //stage 3
    1156           0 :     out[7] = _mm256_add_epi32(tmp[0], tmp[4]);
    1157           0 :     out[1] = _mm256_add_epi32(tmp[1], tmp[5]);
    1158           0 :     out[2] = _mm256_add_epi32(tmp[2], tmp[6]);
    1159           0 :     out[3] = _mm256_add_epi32(tmp[3], tmp[7]);
    1160           0 :     tmp2[0] = _mm256_sub_epi32(tmp[0], tmp[4]);
    1161           0 :     tmp2[1] = _mm256_sub_epi32(tmp[1], tmp[5]);
    1162           0 :     tmp2[2] = _mm256_sub_epi32(tmp[2], tmp[6]);
    1163           0 :     tmp2[3] = _mm256_sub_epi32(tmp[3], tmp[7]);
    1164             : 
    1165             :     //stage 4
    1166           0 :     tmp[4] = half_btf_avx2(
    1167             :         &cospi16, &tmp2[0], &cospi48, &tmp2[1], &rounding, cos_bit);
    1168           0 :     tmp[5] = half_btf_avx2(
    1169             :         &cospi48, &tmp2[0], &cospim16, &tmp2[1], &rounding, cos_bit);
    1170           0 :     tmp[6] = half_btf_avx2(
    1171             :         &cospim48, &tmp2[2], &cospi16, &tmp2[3], &rounding, cos_bit);
    1172           0 :     tmp[7] = half_btf_avx2(
    1173             :         &cospi16, &tmp2[2], &cospi48, &tmp2[3], &rounding, cos_bit);
    1174             : 
    1175             :     //stage 5
    1176           0 :     out[0] = _mm256_add_epi32(out[7], out[2]);
    1177           0 :     tmp[1] = _mm256_add_epi32(out[1], out[3]);
    1178           0 :     tmp2[0] = _mm256_sub_epi32(out[7], out[2]);
    1179           0 :     tmp2[1] = _mm256_sub_epi32(out[1], out[3]);
    1180           0 :     out[1] = _mm256_add_epi32(tmp[4], tmp[6]);
    1181           0 :     out[6] = _mm256_add_epi32(tmp[5], tmp[7]);
    1182           0 :     tmp2[2] = _mm256_sub_epi32(tmp[4], tmp[6]);
    1183           0 :     tmp2[3] = _mm256_sub_epi32(tmp[5], tmp[7]);
    1184             : 
    1185             :     //stage 6
    1186           0 :     tmp[2] = half_btf_avx2(
    1187             :         &cospi32, &tmp2[0], &cospi32, &tmp2[1], &rounding, cos_bit);
    1188           0 :     out[4] = half_btf_avx2(
    1189             :         &cospi32, &tmp2[0], &cospim32, &tmp2[1], &rounding, cos_bit);
    1190           0 :     out[2] = half_btf_avx2(
    1191             :         &cospi32, &tmp2[2], &cospi32, &tmp2[3], &rounding, cos_bit);
    1192           0 :     tmp[7] = half_btf_avx2(
    1193             :         &cospi32, &tmp2[2], &cospim32, &tmp2[3], &rounding, cos_bit);
    1194             : 
    1195             :     //stage 7
    1196           0 :     out[1] = _mm256_sign_epi32(out[1], negative);
    1197           0 :     out[3] = _mm256_sign_epi32(tmp[2], negative);
    1198           0 :     out[5] = _mm256_sign_epi32(tmp[7], negative);
    1199           0 :     out[7] = _mm256_sign_epi32(tmp[1], negative);
    1200           0 : }
    1201             : 
    1202           0 : void eb_av1_inv_txfm2d_add_8x8_avx2(const int32_t *input,
    1203             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    1204             :     TxType tx_type, int32_t bd) {
    1205             :     __m256i in[8], out[8];
    1206           0 :     const int8_t *shift = eb_inv_txfm_shift_ls[TX_8X8];
    1207           0 :     const int32_t txw_idx = get_txw_idx(TX_8X8);
    1208           0 :     const int32_t txh_idx = get_txh_idx(TX_8X8);
    1209             : 
    1210           0 :     switch (tx_type) {
    1211           0 :     case IDTX:
    1212           0 :         load_buffer_8x8(input, in);
     1213             :         // These operations can be merged without losing precision (worked example after this function):
     1214             :         // eb_av1_iidentity8_c() shifts left by 1 bit,
     1215             :         // round_shift_8x8(, -shift[0]) shifts right by 1 bit,
     1216             :         // eb_av1_iidentity8_c() shifts left by 1 bit again, and
     1217             :         // round_shift_8x8(, -shift[1]) shifts right by 4 bits with rounding.
    1218           0 :         round_shift_8x8(in, -shift[0] - shift[1] - 2);
    1219           0 :         write_buffer_8x8(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
    1220           0 :         break;
    1221           0 :     case V_DCT:
    1222           0 :         load_buffer_8x8(input, in);
     1223             :         // eb_av1_iidentity8_c() shifts left by 1 bit and
     1224             :         // round_shift_8x8(, -shift[0]) shifts right by 1 bit, so the two cancel and are skipped.
    1225           0 :         idct8_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx]);
    1226           0 :         round_shift_8x8(out, -shift[1]);
    1227           0 :         write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 0, 0, bd);
    1228           0 :         break;
    1229           0 :     case H_DCT:
    1230           0 :         load_buffer_8x8(input, in);
    1231           0 :         transpose_8x8_avx2(in, out);
    1232           0 :         idct8_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
    1233           0 :         transpose_8x8_avx2(in, out);
     1234             :         // eb_av1_iidentity8_c() shifts left by 1 bit and
     1235             :         // round_shift_8x8(, -shift[1]) shifts right by 4 bits with rounding; both are merged into round_shift_8x8_double() below.
    1236           0 :         round_shift_8x8_double(out, -shift[0], -shift[1] - 1);
    1237           0 :         write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 0, 0, bd);
    1238           0 :         break;
    1239           0 :     case V_ADST:
    1240           0 :         load_buffer_8x8(input, in);
     1241             :         // eb_av1_iidentity8_c() shifts left by 1 bit and
     1242             :         // round_shift_8x8(, -shift[0]) shifts right by 1 bit, so the two cancel and are skipped.
    1243           0 :         iadst8_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx]);
    1244           0 :         round_shift_8x8(out, -shift[1]);
    1245           0 :         write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 0, 0, bd);
    1246           0 :         break;
    1247           0 :     case H_ADST:
    1248           0 :         load_buffer_8x8(input, in);
    1249           0 :         transpose_8x8_avx2(in, out);
    1250           0 :         iadst8_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
    1251           0 :         transpose_8x8_avx2(in, out);
     1252             :         // eb_av1_iidentity8_c() shifts left by 1 bit and
     1253             :         // round_shift_8x8(, -shift[1]) shifts right by 4 bits with rounding; both are merged into round_shift_8x8_double() below.
    1254           0 :         round_shift_8x8_double(out, -shift[0], -shift[1] - 1);
    1255           0 :         write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 0, 0, bd);
    1256           0 :         break;
    1257           0 :     case V_FLIPADST:
    1258           0 :         load_buffer_8x8(input, in);
     1259             :         // eb_av1_iidentity8_c() shifts left by 1 bit and
     1260             :         // round_shift_8x8(, -shift[0]) shifts right by 1 bit, so the two cancel and are skipped.
    1261           0 :         iadst8_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx]);
    1262           0 :         round_shift_8x8(out, -shift[1]);
    1263           0 :         write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 0, 1, bd);
    1264           0 :         break;
    1265           0 :     case H_FLIPADST:
    1266           0 :         load_buffer_8x8(input, in);
    1267           0 :         transpose_8x8_avx2(in, out);
    1268           0 :         iadst8_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
    1269           0 :         transpose_8x8_avx2(in, out);
     1270             :         // eb_av1_iidentity8_c() shifts left by 1 bit and
     1271             :         // round_shift_8x8(, -shift[1]) shifts right by 4 bits with rounding; both are merged into round_shift_8x8_double() below.
    1272           0 :         round_shift_8x8_double(out, -shift[0], -shift[1] - 1);
    1273           0 :         write_buffer_8x8(out, output_r, stride_r, output_w, stride_w, 1, 0, bd);
    1274           0 :         break;
    1275           0 :     default:
    1276           0 :         eb_av1_inv_txfm2d_add_8x8_sse4_1(input,
    1277             :             output_r, stride_r, output_w, stride_w, tx_type, bd);
    1278           0 :         break;
    1279             :     }
    1280           0 : }
    1281             : 
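/*
 * A hedged worked example for the merged shifts in the 8x8 IDTX path above,
 * assuming eb_inv_txfm_shift_ls[TX_8X8] = {-1, -4} (the values implied by the
 * comments in the switch): each eb_av1_iidentity8_c() pass doubles the
 * coefficients, so the sequence  << 1, round_shift(1), << 1, round_shift(4)
 * collapses into a single round shift by 1 + 4 - 2 = 3 bits, which is exactly
 * the round_shift_8x8(in, -shift[0] - shift[1] - 2) call.  The same idea,
 * with one doubling instead of two, explains the "- 1" in the
 * round_shift_8x8_double(out, -shift[0], -shift[1] - 1) calls of the H_* cases.
 */
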
    1282           0 : static void load_buffer_16x16(const int32_t *coeff, __m256i *in) {
    1283             :     int32_t i;
    1284           0 :     for (i = 0; i < 32; ++i) {
    1285           0 :         in[i] = _mm256_loadu_si256((const __m256i *)coeff);
    1286           0 :         coeff += 8;
    1287             :     }
    1288           0 : }
    1289             : 
    1290           0 : static INLINE void write_buffer_16x16(__m256i *in,
    1291             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    1292             :     int32_t fliplr, int32_t flipud, int32_t bd) {
    1293             :     __m256i u0, x0, x1, v0, v1;
    1294           0 :     const __m256i zero = _mm256_setzero_si256();
    1295           0 :     int32_t i = 0;
    1296             : 
    1297           0 :     if (flipud) {
    1298           0 :         output_r += stride_r * 15;
    1299           0 :         stride_r = -stride_r;
    1300           0 :         output_w += stride_w * 15;
    1301           0 :         stride_w = -stride_w;
    1302             :     }
    1303             : 
    1304           0 :     while (i < 32) {
    1305           0 :         u0 = _mm256_loadu_si256((const __m256i *)output_r);
    1306             : 
    1307           0 :         x0 = _mm256_unpacklo_epi16(u0, zero);
    1308           0 :         x1 = _mm256_unpackhi_epi16(u0, zero);
    1309             : 
    1310           0 :         if (fliplr) {
    1311           0 :             v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x13);
    1312           0 :             v0 = _mm256_shuffle_epi32(v0, 0x1B);
    1313           0 :             v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x02);
    1314           0 :             v1 = _mm256_shuffle_epi32(v1, 0x1B);
    1315             :         }
    1316             :         else {
    1317           0 :             v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20);
    1318           0 :             v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31);
    1319             :         }
    1320             : 
    1321           0 :         v0 = _mm256_add_epi32(v0, x0);
    1322           0 :         v1 = _mm256_add_epi32(v1, x1);
    1323           0 :         highbd_clamp_epi32(&v0, bd);
    1324           0 :         highbd_clamp_epi32(&v1, bd);
    1325             : 
    1326           0 :         v0 = _mm256_packus_epi32(v0, v1);
    1327             : 
    1328           0 :         _mm256_storeu_si256((__m256i *)output_w, v0);
    1329             : 
    1330           0 :         output_r += stride_r;
    1331           0 :         output_w += stride_w;
    1332           0 :         i += 2;
    1333             :     }
    1334           0 : }
    1335             : 
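/*
 * A hedged note on write_buffer_16x16() above: a vertical flip is handled by
 * starting at the last row and negating both strides, while a horizontal flip
 * reads the two 8-coefficient halves of each row in reverse order
 * (_mm256_permute2f128_si256) and reverses the 32-bit elements within each
 * 128-bit lane (_mm256_shuffle_epi32 with 0x1B) before the add-and-clamp step.
 */
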
    1336           0 : static INLINE void round_shift_16x16(__m256i *in, int32_t shift) {
    1337           0 :     __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
    1338           0 :     int32_t i = 0;
    1339             : 
    1340           0 :     while (i < 32) {
    1341           0 :         in[i] = _mm256_add_epi32(in[i], rnding);
    1342           0 :         in[i] = _mm256_srai_epi32(in[i], shift);
    1343           0 :         i++;
    1344             :     }
    1345           0 : }
    1346             : 
    1347           0 : static INLINE void iidentity16_and_round_shift_avx2(__m256i *input, int32_t shift)
    1348             : {
     1349             :     // The input fits in 18 bits, so it can be multiplied by NewSqrt2 in 32-bit space.
     1350             :     // Multiply by NewSqrt2 (half of 2*NewSqrt2) and round_shift() by one bit less
     1351             :     // (NewSqrt2Bits - 1) to get the same scale factor, then merge that
     1352             :     // round_shift(NewSqrt2Bits - 1) with the following round_shift(shift) in one pass.
    1353           0 :     const __m256i scalar = _mm256_set1_epi32(NewSqrt2);
    1354           0 :     const __m256i rnding = _mm256_set1_epi32((1 << (NewSqrt2Bits - 2)) +
    1355           0 :         (!!(shift) << (shift + NewSqrt2Bits - 2)));
    1356             : 
    1357           0 :     for (int32_t i = 0; i < 32; i++) {
    1358           0 :         input[i] = _mm256_mullo_epi32(input[i], scalar);
    1359           0 :         input[i] = _mm256_add_epi32(input[i], rnding);
    1360           0 :         input[i] = _mm256_srai_epi32(input[i], NewSqrt2Bits - 1 + shift);
    1361             :     }
    1362           0 : }
    1363             : 
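/*
 * A hedged scalar sketch of iidentity16_and_round_shift_avx2(): the 16-point
 * identity transform scales by 2*sqrt(2).  Assuming the usual AV1 constants
 * NewSqrt2 = 5793 and NewSqrt2Bits = 12 (i.e. NewSqrt2 ~= sqrt(2) * 2^12),
 * multiplying by NewSqrt2 and shifting right by (NewSqrt2Bits - 1) yields the
 * same factor, and the trailing round_shift(shift) is folded into that shift.
 */
static int32_t iidentity16_round_shift_ref(int32_t x, int32_t shift) {
    const int32_t kNewSqrt2Bits = 12; /* assumption: matches NewSqrt2Bits */
    const int64_t kNewSqrt2 = 5793;   /* assumption: matches NewSqrt2 */
    const int64_t offset = ((int64_t)1 << (kNewSqrt2Bits - 2)) +
        (shift ? (int64_t)1 << (shift + kNewSqrt2Bits - 2) : 0);
    return (int32_t)(((int64_t)x * kNewSqrt2 + offset) >>
        (kNewSqrt2Bits - 1 + shift));
}
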
    1364           0 : static INLINE void idct16_col_avx2(__m256i *in, __m256i *out, int32_t bit,
    1365             :     const int8_t *shift) {
    1366             :     (void) shift;
    1367           0 :     const int32_t *cospi = cospi_arr(bit);
    1368           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    1369           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    1370           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    1371           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    1372           0 :     const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
    1373           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    1374           0 :     const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
    1375           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    1376           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    1377           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    1378           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    1379           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    1380           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    1381           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    1382           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    1383           0 :     const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
    1384           0 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
    1385           0 :     const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
    1386           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    1387           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    1388           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    1389           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    1390           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    1391           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    1392           0 :     const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
    1393             :     __m256i tmp[16], tmp2[16];
    1394           0 :     int32_t col = 0;
    1395             : 
    1396           0 :     for (col = 0; col < 2; ++col) {
    1397             :         //stage 1
    1398             : 
    1399             :         //stage 2
    1400           0 :         tmp[8] = half_btf_avx2(&cospi60, &in[1 * 2 + col],
    1401           0 :             &cospim4, &in[15 * 2 + col], &rounding, bit);
    1402           0 :         tmp[9] = half_btf_avx2(&cospi28, &in[9 * 2 + col],
    1403           0 :             &cospim36, &in[7 * 2 + col], &rounding, bit);
    1404           0 :         tmp[10] = half_btf_avx2(&cospi44, &in[5 * 2 + col],
    1405           0 :             &cospim20, &in[11 * 2 + col], &rounding, bit);
    1406           0 :         tmp[11] = half_btf_avx2(&cospi12, &in[13 * 2 + col],
    1407           0 :             &cospim52, &in[3 * 2 + col], &rounding, bit);
    1408           0 :         tmp[12] = half_btf_avx2(&cospi52, &in[13 * 2 + col],
    1409           0 :             &cospi12, &in[3 * 2 + col], &rounding, bit);
    1410           0 :         tmp[13] = half_btf_avx2(&cospi20, &in[5 * 2 + col],
    1411           0 :             &cospi44, &in[11 * 2 + col], &rounding, bit);
    1412           0 :         tmp[14] = half_btf_avx2(&cospi36, &in[9 * 2 + col],
    1413           0 :             &cospi28, &in[7 * 2 + col], &rounding, bit);
    1414           0 :         tmp[15] = half_btf_avx2(&cospi4, &in[1 * 2 + col],
    1415           0 :             &cospi60, &in[15 * 2 + col], &rounding, bit);
    1416             : 
    1417             :         //stage 3
    1418           0 :         tmp2[0] = half_btf_avx2(&cospi56, &in[2 * 2 + col],
    1419           0 :             &cospim8, &in[14 * 2 + col], &rounding, bit);
    1420           0 :         tmp2[1] = half_btf_avx2(&cospi24, &in[10 * 2 + col],
    1421           0 :             &cospim40, &in[6 * 2 + col], &rounding, bit);
    1422           0 :         tmp2[2] = half_btf_avx2(&cospi40, &in[10 * 2 + col],
    1423           0 :             &cospi24, &in[6 * 2 + col], &rounding, bit);
    1424           0 :         tmp2[3] = half_btf_avx2(&cospi8, &in[2 * 2 + col],
    1425           0 :             &cospi56, &in[14 * 2 + col], &rounding, bit);
    1426           0 :         tmp2[4] = _mm256_add_epi32(tmp[8], tmp[9]);
    1427           0 :         tmp2[5] = _mm256_sub_epi32(tmp[8], tmp[9]);
    1428           0 :         tmp2[6] = _mm256_sub_epi32(tmp[11], tmp[10]);
    1429           0 :         tmp2[7] = _mm256_add_epi32(tmp[10], tmp[11]);
    1430           0 :         tmp2[8] = _mm256_add_epi32(tmp[12], tmp[13]);
    1431           0 :         tmp2[9] = _mm256_sub_epi32(tmp[12], tmp[13]);
    1432           0 :         tmp2[10] = _mm256_sub_epi32(tmp[15], tmp[14]);
    1433           0 :         tmp2[11] = _mm256_add_epi32(tmp[14], tmp[15]);
    1434             : 
    1435             :         //stage 4
    1436           0 :         tmp[0] = half_btf_avx2(&cospi32, &in[0 * 2 + col],
    1437           0 :             &cospi32, &in[8 * 2 + col], &rounding, bit);
    1438           0 :         tmp[1] = half_btf_avx2(&cospi32, &in[0 * 2 + col],
    1439           0 :             &cospim32, &in[8 * 2 + col], &rounding, bit);
    1440           0 :         tmp[2] = half_btf_avx2(&cospi48, &in[4 * 2 + col],
    1441           0 :             &cospim16, &in[12 * 2 + col], &rounding, bit);
    1442           0 :         tmp[3] = half_btf_avx2(&cospi16, &in[4 * 2 + col],
    1443           0 :             &cospi48, &in[12 * 2 + col], &rounding, bit);
    1444           0 :         tmp[4] = _mm256_add_epi32(tmp2[0], tmp2[1]);
    1445           0 :         tmp[5] = _mm256_sub_epi32(tmp2[0], tmp2[1]);
    1446           0 :         tmp[6] = _mm256_sub_epi32(tmp2[3], tmp2[2]);
    1447           0 :         tmp[7] = _mm256_add_epi32(tmp2[2], tmp2[3]);
    1448           0 :         tmp[9] = half_btf_avx2(&cospim16, &tmp2[5],
    1449             :             &cospi48, &tmp2[10], &rounding, bit);
    1450           0 :         tmp[10] = half_btf_avx2(&cospim48, &tmp2[6],
    1451             :             &cospim16, &tmp2[9], &rounding, bit);
    1452           0 :         tmp[13] = half_btf_avx2(&cospim16, &tmp2[6],
    1453             :             &cospi48, &tmp2[9], &rounding, bit);
    1454           0 :         tmp[14] = half_btf_avx2(&cospi48, &tmp2[5],
    1455             :             &cospi16, &tmp2[10], &rounding, bit);
    1456             : 
    1457             :         //stage 5
    1458           0 :         tmp2[12] = _mm256_sub_epi32(tmp2[11], tmp2[8]);
    1459           0 :         tmp2[15] = _mm256_add_epi32(tmp2[8], tmp2[11]);
    1460           0 :         tmp2[8] = _mm256_add_epi32(tmp2[4], tmp2[7]);
    1461           0 :         tmp2[11] = _mm256_sub_epi32(tmp2[4], tmp2[7]);
    1462           0 :         tmp2[0] = _mm256_add_epi32(tmp[0], tmp[3]);
    1463           0 :         tmp2[1] = _mm256_add_epi32(tmp[1], tmp[2]);
    1464           0 :         tmp2[2] = _mm256_sub_epi32(tmp[1], tmp[2]);
    1465           0 :         tmp2[3] = _mm256_sub_epi32(tmp[0], tmp[3]);
    1466           0 :         tmp2[5] = half_btf_avx2(&cospim32, &tmp[5],
    1467             :             &cospi32, &tmp[6], &rounding, bit);
    1468           0 :         tmp2[6] = half_btf_avx2(&cospi32, &tmp[5],
    1469             :             &cospi32, &tmp[6], &rounding, bit);
    1470           0 :         tmp2[9] = _mm256_add_epi32(tmp[9], tmp[10]);
    1471           0 :         tmp2[10] = _mm256_sub_epi32(tmp[9], tmp[10]);
    1472           0 :         tmp2[13] = _mm256_sub_epi32(tmp[14], tmp[13]);
    1473           0 :         tmp2[14] = _mm256_add_epi32(tmp[13], tmp[14]);
    1474             : 
    1475             :         //stage 6
    1476           0 :         tmp[0] = _mm256_add_epi32(tmp2[0], tmp[7]);
    1477           0 :         tmp[1] = _mm256_add_epi32(tmp2[1], tmp2[6]);
    1478           0 :         tmp[2] = _mm256_add_epi32(tmp2[2], tmp2[5]);
    1479           0 :         tmp[3] = _mm256_add_epi32(tmp2[3], tmp[4]);
    1480           0 :         tmp[4] = _mm256_sub_epi32(tmp2[3], tmp[4]);
    1481           0 :         tmp[5] = _mm256_sub_epi32(tmp2[2], tmp2[5]);
    1482           0 :         tmp[6] = _mm256_sub_epi32(tmp2[1], tmp2[6]);
    1483           0 :         tmp[7] = _mm256_sub_epi32(tmp2[0], tmp[7]);
    1484           0 :         tmp[10] = half_btf_avx2(&cospim32, &tmp2[10],
    1485             :             &cospi32, &tmp2[13], &rounding, bit);
    1486           0 :         tmp[11] = half_btf_avx2(&cospim32, &tmp2[11],
    1487             :             &cospi32, &tmp2[12], &rounding, bit);
    1488           0 :         tmp[12] = half_btf_avx2(&cospi32, &tmp2[11],
    1489             :             &cospi32, &tmp2[12], &rounding, bit);
    1490           0 :         tmp[13] = half_btf_avx2(&cospi32, &tmp2[10],
    1491             :             &cospi32, &tmp2[13], &rounding, bit);
    1492             : 
    1493             :         //stage 7
    1494           0 :         out[0 * 2 + col] = _mm256_add_epi32(tmp[0], tmp2[15]);
    1495           0 :         out[1 * 2 + col] = _mm256_add_epi32(tmp[1], tmp2[14]);
    1496           0 :         out[2 * 2 + col] = _mm256_add_epi32(tmp[2], tmp[13]);
    1497           0 :         out[3 * 2 + col] = _mm256_add_epi32(tmp[3], tmp[12]);
    1498           0 :         out[4 * 2 + col] = _mm256_add_epi32(tmp[4], tmp[11]);
    1499           0 :         out[5 * 2 + col] = _mm256_add_epi32(tmp[5], tmp[10]);
    1500           0 :         out[6 * 2 + col] = _mm256_add_epi32(tmp[6], tmp2[9]);
    1501           0 :         out[7 * 2 + col] = _mm256_add_epi32(tmp[7], tmp2[8]);
    1502           0 :         out[8 * 2 + col] = _mm256_sub_epi32(tmp[7], tmp2[8]);
    1503           0 :         out[9 * 2 + col] = _mm256_sub_epi32(tmp[6], tmp2[9]);
    1504           0 :         out[10 * 2 + col] = _mm256_sub_epi32(tmp[5], tmp[10]);
    1505           0 :         out[11 * 2 + col] = _mm256_sub_epi32(tmp[4], tmp[11]);
    1506           0 :         out[12 * 2 + col] = _mm256_sub_epi32(tmp[3], tmp[12]);
    1507           0 :         out[13 * 2 + col] = _mm256_sub_epi32(tmp[2], tmp[13]);
    1508           0 :         out[14 * 2 + col] = _mm256_sub_epi32(tmp[1], tmp2[14]);
    1509           0 :         out[15 * 2 + col] = _mm256_sub_epi32(tmp[0], tmp2[15]);
    1510             :     }
    1511           0 : }
    1512             : 
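/*
 * A hedged note on the data layout used by idct16_col_avx2() and
 * iadst16_col_avx2() below: the 16x16 block of 32-bit coefficients occupies
 * 32 ymm registers, two per row, so in[row * 2 + col] holds columns 0..7 of
 * that row when col == 0 and columns 8..15 when col == 1.  The outer
 * `for (col ...)` loop therefore runs the same 1-D column transform over each
 * half of the columns.
 */
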
    1513           0 : static INLINE void iadst16_col_avx2(__m256i *in, __m256i *out,
    1514             :     int8_t cos_bit) {
    1515           0 :     const int32_t *cospi = cospi_arr(cos_bit);
    1516           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    1517           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    1518           0 :     const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
    1519           0 :     const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
    1520           0 :     const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
    1521           0 :     const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
    1522           0 :     const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
    1523           0 :     const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
    1524           0 :     const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
    1525           0 :     const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
    1526           0 :     const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
    1527           0 :     const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
    1528           0 :     const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
    1529           0 :     const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
    1530           0 :     const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
    1531           0 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    1532             : 
    1533           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    1534           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    1535           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    1536           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    1537             : 
    1538           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    1539           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    1540           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    1541             : 
    1542           0 :     const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
    1543           0 :     const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
    1544           0 :     const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
    1545           0 :     const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
    1546           0 :     const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
    1547           0 :     const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
    1548           0 :     const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
    1549           0 :     const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
    1550           0 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    1551           0 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    1552           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    1553           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    1554           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    1555           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    1556           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    1557             : 
    1558           0 :     const __m256i negative = _mm256_set1_epi32(-1);
    1559           0 :     const __m256i rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
    1560             : 
    1561             :     __m256i tmp[16], tmp2[16], tmp3[16];
    1562             : 
    1563           0 :     int32_t col = 0;
    1564             : 
    1565           0 :     for (col = 0; col < 2; ++col) {
    1566             :         //stage 1
    1567             : 
    1568             :         //stage 2
    1569           0 :         tmp[0] = half_btf_avx2(&cospi2, &in[15 * 2 + col],
    1570           0 :             &cospi62, &in[0 * 2 + col], &rounding, cos_bit);
    1571           0 :         tmp[1] = half_btf_avx2(&cospi62, &in[15 * 2 + col],
    1572           0 :             &cospim2, &in[0 * 2 + col], &rounding, cos_bit);
    1573           0 :         tmp[2] = half_btf_avx2(&cospi10, &in[13 * 2 + col],
    1574           0 :             &cospi54, &in[2 * 2 + col], &rounding, cos_bit);
    1575           0 :         tmp[3] = half_btf_avx2(&cospi54, &in[13 * 2 + col],
    1576           0 :             &cospim10, &in[2 * 2 + col], &rounding, cos_bit);
    1577           0 :         tmp[4] = half_btf_avx2(&cospi18, &in[11 * 2 + col],
    1578           0 :             &cospi46, &in[4 * 2 + col], &rounding, cos_bit);
    1579           0 :         tmp[5] = half_btf_avx2(&cospi46, &in[11 * 2 + col],
    1580           0 :             &cospim18, &in[4 * 2 + col], &rounding, cos_bit);
    1581           0 :         tmp[6] = half_btf_avx2(&cospi26, &in[9 * 2 + col],
    1582           0 :             &cospi38, &in[6 * 2 + col], &rounding, cos_bit);
    1583           0 :         tmp[7] = half_btf_avx2(&cospi38, &in[9 * 2 + col],
    1584           0 :             &cospim26, &in[6 * 2 + col], &rounding, cos_bit);
    1585           0 :         tmp[8] = half_btf_avx2(&cospi34, &in[7 * 2 + col],
    1586           0 :             &cospi30, &in[8 * 2 + col], &rounding, cos_bit);
    1587           0 :         tmp[9] = half_btf_avx2(&cospi30, &in[7 * 2 + col],
    1588           0 :             &cospim34, &in[8 * 2 + col], &rounding, cos_bit);
    1589           0 :         tmp[10] = half_btf_avx2(&cospi42, &in[5 * 2 + col],
    1590           0 :             &cospi22, &in[10 * 2 + col], &rounding, cos_bit);
    1591           0 :         tmp[11] = half_btf_avx2(&cospi22, &in[5 * 2 + col],
    1592           0 :             &cospim42, &in[10 * 2 + col], &rounding, cos_bit);
    1593           0 :         tmp[12] = half_btf_avx2(&cospi50, &in[3 * 2 + col],
    1594           0 :             &cospi14, &in[12 * 2 + col], &rounding, cos_bit);
    1595           0 :         tmp[13] = half_btf_avx2(&cospi14, &in[3 * 2 + col],
    1596           0 :             &cospim50, &in[12 * 2 + col], &rounding, cos_bit);
    1597           0 :         tmp[14] = half_btf_avx2(&cospi58, &in[1 * 2 + col],
    1598           0 :             &cospi6, &in[14 * 2 + col], &rounding, cos_bit);
    1599           0 :         tmp[15] = half_btf_avx2(&cospi6, &in[1 * 2 + col],
    1600           0 :             &cospim58, &in[14 * 2 + col], &rounding, cos_bit);
    1601             : 
    1602             :         //stage 3
    1603           0 :         tmp3[0] = _mm256_add_epi32(tmp[0], tmp[8]);
    1604           0 :         tmp3[1] = _mm256_add_epi32(tmp[1], tmp[9]);
    1605           0 :         tmp3[2] = _mm256_add_epi32(tmp[2], tmp[10]);
    1606           0 :         tmp3[3] = _mm256_add_epi32(tmp[3], tmp[11]);
    1607           0 :         tmp3[4] = _mm256_add_epi32(tmp[4], tmp[12]);
    1608           0 :         tmp3[5] = _mm256_add_epi32(tmp[5], tmp[13]);
    1609           0 :         tmp3[6] = _mm256_add_epi32(tmp[6], tmp[14]);
    1610           0 :         tmp3[7] = _mm256_add_epi32(tmp[7], tmp[15]);
    1611           0 :         tmp2[8] = _mm256_sub_epi32(tmp[0], tmp[8]);
    1612           0 :         tmp2[9] = _mm256_sub_epi32(tmp[1], tmp[9]);
    1613           0 :         tmp2[10] = _mm256_sub_epi32(tmp[2], tmp[10]);
    1614           0 :         tmp2[11] = _mm256_sub_epi32(tmp[3], tmp[11]);
    1615           0 :         tmp2[12] = _mm256_sub_epi32(tmp[4], tmp[12]);
    1616           0 :         tmp2[13] = _mm256_sub_epi32(tmp[5], tmp[13]);
    1617           0 :         tmp2[14] = _mm256_sub_epi32(tmp[6], tmp[14]);
    1618           0 :         tmp2[15] = _mm256_sub_epi32(tmp[7], tmp[15]);
    1619             : 
    1620             :         //stage 4
    1621           0 :         tmp[8] = half_btf_avx2(
    1622             :             &cospi8, &tmp2[8], &cospi56, &tmp2[9], &rounding, cos_bit);
    1623           0 :         tmp[9] = half_btf_avx2(
    1624             :             &cospi56, &tmp2[8], &cospim8, &tmp2[9], &rounding, cos_bit);
    1625           0 :         tmp[10] = half_btf_avx2(
    1626             :             &cospi40, &tmp2[10], &cospi24, &tmp2[11], &rounding, cos_bit);
    1627           0 :         tmp[11] = half_btf_avx2(
    1628             :             &cospi24, &tmp2[10], &cospim40, &tmp2[11], &rounding, cos_bit);
    1629           0 :         tmp[12] = half_btf_avx2(
    1630             :             &cospim56, &tmp2[12], &cospi8, &tmp2[13], &rounding, cos_bit);
    1631           0 :         tmp[13] = half_btf_avx2(
    1632             :             &cospi8, &tmp2[12], &cospi56, &tmp2[13], &rounding, cos_bit);
    1633           0 :         tmp[14] = half_btf_avx2(
    1634             :             &cospim24, &tmp2[14], &cospi40, &tmp2[15], &rounding, cos_bit);
    1635           0 :         tmp[15] = half_btf_avx2(
    1636             :             &cospi40, &tmp2[14], &cospi24, &tmp2[15], &rounding, cos_bit);
    1637             : 
    1638             :         //stage 5
    1639           0 :         tmp3[8] = _mm256_add_epi32(tmp3[0], tmp3[4]);
    1640           0 :         tmp3[9] = _mm256_add_epi32(tmp3[1], tmp3[5]);
    1641           0 :         tmp3[10] = _mm256_add_epi32(tmp3[2], tmp3[6]);
    1642           0 :         tmp3[11] = _mm256_add_epi32(tmp3[3], tmp3[7]);
    1643           0 :         tmp2[4] = _mm256_sub_epi32(tmp3[0], tmp3[4]);
    1644           0 :         tmp2[5] = _mm256_sub_epi32(tmp3[1], tmp3[5]);
    1645           0 :         tmp2[6] = _mm256_sub_epi32(tmp3[2], tmp3[6]);
    1646           0 :         tmp2[7] = _mm256_sub_epi32(tmp3[3], tmp3[7]);
    1647           0 :         tmp3[12] = _mm256_add_epi32(tmp[8], tmp[12]);
    1648           0 :         tmp3[13] = _mm256_add_epi32(tmp[9], tmp[13]);
    1649           0 :         tmp3[14] = _mm256_add_epi32(tmp[10], tmp[14]);
    1650           0 :         tmp3[15] = _mm256_add_epi32(tmp[11], tmp[15]);
    1651           0 :         tmp2[12] = _mm256_sub_epi32(tmp[8], tmp[12]);
    1652           0 :         tmp2[13] = _mm256_sub_epi32(tmp[9], tmp[13]);
    1653           0 :         tmp2[14] = _mm256_sub_epi32(tmp[10], tmp[14]);
    1654           0 :         tmp2[15] = _mm256_sub_epi32(tmp[11], tmp[15]);
    1655             : 
    1656             :         //stage 6
    1657           0 :         tmp[4] = half_btf_avx2(
    1658             :             &cospi16, &tmp2[4], &cospi48, &tmp2[5], &rounding, cos_bit);
    1659           0 :         tmp[5] = half_btf_avx2(
    1660             :             &cospi48, &tmp2[4], &cospim16, &tmp2[5], &rounding, cos_bit);
    1661           0 :         tmp[6] = half_btf_avx2(
    1662             :             &cospim48, &tmp2[6], &cospi16, &tmp2[7], &rounding, cos_bit);
    1663           0 :         tmp[7] = half_btf_avx2(
    1664             :             &cospi16, &tmp2[6], &cospi48, &tmp2[7], &rounding, cos_bit);
    1665           0 :         tmp[12] = half_btf_avx2(
    1666             :             &cospi16, &tmp2[12], &cospi48, &tmp2[13], &rounding, cos_bit);
    1667           0 :         tmp[13] = half_btf_avx2(
    1668             :             &cospi48, &tmp2[12], &cospim16, &tmp2[13], &rounding, cos_bit);
    1669           0 :         tmp[14] = half_btf_avx2(
    1670             :             &cospim48, &tmp2[14], &cospi16, &tmp2[15], &rounding, cos_bit);
    1671           0 :         tmp[15] = half_btf_avx2(
    1672             :             &cospi16, &tmp2[14], &cospi48, &tmp2[15], &rounding, cos_bit);
    1673             : 
    1674             :         //stage 7
    1675           0 :         out[0 * 2 + col] = _mm256_add_epi32(tmp3[8], tmp3[10]);
    1676           0 :         out[2 * 2 + col] = _mm256_add_epi32(tmp[12], tmp[14]);
    1677           0 :         out[12 * 2 + col] = _mm256_add_epi32(tmp[5], tmp[7]);
    1678           0 :         out[14 * 2 + col] = _mm256_add_epi32(tmp3[13], tmp3[15]);
    1679           0 :         tmp2[1] = _mm256_add_epi32(tmp3[9], tmp3[11]);
    1680           0 :         tmp2[2] = _mm256_sub_epi32(tmp3[8], tmp3[10]);
    1681           0 :         tmp2[3] = _mm256_sub_epi32(tmp3[9], tmp3[11]);
    1682           0 :         tmp2[4] = _mm256_add_epi32(tmp[4], tmp[6]);
    1683           0 :         tmp2[6] = _mm256_sub_epi32(tmp[4], tmp[6]);
    1684           0 :         tmp2[7] = _mm256_sub_epi32(tmp[5], tmp[7]);
    1685           0 :         tmp2[8] = _mm256_add_epi32(tmp3[12], tmp3[14]);
    1686           0 :         tmp2[10] = _mm256_sub_epi32(tmp3[12], tmp3[14]);
    1687           0 :         tmp2[11] = _mm256_sub_epi32(tmp3[13], tmp3[15]);
    1688           0 :         tmp2[13] = _mm256_add_epi32(tmp[13], tmp[15]);
    1689           0 :         tmp2[14] = _mm256_sub_epi32(tmp[12], tmp[14]);
    1690           0 :         tmp2[15] = _mm256_sub_epi32(tmp[13], tmp[15]);
    1691             : 
    1692             :         //stage 8
    1693           0 :         out[4 * 2 + col] = half_btf_avx2(
    1694             :             &cospi32, &tmp2[6], &cospi32, &tmp2[7], &rounding, cos_bit);
    1695           0 :         out[6 * 2 + col] = half_btf_avx2(
    1696             :             &cospi32, &tmp2[10], &cospi32, &tmp2[11], &rounding, cos_bit);
    1697           0 :         out[8 * 2 + col] = half_btf_avx2(
    1698             :             &cospi32, &tmp2[2], &cospim32, &tmp2[3], &rounding, cos_bit);
    1699           0 :         out[10 * 2 + col] = half_btf_avx2(
    1700             :             &cospi32, &tmp2[14], &cospim32, &tmp2[15], &rounding, cos_bit);
    1701           0 :         tmp[2] = half_btf_avx2(
    1702             :             &cospi32, &tmp2[2], &cospi32, &tmp2[3], &rounding, cos_bit);
    1703           0 :         tmp[7] = half_btf_avx2(
    1704             :             &cospi32, &tmp2[6], &cospim32, &tmp2[7], &rounding, cos_bit);
    1705           0 :         tmp[11] = half_btf_avx2(
    1706             :             &cospi32, &tmp2[10], &cospim32, &tmp2[11], &rounding, cos_bit);
    1707           0 :         tmp[14] = half_btf_avx2(
    1708             :             &cospi32, &tmp2[14], &cospi32, &tmp2[15], &rounding, cos_bit);
    1709             :         //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    1710             : 
    1711             :         //stage 9
    1712           0 :         out[1 * 2 + col] = _mm256_sign_epi32(tmp2[8], negative);
    1713           0 :         out[3 * 2 + col] = _mm256_sign_epi32(tmp2[4], negative);
    1714           0 :         out[5 * 2 + col] = _mm256_sign_epi32(tmp[14], negative);
    1715           0 :         out[7 * 2 + col] = _mm256_sign_epi32(tmp[2], negative);
    1716           0 :         out[9 * 2 + col] = _mm256_sign_epi32(tmp[11], negative);
    1717           0 :         out[11 * 2 + col] = _mm256_sign_epi32(tmp[7], negative);
    1718           0 :         out[13 * 2 + col] = _mm256_sign_epi32(tmp2[13], negative);
    1719           0 :         out[15 * 2 + col] = _mm256_sign_epi32(tmp2[1], negative);
    1720             :     }
    1721           0 : }
    1722             : 
    1723           0 : void eb_av1_inv_txfm2d_add_16x16_avx2(const int32_t *input,
    1724             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w,
    1725             :     int32_t stride_w, TxType tx_type, int32_t bd) {
    1726             :     __m256i in[32], out[32];
    1727           0 :     const int8_t *shift = eb_inv_txfm_shift_ls[TX_16X16];
    1728           0 :     const int32_t txw_idx = get_txw_idx(TX_16X16);
    1729           0 :     const int32_t txh_idx = get_txh_idx(TX_16X16);
    1730             : 
    1731           0 :     switch (tx_type) {
    1732           0 :     case IDTX:
    1733           0 :         load_buffer_16x16(input, in);
    1734           0 :         iidentity16_and_round_shift_avx2(in, -shift[0]);
    1735           0 :         iidentity16_and_round_shift_avx2(in, -shift[1]);
    1736           0 :         write_buffer_16x16(in, output_r, stride_r, output_w, stride_w,
    1737             :             0, 0, bd);
    1738           0 :         break;
    1739           0 :     case V_DCT:
    1740           0 :         load_buffer_16x16(input, in);
    1741           0 :         iidentity16_and_round_shift_avx2(in, -shift[0]);
    1742           0 :         idct16_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx], shift);
    1743           0 :         round_shift_16x16(out, -shift[1]);
    1744           0 :         write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
    1745             :             0, 0, bd);
    1746           0 :         break;
    1747           0 :     case H_DCT:
    1748           0 :         load_buffer_16x16(input, in);
    1749           0 :         transpose_16x16_avx2(in, out);
    1750           0 :         idct16_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], shift);
    1751           0 :         transpose_16x16_avx2(in, out);
    1752           0 :         round_shift_16x16(out, -shift[0]);
    1753           0 :         iidentity16_and_round_shift_avx2(out, -shift[1]);
    1754           0 :         write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
    1755             :             0, 0, bd);
    1756           0 :         break;
    1757           0 :     case V_ADST:
    1758           0 :         load_buffer_16x16(input, in);
    1759           0 :         iidentity16_and_round_shift_avx2(in, -shift[0]);
    1760           0 :         iadst16_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx]);
    1761           0 :         round_shift_16x16(out, -shift[1]);
    1762           0 :         write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
    1763             :             0, 0, bd);
    1764           0 :         break;
    1765           0 :     case H_ADST:
    1766           0 :         load_buffer_16x16(input, in);
    1767           0 :         transpose_16x16_avx2(in, out);
    1768           0 :         iadst16_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
    1769           0 :         transpose_16x16_avx2(in, out);
    1770           0 :         round_shift_16x16(out, -shift[0]);
    1771           0 :         iidentity16_and_round_shift_avx2(out, -shift[1]);
    1772           0 :         write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
    1773             :             0, 0, bd);
    1774           0 :         break;
    1775           0 :     case V_FLIPADST:
    1776           0 :         load_buffer_16x16(input, in);
    1777           0 :         iidentity16_and_round_shift_avx2(in, -shift[0]);
    1778           0 :         iadst16_col_avx2(in, out, inv_cos_bit_col[txw_idx][txh_idx]);
    1779           0 :         round_shift_16x16(out, -shift[1]);
    1780           0 :         write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
    1781             :             0, 1, bd);
    1782           0 :         break;
    1783           0 :     case H_FLIPADST:
    1784           0 :         load_buffer_16x16(input, in);
    1785           0 :         transpose_16x16_avx2(in, out);
    1786           0 :         iadst16_col_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
    1787           0 :         transpose_16x16_avx2(in, out);
    1788           0 :         round_shift_16x16(out, -shift[0]);
    1789           0 :         iidentity16_and_round_shift_avx2(out, -shift[1]);
    1790           0 :         write_buffer_16x16(out, output_r, stride_r, output_w, stride_w,
    1791             :             1, 0, bd);
    1792           0 :         break;
    1793           0 :     default:
    1794           0 :         eb_av1_inv_txfm2d_add_16x16_sse4_1(input,
    1795             :             output_r, stride_r, output_w, stride_w, tx_type, bd);
    1796           0 :         break;
    1797             :     }
    1798           0 : }
    1799             : 
    1800             : // Note:
    1801             : //  A total of 32x4 = 128 registers represent the 32x32 block coefficients.
    1802             : //  For high bit depth, each coefficient is 4 bytes.
    1803             : //  Each __m256i register holds 8 coefficients.
    1804             : //  So each "row" needs 4 registers, for a total of 32 rows.
    1805             : //  Register layout:
    1806             : //   v0,   v1,   v2,   v3,
    1807             : //   v4,   v5,   v6,   v7,
    1808             : //   ... ...
    1809             : //   v124, v125, v126, v127
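                     : //
                     : // Editor's illustrative sketch (not part of the original source): with the
                     : // layout above, coefficient row r (0..31) and 8-lane chunk c (0..3) map to
                     : // register in[r * 4 + c], loaded from coeff + r * 32 + c * 8 of the
                     : // row-major int32_t buffer. A hypothetical helper expressing that indexing:
                     : static INLINE __m256i load_chunk_32x32_sketch(const int32_t *coeff,
                     :     int32_t r, int32_t c) {
                     :     return _mm256_loadu_si256((const __m256i *)(coeff + r * 32 + c * 8));
                     : }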
    1810             : 
    1811           0 : static void transpose_32x32_8x8(const __m256i *in, __m256i *out) {
    1812             :     __m256i u0, u1, u2, u3, u4, u5, u6, u7;
    1813             :     __m256i x0, x1;
    1814             : 
    1815           0 :     u0 = _mm256_unpacklo_epi32(in[0], in[4]);
    1816           0 :     u1 = _mm256_unpackhi_epi32(in[0], in[4]);
    1817             : 
    1818           0 :     u2 = _mm256_unpacklo_epi32(in[8], in[12]);
    1819           0 :     u3 = _mm256_unpackhi_epi32(in[8], in[12]);
    1820             : 
    1821           0 :     u4 = _mm256_unpacklo_epi32(in[16], in[20]);
    1822           0 :     u5 = _mm256_unpackhi_epi32(in[16], in[20]);
    1823             : 
    1824           0 :     u6 = _mm256_unpacklo_epi32(in[24], in[28]);
    1825           0 :     u7 = _mm256_unpackhi_epi32(in[24], in[28]);
    1826             : 
    1827           0 :     x0 = _mm256_unpacklo_epi64(u0, u2);
    1828           0 :     x1 = _mm256_unpacklo_epi64(u4, u6);
    1829           0 :     out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
    1830           0 :     out[16] = _mm256_permute2f128_si256(x0, x1, 0x31);
    1831             : 
    1832           0 :     x0 = _mm256_unpackhi_epi64(u0, u2);
    1833           0 :     x1 = _mm256_unpackhi_epi64(u4, u6);
    1834           0 :     out[4] = _mm256_permute2f128_si256(x0, x1, 0x20);
    1835           0 :     out[20] = _mm256_permute2f128_si256(x0, x1, 0x31);
    1836             : 
    1837           0 :     x0 = _mm256_unpacklo_epi64(u1, u3);
    1838           0 :     x1 = _mm256_unpacklo_epi64(u5, u7);
    1839           0 :     out[8] = _mm256_permute2f128_si256(x0, x1, 0x20);
    1840           0 :     out[24] = _mm256_permute2f128_si256(x0, x1, 0x31);
    1841             : 
    1842           0 :     x0 = _mm256_unpackhi_epi64(u1, u3);
    1843           0 :     x1 = _mm256_unpackhi_epi64(u5, u7);
    1844           0 :     out[12] = _mm256_permute2f128_si256(x0, x1, 0x20);
    1845           0 :     out[28] = _mm256_permute2f128_si256(x0, x1, 0x31);
    1846           0 : }
    1847             : 
    1848           0 : static void transpose_32x32_16x16(const __m256i *in, __m256i *out) {
    1849           0 :     transpose_32x32_8x8(&in[0], &out[0]);
    1850           0 :     transpose_32x32_8x8(&in[1], &out[32]);
    1851           0 :     transpose_32x32_8x8(&in[32], &out[1]);
    1852           0 :     transpose_32x32_8x8(&in[33], &out[33]);
    1853           0 : }
    1854             : 
    1855           0 : static void transpose_32x32(const __m256i *in, __m256i *out) {
    1856           0 :     transpose_32x32_16x16(&in[0], &out[0]);
    1857           0 :     transpose_32x32_16x16(&in[2], &out[64]);
    1858           0 :     transpose_32x32_16x16(&in[64], &out[2]);
    1859           0 :     transpose_32x32_16x16(&in[66], &out[66]);
    1860           0 : }
    1861             : 
    1862           0 : static void load_buffer_32x32_new(const int32_t *coeff, __m256i *in,
    1863             :     int32_t input_stride, int32_t size) {
    1864             :     int32_t i;
    1865           0 :     for (i = 0; i < size; ++i)
    1866           0 :         in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stride));
    1867           0 : }
    1868             : 
    1869           0 : static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
    1870             :     int32_t i;
    1871           0 :     for (i = 0; i < 128; ++i) {
    1872           0 :         in[i] = _mm256_loadu_si256((const __m256i *)coeff);
    1873           0 :         coeff += 8;
    1874             :     }
    1875           0 : }
    1876             : 
    1877           0 : static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
    1878             :     const __m256i *rounding, int32_t bit) {
    1879             :     __m256i x;
    1880           0 :     x = _mm256_mullo_epi32(*w0, *n0);
    1881           0 :     x = _mm256_add_epi32(x, *rounding);
    1882           0 :     x = _mm256_srai_epi32(x, bit);
    1883           0 :     return x;
    1884             : }
    1885             : 
    1886           0 : static INLINE void round_shift_32x32(__m256i *in, int32_t shift) {
    1887           0 :     __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
    1888           0 :     int32_t i = 0;
    1889             : 
    1890           0 :     while (i < 128) {
    1891           0 :         in[i] = _mm256_add_epi32(in[i], rnding);
    1892           0 :         in[i] = _mm256_srai_epi32(in[i], shift);
    1893           0 :         i++;
    1894             :     }
    1895           0 : }
    1896             : 
    1897           0 : static void write_buffer_32x32(__m256i *in,
    1898             :     uint16_t *output_r, int32_t stride_r,
    1899             :     uint16_t *output_w, int32_t stride_w,
    1900             :     int32_t fliplr, int32_t flipud, int32_t bd) {
    1901             :     __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3;
    1902           0 :     const __m256i zero = _mm256_setzero_si256();
    1903           0 :     int32_t i = 0;
    1904             :     (void)fliplr;
    1905             :     (void)flipud;
    1906             : 
    1907           0 :     while (i < 128) {
    1908           0 :         u0 = _mm256_loadu_si256((const __m256i *)output_r);
    1909           0 :         u1 = _mm256_loadu_si256((const __m256i *)(output_r + 16));
    1910             : 
    1911           0 :         x0 = _mm256_unpacklo_epi16(u0, zero);
    1912           0 :         x1 = _mm256_unpackhi_epi16(u0, zero);
    1913           0 :         x2 = _mm256_unpacklo_epi16(u1, zero);
    1914           0 :         x3 = _mm256_unpackhi_epi16(u1, zero);
    1915             : 
    1916           0 :         v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20);
    1917           0 :         v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31);
    1918           0 :         v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20);
    1919           0 :         v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31);
    1920             : 
    1921           0 :         v0 = _mm256_add_epi32(v0, x0);
    1922           0 :         v1 = _mm256_add_epi32(v1, x1);
    1923           0 :         v2 = _mm256_add_epi32(v2, x2);
    1924           0 :         v3 = _mm256_add_epi32(v3, x3);
    1925             : 
    1926           0 :         highbd_clamp_epi32(&v0, bd);
    1927           0 :         highbd_clamp_epi32(&v1, bd);
    1928           0 :         highbd_clamp_epi32(&v2, bd);
    1929           0 :         highbd_clamp_epi32(&v3, bd);
    1930             : 
    1931           0 :         v0 = _mm256_packus_epi32(v0, v1);
    1932           0 :         v2 = _mm256_packus_epi32(v2, v3);
    1933             : 
    1934           0 :         _mm256_storeu_si256((__m256i *)output_w, v0);
    1935           0 :         _mm256_storeu_si256((__m256i *)(output_w + 16), v2);
    1936           0 :         output_r += stride_r;
    1937           0 :         output_w += stride_w;
    1938           0 :         i += 4;
    1939             :     }
    1940           0 : }
    1941             : 
    1942           0 : static void idct32_avx2(__m256i *in, __m256i *out, int32_t bit) {
    1943           0 :     const int32_t *cospi = cospi_arr(bit);
    1944           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    1945           0 :     const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
    1946           0 :     const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
    1947           0 :     const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
    1948           0 :     const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
    1949           0 :     const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
    1950           0 :     const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
    1951           0 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    1952           0 :     const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
    1953           0 :     const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
    1954           0 :     const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
    1955           0 :     const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
    1956           0 :     const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
    1957           0 :     const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
    1958           0 :     const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
    1959           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    1960           0 :     const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
    1961           0 :     const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
    1962           0 :     const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
    1963           0 :     const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
    1964           0 :     const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
    1965           0 :     const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
    1966           0 :     const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
    1967           0 :     const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
    1968           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    1969           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    1970           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    1971           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    1972           0 :     const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
    1973           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    1974           0 :     const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
    1975           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    1976           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    1977           0 :     const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
    1978           0 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
    1979           0 :     const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
    1980           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    1981           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    1982           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    1983           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    1984           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    1985           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    1986           0 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    1987           0 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    1988           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    1989           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    1990           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    1991           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    1992           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    1993           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    1994           0 :     const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
    1995             :     __m256i bf1[32], bf0[32];
    1996             :     int32_t col;
    1997             : 
    1998           0 :     for (col = 0; col < 4; ++col) {
    1999             :         // stage 0
    2000             :         // stage 1
    2001           0 :         bf1[0] = in[0 * 4 + col];
    2002           0 :         bf1[1] = in[16 * 4 + col];
    2003           0 :         bf1[2] = in[8 * 4 + col];
    2004           0 :         bf1[3] = in[24 * 4 + col];
    2005           0 :         bf1[4] = in[4 * 4 + col];
    2006           0 :         bf1[5] = in[20 * 4 + col];
    2007           0 :         bf1[6] = in[12 * 4 + col];
    2008           0 :         bf1[7] = in[28 * 4 + col];
    2009           0 :         bf1[8] = in[2 * 4 + col];
    2010           0 :         bf1[9] = in[18 * 4 + col];
    2011           0 :         bf1[10] = in[10 * 4 + col];
    2012           0 :         bf1[11] = in[26 * 4 + col];
    2013           0 :         bf1[12] = in[6 * 4 + col];
    2014           0 :         bf1[13] = in[22 * 4 + col];
    2015           0 :         bf1[14] = in[14 * 4 + col];
    2016           0 :         bf1[15] = in[30 * 4 + col];
    2017           0 :         bf1[16] = in[1 * 4 + col];
    2018           0 :         bf1[17] = in[17 * 4 + col];
    2019           0 :         bf1[18] = in[9 * 4 + col];
    2020           0 :         bf1[19] = in[25 * 4 + col];
    2021           0 :         bf1[20] = in[5 * 4 + col];
    2022           0 :         bf1[21] = in[21 * 4 + col];
    2023           0 :         bf1[22] = in[13 * 4 + col];
    2024           0 :         bf1[23] = in[29 * 4 + col];
    2025           0 :         bf1[24] = in[3 * 4 + col];
    2026           0 :         bf1[25] = in[19 * 4 + col];
    2027           0 :         bf1[26] = in[11 * 4 + col];
    2028           0 :         bf1[27] = in[27 * 4 + col];
    2029           0 :         bf1[28] = in[7 * 4 + col];
    2030           0 :         bf1[29] = in[23 * 4 + col];
    2031           0 :         bf1[30] = in[15 * 4 + col];
    2032           0 :         bf1[31] = in[31 * 4 + col];
    2033             : 
    2034             :         // stage 2
    2035           0 :         bf0[0] = bf1[0];
    2036           0 :         bf0[1] = bf1[1];
    2037           0 :         bf0[2] = bf1[2];
    2038           0 :         bf0[3] = bf1[3];
    2039           0 :         bf0[4] = bf1[4];
    2040           0 :         bf0[5] = bf1[5];
    2041           0 :         bf0[6] = bf1[6];
    2042           0 :         bf0[7] = bf1[7];
    2043           0 :         bf0[8] = bf1[8];
    2044           0 :         bf0[9] = bf1[9];
    2045           0 :         bf0[10] = bf1[10];
    2046           0 :         bf0[11] = bf1[11];
    2047           0 :         bf0[12] = bf1[12];
    2048           0 :         bf0[13] = bf1[13];
    2049           0 :         bf0[14] = bf1[14];
    2050           0 :         bf0[15] = bf1[15];
    2051           0 :         bf0[16] =
    2052           0 :             half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
    2053           0 :         bf0[17] =
    2054           0 :             half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
    2055           0 :         bf0[18] =
    2056           0 :             half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
    2057           0 :         bf0[19] =
    2058           0 :             half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
    2059           0 :         bf0[20] =
    2060           0 :             half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
    2061           0 :         bf0[21] =
    2062           0 :             half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
    2063           0 :         bf0[22] =
    2064           0 :             half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
    2065           0 :         bf0[23] =
    2066           0 :             half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
    2067           0 :         bf0[24] =
    2068           0 :             half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
    2069           0 :         bf0[25] =
    2070           0 :             half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
    2071           0 :         bf0[26] =
    2072           0 :             half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
    2073           0 :         bf0[27] =
    2074           0 :             half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
    2075           0 :         bf0[28] =
    2076           0 :             half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
    2077           0 :         bf0[29] =
    2078           0 :             half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
    2079           0 :         bf0[30] =
    2080           0 :             half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
    2081           0 :         bf0[31] =
    2082           0 :             half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
    2083             : 
    2084             :         // stage 3
    2085           0 :         bf1[0] = bf0[0];
    2086           0 :         bf1[1] = bf0[1];
    2087           0 :         bf1[2] = bf0[2];
    2088           0 :         bf1[3] = bf0[3];
    2089           0 :         bf1[4] = bf0[4];
    2090           0 :         bf1[5] = bf0[5];
    2091           0 :         bf1[6] = bf0[6];
    2092           0 :         bf1[7] = bf0[7];
    2093           0 :         bf1[8] =
    2094           0 :             half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
    2095           0 :         bf1[9] =
    2096           0 :             half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
    2097           0 :         bf1[10] =
    2098           0 :             half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
    2099           0 :         bf1[11] =
    2100           0 :             half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
    2101           0 :         bf1[12] =
    2102           0 :             half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
    2103           0 :         bf1[13] =
    2104           0 :             half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
    2105           0 :         bf1[14] =
    2106           0 :             half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
    2107           0 :         bf1[15] =
    2108           0 :             half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
    2109           0 :         bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]);
    2110           0 :         bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]);
    2111           0 :         bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]);
    2112           0 :         bf1[19] = _mm256_add_epi32(bf0[18], bf0[19]);
    2113           0 :         bf1[20] = _mm256_add_epi32(bf0[20], bf0[21]);
    2114           0 :         bf1[21] = _mm256_sub_epi32(bf0[20], bf0[21]);
    2115           0 :         bf1[22] = _mm256_sub_epi32(bf0[23], bf0[22]);
    2116           0 :         bf1[23] = _mm256_add_epi32(bf0[22], bf0[23]);
    2117           0 :         bf1[24] = _mm256_add_epi32(bf0[24], bf0[25]);
    2118           0 :         bf1[25] = _mm256_sub_epi32(bf0[24], bf0[25]);
    2119           0 :         bf1[26] = _mm256_sub_epi32(bf0[27], bf0[26]);
    2120           0 :         bf1[27] = _mm256_add_epi32(bf0[26], bf0[27]);
    2121           0 :         bf1[28] = _mm256_add_epi32(bf0[28], bf0[29]);
    2122           0 :         bf1[29] = _mm256_sub_epi32(bf0[28], bf0[29]);
    2123           0 :         bf1[30] = _mm256_sub_epi32(bf0[31], bf0[30]);
    2124           0 :         bf1[31] = _mm256_add_epi32(bf0[30], bf0[31]);
    2125             : 
    2126             :         // stage 4
    2127           0 :         bf0[0] = bf1[0];
    2128           0 :         bf0[1] = bf1[1];
    2129           0 :         bf0[2] = bf1[2];
    2130           0 :         bf0[3] = bf1[3];
    2131           0 :         bf0[4] =
    2132           0 :             half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
    2133           0 :         bf0[5] =
    2134           0 :             half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
    2135           0 :         bf0[6] =
    2136           0 :             half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
    2137           0 :         bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
    2138           0 :         bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]);
    2139           0 :         bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]);
    2140           0 :         bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]);
    2141           0 :         bf0[11] = _mm256_add_epi32(bf1[10], bf1[11]);
    2142           0 :         bf0[12] = _mm256_add_epi32(bf1[12], bf1[13]);
    2143           0 :         bf0[13] = _mm256_sub_epi32(bf1[12], bf1[13]);
    2144           0 :         bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]);
    2145           0 :         bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]);
    2146           0 :         bf0[16] = bf1[16];
    2147           0 :         bf0[17] =
    2148           0 :             half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
    2149           0 :         bf0[18] =
    2150           0 :             half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
    2151           0 :         bf0[19] = bf1[19];
    2152           0 :         bf0[20] = bf1[20];
    2153           0 :         bf0[21] =
    2154           0 :             half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
    2155           0 :         bf0[22] =
    2156           0 :             half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
    2157           0 :         bf0[23] = bf1[23];
    2158           0 :         bf0[24] = bf1[24];
    2159           0 :         bf0[25] =
    2160           0 :             half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
    2161           0 :         bf0[26] =
    2162           0 :             half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
    2163           0 :         bf0[27] = bf1[27];
    2164           0 :         bf0[28] = bf1[28];
    2165           0 :         bf0[29] =
    2166           0 :             half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
    2167           0 :         bf0[30] =
    2168           0 :             half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
    2169           0 :         bf0[31] = bf1[31];
    2170             : 
    2171             :         // stage 5
    2172           0 :         bf1[0] =
    2173           0 :             half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
    2174           0 :         bf1[1] =
    2175           0 :             half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
    2176           0 :         bf1[2] =
    2177           0 :             half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
    2178           0 :         bf1[3] =
    2179           0 :             half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
    2180           0 :         bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]);
    2181           0 :         bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]);
    2182           0 :         bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]);
    2183           0 :         bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]);
    2184           0 :         bf1[8] = bf0[8];
    2185           0 :         bf1[9] =
    2186           0 :             half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
    2187           0 :         bf1[10] =
    2188           0 :             half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
    2189           0 :         bf1[11] = bf0[11];
    2190           0 :         bf1[12] = bf0[12];
    2191           0 :         bf1[13] =
    2192           0 :             half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
    2193           0 :         bf1[14] =
    2194           0 :             half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
    2195           0 :         bf1[15] = bf0[15];
    2196           0 :         bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]);
    2197           0 :         bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]);
    2198           0 :         bf1[18] = _mm256_sub_epi32(bf0[17], bf0[18]);
    2199           0 :         bf1[19] = _mm256_sub_epi32(bf0[16], bf0[19]);
    2200           0 :         bf1[20] = _mm256_sub_epi32(bf0[23], bf0[20]);
    2201           0 :         bf1[21] = _mm256_sub_epi32(bf0[22], bf0[21]);
    2202           0 :         bf1[22] = _mm256_add_epi32(bf0[21], bf0[22]);
    2203           0 :         bf1[23] = _mm256_add_epi32(bf0[20], bf0[23]);
    2204           0 :         bf1[24] = _mm256_add_epi32(bf0[24], bf0[27]);
    2205           0 :         bf1[25] = _mm256_add_epi32(bf0[25], bf0[26]);
    2206           0 :         bf1[26] = _mm256_sub_epi32(bf0[25], bf0[26]);
    2207           0 :         bf1[27] = _mm256_sub_epi32(bf0[24], bf0[27]);
    2208           0 :         bf1[28] = _mm256_sub_epi32(bf0[31], bf0[28]);
    2209           0 :         bf1[29] = _mm256_sub_epi32(bf0[30], bf0[29]);
    2210           0 :         bf1[30] = _mm256_add_epi32(bf0[29], bf0[30]);
    2211           0 :         bf1[31] = _mm256_add_epi32(bf0[28], bf0[31]);
    2212             : 
    2213             :         // stage 6
    2214           0 :         bf0[0] = _mm256_add_epi32(bf1[0], bf1[3]);
    2215           0 :         bf0[1] = _mm256_add_epi32(bf1[1], bf1[2]);
    2216           0 :         bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]);
    2217           0 :         bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]);
    2218           0 :         bf0[4] = bf1[4];
    2219           0 :         bf0[5] =
    2220           0 :             half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
    2221           0 :         bf0[6] =
    2222           0 :             half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
    2223           0 :         bf0[7] = bf1[7];
    2224           0 :         bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]);
    2225           0 :         bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]);
    2226           0 :         bf0[10] = _mm256_sub_epi32(bf1[9], bf1[10]);
    2227           0 :         bf0[11] = _mm256_sub_epi32(bf1[8], bf1[11]);
    2228           0 :         bf0[12] = _mm256_sub_epi32(bf1[15], bf1[12]);
    2229           0 :         bf0[13] = _mm256_sub_epi32(bf1[14], bf1[13]);
    2230           0 :         bf0[14] = _mm256_add_epi32(bf1[13], bf1[14]);
    2231           0 :         bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]);
    2232           0 :         bf0[16] = bf1[16];
    2233           0 :         bf0[17] = bf1[17];
    2234           0 :         bf0[18] =
    2235           0 :             half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
    2236           0 :         bf0[19] =
    2237           0 :             half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
    2238           0 :         bf0[20] =
    2239           0 :             half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
    2240           0 :         bf0[21] =
    2241           0 :             half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
    2242           0 :         bf0[22] = bf1[22];
    2243           0 :         bf0[23] = bf1[23];
    2244           0 :         bf0[24] = bf1[24];
    2245           0 :         bf0[25] = bf1[25];
    2246           0 :         bf0[26] =
    2247           0 :             half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
    2248           0 :         bf0[27] =
    2249           0 :             half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
    2250           0 :         bf0[28] =
    2251           0 :             half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
    2252           0 :         bf0[29] =
    2253           0 :             half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
    2254           0 :         bf0[30] = bf1[30];
    2255           0 :         bf0[31] = bf1[31];
    2256             : 
    2257             :         // stage 7
    2258           0 :         bf1[0] = _mm256_add_epi32(bf0[0], bf0[7]);
    2259           0 :         bf1[1] = _mm256_add_epi32(bf0[1], bf0[6]);
    2260           0 :         bf1[2] = _mm256_add_epi32(bf0[2], bf0[5]);
    2261           0 :         bf1[3] = _mm256_add_epi32(bf0[3], bf0[4]);
    2262           0 :         bf1[4] = _mm256_sub_epi32(bf0[3], bf0[4]);
    2263           0 :         bf1[5] = _mm256_sub_epi32(bf0[2], bf0[5]);
    2264           0 :         bf1[6] = _mm256_sub_epi32(bf0[1], bf0[6]);
    2265           0 :         bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]);
    2266           0 :         bf1[8] = bf0[8];
    2267           0 :         bf1[9] = bf0[9];
    2268           0 :         bf1[10] =
    2269           0 :             half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
    2270           0 :         bf1[11] =
    2271           0 :             half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
    2272           0 :         bf1[12] =
    2273           0 :             half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
    2274           0 :         bf1[13] =
    2275           0 :             half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
    2276           0 :         bf1[14] = bf0[14];
    2277           0 :         bf1[15] = bf0[15];
    2278           0 :         bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]);
    2279           0 :         bf1[17] = _mm256_add_epi32(bf0[17], bf0[22]);
    2280           0 :         bf1[18] = _mm256_add_epi32(bf0[18], bf0[21]);
    2281           0 :         bf1[19] = _mm256_add_epi32(bf0[19], bf0[20]);
    2282           0 :         bf1[20] = _mm256_sub_epi32(bf0[19], bf0[20]);
    2283           0 :         bf1[21] = _mm256_sub_epi32(bf0[18], bf0[21]);
    2284           0 :         bf1[22] = _mm256_sub_epi32(bf0[17], bf0[22]);
    2285           0 :         bf1[23] = _mm256_sub_epi32(bf0[16], bf0[23]);
    2286           0 :         bf1[24] = _mm256_sub_epi32(bf0[31], bf0[24]);
    2287           0 :         bf1[25] = _mm256_sub_epi32(bf0[30], bf0[25]);
    2288           0 :         bf1[26] = _mm256_sub_epi32(bf0[29], bf0[26]);
    2289           0 :         bf1[27] = _mm256_sub_epi32(bf0[28], bf0[27]);
    2290           0 :         bf1[28] = _mm256_add_epi32(bf0[27], bf0[28]);
    2291           0 :         bf1[29] = _mm256_add_epi32(bf0[26], bf0[29]);
    2292           0 :         bf1[30] = _mm256_add_epi32(bf0[25], bf0[30]);
    2293           0 :         bf1[31] = _mm256_add_epi32(bf0[24], bf0[31]);
    2294             : 
    2295             :         // stage 8
    2296           0 :         bf0[0] = _mm256_add_epi32(bf1[0], bf1[15]);
    2297           0 :         bf0[1] = _mm256_add_epi32(bf1[1], bf1[14]);
    2298           0 :         bf0[2] = _mm256_add_epi32(bf1[2], bf1[13]);
    2299           0 :         bf0[3] = _mm256_add_epi32(bf1[3], bf1[12]);
    2300           0 :         bf0[4] = _mm256_add_epi32(bf1[4], bf1[11]);
    2301           0 :         bf0[5] = _mm256_add_epi32(bf1[5], bf1[10]);
    2302           0 :         bf0[6] = _mm256_add_epi32(bf1[6], bf1[9]);
    2303           0 :         bf0[7] = _mm256_add_epi32(bf1[7], bf1[8]);
    2304           0 :         bf0[8] = _mm256_sub_epi32(bf1[7], bf1[8]);
    2305           0 :         bf0[9] = _mm256_sub_epi32(bf1[6], bf1[9]);
    2306           0 :         bf0[10] = _mm256_sub_epi32(bf1[5], bf1[10]);
    2307           0 :         bf0[11] = _mm256_sub_epi32(bf1[4], bf1[11]);
    2308           0 :         bf0[12] = _mm256_sub_epi32(bf1[3], bf1[12]);
    2309           0 :         bf0[13] = _mm256_sub_epi32(bf1[2], bf1[13]);
    2310           0 :         bf0[14] = _mm256_sub_epi32(bf1[1], bf1[14]);
    2311           0 :         bf0[15] = _mm256_sub_epi32(bf1[0], bf1[15]);
    2312           0 :         bf0[16] = bf1[16];
    2313           0 :         bf0[17] = bf1[17];
    2314           0 :         bf0[18] = bf1[18];
    2315           0 :         bf0[19] = bf1[19];
    2316           0 :         bf0[20] =
    2317           0 :             half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
    2318           0 :         bf0[21] =
    2319           0 :             half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
    2320           0 :         bf0[22] =
    2321           0 :             half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
    2322           0 :         bf0[23] =
    2323           0 :             half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
    2324           0 :         bf0[24] =
    2325           0 :             half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
    2326           0 :         bf0[25] =
    2327           0 :             half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
    2328           0 :         bf0[26] =
    2329           0 :             half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
    2330           0 :         bf0[27] =
    2331           0 :             half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
    2332           0 :         bf0[28] = bf1[28];
    2333           0 :         bf0[29] = bf1[29];
    2334           0 :         bf0[30] = bf1[30];
    2335           0 :         bf0[31] = bf1[31];
    2336             : 
    2337             :         // stage 9
    2338           0 :         out[0 * 4 + col] = _mm256_add_epi32(bf0[0], bf0[31]);
    2339           0 :         out[1 * 4 + col] = _mm256_add_epi32(bf0[1], bf0[30]);
    2340           0 :         out[2 * 4 + col] = _mm256_add_epi32(bf0[2], bf0[29]);
    2341           0 :         out[3 * 4 + col] = _mm256_add_epi32(bf0[3], bf0[28]);
    2342           0 :         out[4 * 4 + col] = _mm256_add_epi32(bf0[4], bf0[27]);
    2343           0 :         out[5 * 4 + col] = _mm256_add_epi32(bf0[5], bf0[26]);
    2344           0 :         out[6 * 4 + col] = _mm256_add_epi32(bf0[6], bf0[25]);
    2345           0 :         out[7 * 4 + col] = _mm256_add_epi32(bf0[7], bf0[24]);
    2346           0 :         out[8 * 4 + col] = _mm256_add_epi32(bf0[8], bf0[23]);
    2347           0 :         out[9 * 4 + col] = _mm256_add_epi32(bf0[9], bf0[22]);
    2348           0 :         out[10 * 4 + col] = _mm256_add_epi32(bf0[10], bf0[21]);
    2349           0 :         out[11 * 4 + col] = _mm256_add_epi32(bf0[11], bf0[20]);
    2350           0 :         out[12 * 4 + col] = _mm256_add_epi32(bf0[12], bf0[19]);
    2351           0 :         out[13 * 4 + col] = _mm256_add_epi32(bf0[13], bf0[18]);
    2352           0 :         out[14 * 4 + col] = _mm256_add_epi32(bf0[14], bf0[17]);
    2353           0 :         out[15 * 4 + col] = _mm256_add_epi32(bf0[15], bf0[16]);
    2354           0 :         out[16 * 4 + col] = _mm256_sub_epi32(bf0[15], bf0[16]);
    2355           0 :         out[17 * 4 + col] = _mm256_sub_epi32(bf0[14], bf0[17]);
    2356           0 :         out[18 * 4 + col] = _mm256_sub_epi32(bf0[13], bf0[18]);
    2357           0 :         out[19 * 4 + col] = _mm256_sub_epi32(bf0[12], bf0[19]);
    2358           0 :         out[20 * 4 + col] = _mm256_sub_epi32(bf0[11], bf0[20]);
    2359           0 :         out[21 * 4 + col] = _mm256_sub_epi32(bf0[10], bf0[21]);
    2360           0 :         out[22 * 4 + col] = _mm256_sub_epi32(bf0[9], bf0[22]);
    2361           0 :         out[23 * 4 + col] = _mm256_sub_epi32(bf0[8], bf0[23]);
    2362           0 :         out[24 * 4 + col] = _mm256_sub_epi32(bf0[7], bf0[24]);
    2363           0 :         out[25 * 4 + col] = _mm256_sub_epi32(bf0[6], bf0[25]);
    2364           0 :         out[26 * 4 + col] = _mm256_sub_epi32(bf0[5], bf0[26]);
    2365           0 :         out[27 * 4 + col] = _mm256_sub_epi32(bf0[4], bf0[27]);
    2366           0 :         out[28 * 4 + col] = _mm256_sub_epi32(bf0[3], bf0[28]);
    2367           0 :         out[29 * 4 + col] = _mm256_sub_epi32(bf0[2], bf0[29]);
    2368           0 :         out[30 * 4 + col] = _mm256_sub_epi32(bf0[1], bf0[30]);
    2369           0 :         out[31 * 4 + col] = _mm256_sub_epi32(bf0[0], bf0[31]);
    2370             :     }
    2371           0 : }
    2372             : 
    2373           0 : void eb_av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff,
    2374             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    2375             :     TxType tx_type, int32_t bd) {
    2376             :     __m256i in[128], out[128];
    2377           0 :     const int8_t *shift = eb_inv_txfm_shift_ls[TX_32X32];
    2378           0 :     const int32_t txw_idx = get_txw_idx(TX_32X32);
    2379           0 :     const int32_t txh_idx = get_txh_idx(TX_32X32);
    2380             : 
    2381           0 :     switch (tx_type) {
    2382           0 :     case DCT_DCT:
    2383           0 :         load_buffer_32x32(coeff, in);
    2384           0 :         transpose_32x32(in, out);
    2385           0 :         idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
    2386           0 :         round_shift_32x32(in, -shift[0]);
    2387           0 :         transpose_32x32(in, out);
    2388           0 :         idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
    2389           0 :         round_shift_32x32(in, -shift[1]);
    2390           0 :         write_buffer_32x32(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
    2391           0 :         break;
    2392           0 :     case IDTX:
    2393           0 :         load_buffer_32x32(coeff, in);
    2394             :         // These operations can be joined together without losing precision:
    2395             :         // eb_av1_iidentity32_c() shifts left by 2 bits
    2396             :         // round_shift_32x32(, -shift[0]) shifts right by 2 bits
    2397             :         // eb_av1_iidentity32_c() shifts left by 2 bits
    2398             :         // round_shift_32x32(, -shift[1]) shifts right by 4 bits with complement
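                     :         //
                     :         // Editor's note (illustrative): assuming the inverse-shift table gives
                     :         // shift[0] = -2 and shift[1] = -4 for TX_32X32 (as in the libaom
                     :         // tables), the two identity stages together shift left by 4 bits, so
                     :         // the single call below applies the net right shift
                     :         //     (-shift[0]) + (-shift[1]) - 4 = 2 + 4 - 4 = 2 bits.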
    2399           0 :         round_shift_32x32(in, -shift[0] - shift[1] - 4);
    2400           0 :         write_buffer_32x32(in, output_r, stride_r, output_w, stride_w, 0, 0, bd);
    2401           0 :         break;
    2402           0 :     default: assert(0);
    2403             :     }
    2404           0 : }
    2405             : 
    2406           0 : static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    2407             :     int32_t bd, int32_t out_shift) {
    2408           0 :     const int32_t *cospi = cospi_arr(bit);
    2409           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    2410           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    2411           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    2412             :     __m256i x;
    2413             : 
    2414             :     // stage 0
    2415             :     // stage 1
    2416             :     // stage 2
    2417             :     // stage 3
    2418           0 :     x = _mm256_mullo_epi32(in[0], cospi32);
    2419           0 :     x = _mm256_add_epi32(x, rnding);
    2420           0 :     x = _mm256_srai_epi32(x, bit);
    2421             : 
    2422             :     // stage 4
    2423             :     // stage 5
    2424           0 :     if (!do_cols) {
    2425           0 :         const int32_t log_range_out = AOMMAX(16, bd + 6);
    2426           0 :         const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    2427             :             -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    2428           0 :         const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    2429             :             (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    2430             : 
    2431           0 :         __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
    2432           0 :         x = _mm256_add_epi32(x, offset);
    2433           0 :         x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
    2434           0 :         x = _mm256_max_epi32(x, clamp_lo_out);
    2435           0 :         x = _mm256_min_epi32(x, clamp_hi_out);
    2436             :     }
    2437             : 
    2438           0 :     out[0] = x;
    2439           0 :     out[1] = x;
    2440           0 :     out[2] = x;
    2441           0 :     out[3] = x;
    2442           0 :     out[4] = x;
    2443           0 :     out[5] = x;
    2444           0 :     out[6] = x;
    2445           0 :     out[7] = x;
    2446           0 : }
    2447             : 
    2448           0 : static void idct8x8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    2449             :     int32_t bd, int32_t out_shift) {
    2450           0 :     const int32_t *cospi = cospi_arr(bit);
    2451           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    2452           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    2453           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    2454           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    2455           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    2456           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    2457           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    2458           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    2459           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    2460           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    2461           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    2462           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    2463           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    2464           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    2465             :     __m256i u0, u1, u2, u3, u4, u5, u6, u7;
    2466             :     __m256i v0, v1, v2, v3, v4, v5, v6, v7;
    2467             :     __m256i x, y;
    2468             : 
    2469             :     // stage 0
    2470             :     // stage 1
    2471             :     // stage 2
    2472           0 :     u0 = in[0];
    2473           0 :     u1 = in[4];
    2474           0 :     u2 = in[2];
    2475           0 :     u3 = in[6];
    2476             : 
    2477           0 :     x = _mm256_mullo_epi32(in[1], cospi56);
    2478           0 :     y = _mm256_mullo_epi32(in[7], cospim8);
    2479           0 :     u4 = _mm256_add_epi32(x, y);
    2480           0 :     u4 = _mm256_add_epi32(u4, rnding);
    2481           0 :     u4 = _mm256_srai_epi32(u4, bit);
    2482             : 
    2483           0 :     x = _mm256_mullo_epi32(in[1], cospi8);
    2484           0 :     y = _mm256_mullo_epi32(in[7], cospi56);
    2485           0 :     u7 = _mm256_add_epi32(x, y);
    2486           0 :     u7 = _mm256_add_epi32(u7, rnding);
    2487           0 :     u7 = _mm256_srai_epi32(u7, bit);
    2488             : 
    2489           0 :     x = _mm256_mullo_epi32(in[5], cospi24);
    2490           0 :     y = _mm256_mullo_epi32(in[3], cospim40);
    2491           0 :     u5 = _mm256_add_epi32(x, y);
    2492           0 :     u5 = _mm256_add_epi32(u5, rnding);
    2493           0 :     u5 = _mm256_srai_epi32(u5, bit);
    2494             : 
    2495           0 :     x = _mm256_mullo_epi32(in[5], cospi40);
    2496           0 :     y = _mm256_mullo_epi32(in[3], cospi24);
    2497           0 :     u6 = _mm256_add_epi32(x, y);
    2498           0 :     u6 = _mm256_add_epi32(u6, rnding);
    2499           0 :     u6 = _mm256_srai_epi32(u6, bit);
    2500             : 
    2501             :     // stage 3
    2502           0 :     x = _mm256_mullo_epi32(u0, cospi32);
    2503           0 :     y = _mm256_mullo_epi32(u1, cospi32);
    2504           0 :     v0 = _mm256_add_epi32(x, y);
    2505           0 :     v0 = _mm256_add_epi32(v0, rnding);
    2506           0 :     v0 = _mm256_srai_epi32(v0, bit);
    2507             : 
    2508           0 :     v1 = _mm256_sub_epi32(x, y);
    2509           0 :     v1 = _mm256_add_epi32(v1, rnding);
    2510           0 :     v1 = _mm256_srai_epi32(v1, bit);
    2511             : 
    2512           0 :     x = _mm256_mullo_epi32(u2, cospi48);
    2513           0 :     y = _mm256_mullo_epi32(u3, cospim16);
    2514           0 :     v2 = _mm256_add_epi32(x, y);
    2515           0 :     v2 = _mm256_add_epi32(v2, rnding);
    2516           0 :     v2 = _mm256_srai_epi32(v2, bit);
    2517             : 
    2518           0 :     x = _mm256_mullo_epi32(u2, cospi16);
    2519           0 :     y = _mm256_mullo_epi32(u3, cospi48);
    2520           0 :     v3 = _mm256_add_epi32(x, y);
    2521           0 :     v3 = _mm256_add_epi32(v3, rnding);
    2522           0 :     v3 = _mm256_srai_epi32(v3, bit);
    2523             : 
    2524           0 :     addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
    2525           0 :     addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
    2526             : 
    2527             :     // stage 4
    2528           0 :     addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
    2529           0 :     addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
    2530           0 :     u4 = v4;
    2531           0 :     u7 = v7;
    2532             : 
    2533           0 :     x = _mm256_mullo_epi32(v5, cospi32);
    2534           0 :     y = _mm256_mullo_epi32(v6, cospi32);
    2535           0 :     u6 = _mm256_add_epi32(y, x);
    2536           0 :     u6 = _mm256_add_epi32(u6, rnding);
    2537           0 :     u6 = _mm256_srai_epi32(u6, bit);
    2538             : 
    2539           0 :     u5 = _mm256_sub_epi32(y, x);
    2540           0 :     u5 = _mm256_add_epi32(u5, rnding);
    2541           0 :     u5 = _mm256_srai_epi32(u5, bit);
    2542             : 
    2543             :     // stage 5
    2544           0 :     if (do_cols) {
    2545           0 :         addsub_no_clamp_avx2(u0, u7, out + 0, out + 7);
    2546           0 :         addsub_no_clamp_avx2(u1, u6, out + 1, out + 6);
    2547           0 :         addsub_no_clamp_avx2(u2, u5, out + 2, out + 5);
    2548           0 :         addsub_no_clamp_avx2(u3, u4, out + 3, out + 4);
    2549             :     }
    2550             :     else {
    2551           0 :         const int32_t log_range_out = AOMMAX(16, bd + 6);
    2552           0 :         const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    2553             :             -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    2554           0 :         const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    2555             :             (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    2556           0 :         addsub_shift_avx2(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
    2557             :             out_shift);
    2558           0 :         addsub_shift_avx2(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
    2559             :             out_shift);
    2560           0 :         addsub_shift_avx2(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
    2561             :             out_shift);
    2562           0 :         addsub_shift_avx2(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
    2563             :             out_shift);
    2564             :     }
    2565           0 : }
    2566             : 
    2567           0 : static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    2568             :     int32_t bd, int32_t out_shift) {
    2569           0 :     const int32_t *cospi = cospi_arr(bit);
    2570           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    2571           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    2572           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    2573           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    2574           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    2575           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    2576           0 :     const __m256i kZero = _mm256_setzero_si256();
    2577             :     __m256i u[8], x;
    2578             : 
    2579             :     // stage 0
    2580             :     // stage 1
    2581             :     // stage 2
    2582             : 
    2583           0 :     x = _mm256_mullo_epi32(in[0], cospi60);
    2584           0 :     u[0] = _mm256_add_epi32(x, rnding);
    2585           0 :     u[0] = _mm256_srai_epi32(u[0], bit);
    2586             : 
    2587           0 :     x = _mm256_mullo_epi32(in[0], cospi4);
    2588           0 :     u[1] = _mm256_sub_epi32(kZero, x);
    2589           0 :     u[1] = _mm256_add_epi32(u[1], rnding);
    2590           0 :     u[1] = _mm256_srai_epi32(u[1], bit);
    2591             : 
    2592             :     // stage 3
    2593             :     // stage 4
    2594             :     __m256i temp1, temp2;
    2595           0 :     temp1 = _mm256_mullo_epi32(u[0], cospi16);
    2596           0 :     x = _mm256_mullo_epi32(u[1], cospi48);
    2597           0 :     temp1 = _mm256_add_epi32(temp1, x);
    2598           0 :     temp1 = _mm256_add_epi32(temp1, rnding);
    2599           0 :     temp1 = _mm256_srai_epi32(temp1, bit);
    2600           0 :     u[4] = temp1;
    2601             : 
    2602           0 :     temp2 = _mm256_mullo_epi32(u[0], cospi48);
    2603           0 :     x = _mm256_mullo_epi32(u[1], cospi16);
    2604           0 :     u[5] = _mm256_sub_epi32(temp2, x);
    2605           0 :     u[5] = _mm256_add_epi32(u[5], rnding);
    2606           0 :     u[5] = _mm256_srai_epi32(u[5], bit);
    2607             : 
    2608             :     // stage 5
    2609             :     // stage 6
    2610           0 :     temp1 = _mm256_mullo_epi32(u[0], cospi32);
    2611           0 :     x = _mm256_mullo_epi32(u[1], cospi32);
    2612           0 :     u[2] = _mm256_add_epi32(temp1, x);
    2613           0 :     u[2] = _mm256_add_epi32(u[2], rnding);
    2614           0 :     u[2] = _mm256_srai_epi32(u[2], bit);
    2615             : 
    2616           0 :     u[3] = _mm256_sub_epi32(temp1, x);
    2617           0 :     u[3] = _mm256_add_epi32(u[3], rnding);
    2618           0 :     u[3] = _mm256_srai_epi32(u[3], bit);
    2619             : 
    2620           0 :     temp1 = _mm256_mullo_epi32(u[4], cospi32);
    2621           0 :     x = _mm256_mullo_epi32(u[5], cospi32);
    2622           0 :     u[6] = _mm256_add_epi32(temp1, x);
    2623           0 :     u[6] = _mm256_add_epi32(u[6], rnding);
    2624           0 :     u[6] = _mm256_srai_epi32(u[6], bit);
    2625             : 
    2626           0 :     u[7] = _mm256_sub_epi32(temp1, x);
    2627           0 :     u[7] = _mm256_add_epi32(u[7], rnding);
    2628           0 :     u[7] = _mm256_srai_epi32(u[7], bit);
    2629             : 
    2630             :     // stage 7
    2631           0 :     if (do_cols) {
    2632           0 :         out[0] = u[0];
    2633           0 :         out[1] = _mm256_sub_epi32(kZero, u[4]);
    2634           0 :         out[2] = u[6];
    2635           0 :         out[3] = _mm256_sub_epi32(kZero, u[2]);
    2636           0 :         out[4] = u[3];
    2637           0 :         out[5] = _mm256_sub_epi32(kZero, u[7]);
    2638           0 :         out[6] = u[5];
    2639           0 :         out[7] = _mm256_sub_epi32(kZero, u[1]);
    2640             :     }
    2641             :     else {
    2642           0 :         const int32_t log_range_out = AOMMAX(16, bd + 6);
    2643           0 :         const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    2644           0 :         const __m256i clamp_hi_out =
    2645           0 :             _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
    2646             : 
    2647           0 :         neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
    2648             :             out_shift);
    2649           0 :         neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
    2650             :             out_shift);
    2651           0 :         neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
    2652             :             out_shift);
    2653           0 :         neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
    2654             :             out_shift);
    2655             :     }
    2656           0 : }
    2657             : 
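// iadst8x8_avx2: full 8-point inverse ADST. Stages 2/4/6 apply the cosine
// rotations with rounding by `rnding` and a right shift by `bit`; stages 3/5
// are clamped add/sub butterflies; stage 7 negates alternate outputs, and
// when do_cols is clear the results are additionally shifted by out_shift and
// clamped via neg_shift_avx2.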
    2658           0 : static void iadst8x8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    2659             :     int32_t bd, int32_t out_shift) {
    2660           0 :     const int32_t *cospi = cospi_arr(bit);
    2661           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    2662           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    2663           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    2664           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    2665           0 :     const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
    2666           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    2667           0 :     const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
    2668           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    2669           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    2670           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    2671           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    2672           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    2673           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    2674           0 :     const __m256i kZero = _mm256_setzero_si256();
    2675           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    2676           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    2677           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    2678             :     __m256i u[8], v[8], x;
    2679             : 
    2680             :     // stage 0
    2681             :     // stage 1
    2682             :     // stage 2
    2683             : 
    2684           0 :     u[0] = _mm256_mullo_epi32(in[7], cospi4);
    2685           0 :     x = _mm256_mullo_epi32(in[0], cospi60);
    2686           0 :     u[0] = _mm256_add_epi32(u[0], x);
    2687           0 :     u[0] = _mm256_add_epi32(u[0], rnding);
    2688           0 :     u[0] = _mm256_srai_epi32(u[0], bit);
    2689             : 
    2690           0 :     u[1] = _mm256_mullo_epi32(in[7], cospi60);
    2691           0 :     x = _mm256_mullo_epi32(in[0], cospi4);
    2692           0 :     u[1] = _mm256_sub_epi32(u[1], x);
    2693           0 :     u[1] = _mm256_add_epi32(u[1], rnding);
    2694           0 :     u[1] = _mm256_srai_epi32(u[1], bit);
    2695             : 
    2696           0 :     u[2] = _mm256_mullo_epi32(in[5], cospi20);
    2697           0 :     x = _mm256_mullo_epi32(in[2], cospi44);
    2698           0 :     u[2] = _mm256_add_epi32(u[2], x);
    2699           0 :     u[2] = _mm256_add_epi32(u[2], rnding);
    2700           0 :     u[2] = _mm256_srai_epi32(u[2], bit);
    2701             : 
    2702           0 :     u[3] = _mm256_mullo_epi32(in[5], cospi44);
    2703           0 :     x = _mm256_mullo_epi32(in[2], cospi20);
    2704           0 :     u[3] = _mm256_sub_epi32(u[3], x);
    2705           0 :     u[3] = _mm256_add_epi32(u[3], rnding);
    2706           0 :     u[3] = _mm256_srai_epi32(u[3], bit);
    2707             : 
    2708           0 :     u[4] = _mm256_mullo_epi32(in[3], cospi36);
    2709           0 :     x = _mm256_mullo_epi32(in[4], cospi28);
    2710           0 :     u[4] = _mm256_add_epi32(u[4], x);
    2711           0 :     u[4] = _mm256_add_epi32(u[4], rnding);
    2712           0 :     u[4] = _mm256_srai_epi32(u[4], bit);
    2713             : 
    2714           0 :     u[5] = _mm256_mullo_epi32(in[3], cospi28);
    2715           0 :     x = _mm256_mullo_epi32(in[4], cospi36);
    2716           0 :     u[5] = _mm256_sub_epi32(u[5], x);
    2717           0 :     u[5] = _mm256_add_epi32(u[5], rnding);
    2718           0 :     u[5] = _mm256_srai_epi32(u[5], bit);
    2719             : 
    2720           0 :     u[6] = _mm256_mullo_epi32(in[1], cospi52);
    2721           0 :     x = _mm256_mullo_epi32(in[6], cospi12);
    2722           0 :     u[6] = _mm256_add_epi32(u[6], x);
    2723           0 :     u[6] = _mm256_add_epi32(u[6], rnding);
    2724           0 :     u[6] = _mm256_srai_epi32(u[6], bit);
    2725             : 
    2726           0 :     u[7] = _mm256_mullo_epi32(in[1], cospi12);
    2727           0 :     x = _mm256_mullo_epi32(in[6], cospi52);
    2728           0 :     u[7] = _mm256_sub_epi32(u[7], x);
    2729           0 :     u[7] = _mm256_add_epi32(u[7], rnding);
    2730           0 :     u[7] = _mm256_srai_epi32(u[7], bit);
    2731             : 
    2732             :     // stage 3
    2733           0 :     addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
    2734           0 :     addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
    2735           0 :     addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
    2736           0 :     addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
    2737             : 
    2738             :     // stage 4
    2739           0 :     u[0] = v[0];
    2740           0 :     u[1] = v[1];
    2741           0 :     u[2] = v[2];
    2742           0 :     u[3] = v[3];
    2743             : 
    2744           0 :     u[4] = _mm256_mullo_epi32(v[4], cospi16);
    2745           0 :     x = _mm256_mullo_epi32(v[5], cospi48);
    2746           0 :     u[4] = _mm256_add_epi32(u[4], x);
    2747           0 :     u[4] = _mm256_add_epi32(u[4], rnding);
    2748           0 :     u[4] = _mm256_srai_epi32(u[4], bit);
    2749             : 
    2750           0 :     u[5] = _mm256_mullo_epi32(v[4], cospi48);
    2751           0 :     x = _mm256_mullo_epi32(v[5], cospi16);
    2752           0 :     u[5] = _mm256_sub_epi32(u[5], x);
    2753           0 :     u[5] = _mm256_add_epi32(u[5], rnding);
    2754           0 :     u[5] = _mm256_srai_epi32(u[5], bit);
    2755             : 
    2756           0 :     u[6] = _mm256_mullo_epi32(v[6], cospim48);
    2757           0 :     x = _mm256_mullo_epi32(v[7], cospi16);
    2758           0 :     u[6] = _mm256_add_epi32(u[6], x);
    2759           0 :     u[6] = _mm256_add_epi32(u[6], rnding);
    2760           0 :     u[6] = _mm256_srai_epi32(u[6], bit);
    2761             : 
    2762           0 :     u[7] = _mm256_mullo_epi32(v[6], cospi16);
    2763           0 :     x = _mm256_mullo_epi32(v[7], cospim48);
    2764           0 :     u[7] = _mm256_sub_epi32(u[7], x);
    2765           0 :     u[7] = _mm256_add_epi32(u[7], rnding);
    2766           0 :     u[7] = _mm256_srai_epi32(u[7], bit);
    2767             : 
    2768             :     // stage 5
    2769           0 :     addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
    2770           0 :     addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
    2771           0 :     addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
    2772           0 :     addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
    2773             : 
    2774             :     // stage 6
    2775           0 :     u[0] = v[0];
    2776           0 :     u[1] = v[1];
    2777           0 :     u[4] = v[4];
    2778           0 :     u[5] = v[5];
    2779             : 
    2780           0 :     v[0] = _mm256_mullo_epi32(v[2], cospi32);
    2781           0 :     x = _mm256_mullo_epi32(v[3], cospi32);
    2782           0 :     u[2] = _mm256_add_epi32(v[0], x);
    2783           0 :     u[2] = _mm256_add_epi32(u[2], rnding);
    2784           0 :     u[2] = _mm256_srai_epi32(u[2], bit);
    2785             : 
    2786           0 :     u[3] = _mm256_sub_epi32(v[0], x);
    2787           0 :     u[3] = _mm256_add_epi32(u[3], rnding);
    2788           0 :     u[3] = _mm256_srai_epi32(u[3], bit);
    2789             : 
    2790           0 :     v[0] = _mm256_mullo_epi32(v[6], cospi32);
    2791           0 :     x = _mm256_mullo_epi32(v[7], cospi32);
    2792           0 :     u[6] = _mm256_add_epi32(v[0], x);
    2793           0 :     u[6] = _mm256_add_epi32(u[6], rnding);
    2794           0 :     u[6] = _mm256_srai_epi32(u[6], bit);
    2795             : 
    2796           0 :     u[7] = _mm256_sub_epi32(v[0], x);
    2797           0 :     u[7] = _mm256_add_epi32(u[7], rnding);
    2798           0 :     u[7] = _mm256_srai_epi32(u[7], bit);
    2799             : 
    2800             :     // stage 7
    2801           0 :     if (do_cols) {
    2802           0 :         out[0] = u[0];
    2803           0 :         out[1] = _mm256_sub_epi32(kZero, u[4]);
    2804           0 :         out[2] = u[6];
    2805           0 :         out[3] = _mm256_sub_epi32(kZero, u[2]);
    2806           0 :         out[4] = u[3];
    2807           0 :         out[5] = _mm256_sub_epi32(kZero, u[7]);
    2808           0 :         out[6] = u[5];
    2809           0 :         out[7] = _mm256_sub_epi32(kZero, u[1]);
    2810             :     }
    2811             :     else {
    2812           0 :         const int32_t log_range_out = AOMMAX(16, bd + 6);
    2813           0 :         const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    2814           0 :         const __m256i clamp_hi_out =
    2815           0 :             _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
    2816             : 
    2817           0 :         neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
    2818             :             out_shift);
    2819           0 :         neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
    2820             :             out_shift);
    2821           0 :         neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
    2822             :             out_shift);
    2823           0 :         neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
    2824             :             out_shift);
    2825             :     }
    2826           0 : }
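// highbd_clamp_epi32_avx2: clamp `size` vectors of 32-bit values into
// [*clamp_lo, *clamp_hi]; the loop is unrolled by four, so `size` is expected
// to be a multiple of 4.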
    2827           0 : static void highbd_clamp_epi32_avx2(const __m256i *in, __m256i *out,
    2828             :     const __m256i *clamp_lo,
    2829             :     const __m256i *clamp_hi, int32_t size) {
    2830             :     __m256i a0, a1;
    2831           0 :     for (int32_t i = 0; i < size; i += 4) {
    2832           0 :         a0 = _mm256_max_epi32(in[i], *clamp_lo);
    2833           0 :         out[i] = _mm256_min_epi32(a0, *clamp_hi);
    2834             : 
    2835           0 :         a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
    2836           0 :         out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
    2837             : 
    2838           0 :         a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
    2839           0 :         out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
    2840             : 
    2841           0 :         a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
    2842           0 :         out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
    2843             :     }
    2844           0 : }
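// shift_avx2: add the rounding offset ((1 << shift) >> 1), arithmetic-shift
// right by `shift`, then clamp each of `size` vectors (again unrolled by
// four).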
    2845           0 : static void shift_avx2(const __m256i *in, __m256i *out,
    2846             :     const __m256i *clamp_lo, const __m256i *clamp_hi,
    2847             :     int32_t shift, int32_t size) {
    2848           0 :     __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
    2849           0 :     __m128i shift_vec = _mm_cvtsi32_si128(shift);
    2850             :     __m256i a0, a1;
    2851           0 :     for (int32_t i = 0; i < size; i += 4) {
    2852           0 :         a0 = _mm256_add_epi32(in[i], offset);
    2853           0 :         a1 = _mm256_add_epi32(in[i + 1], offset);
    2854           0 :         a0 = _mm256_sra_epi32(a0, shift_vec);
    2855           0 :         a1 = _mm256_sra_epi32(a1, shift_vec);
    2856           0 :         a0 = _mm256_max_epi32(a0, *clamp_lo);
    2857           0 :         a1 = _mm256_max_epi32(a1, *clamp_lo);
    2858           0 :         out[i] = _mm256_min_epi32(a0, *clamp_hi);
    2859           0 :         out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
    2860             : 
    2861           0 :         a0 = _mm256_add_epi32(in[i + 2], offset);
    2862           0 :         a1 = _mm256_add_epi32(in[i + 3], offset);
    2863           0 :         a0 = _mm256_sra_epi32(a0, shift_vec);
    2864           0 :         a1 = _mm256_sra_epi32(a1, shift_vec);
    2865           0 :         a0 = _mm256_max_epi32(a0, *clamp_lo);
    2866           0 :         a1 = _mm256_max_epi32(a1, *clamp_lo);
    2867           0 :         out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
    2868           0 :         out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
    2869             :     }
    2870           0 : }
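// iidentity8_avx2: 8-point identity transform; each input is doubled as
// in[i] + in[i], followed by a plain clamp for the column pass or a
// shift-and-clamp when out_shift applies.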
    2871           0 : static void iidentity8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    2872             :     int32_t bd, int32_t out_shift) {
    2873             :     (void)bit;
    2874           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    2875           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    2876           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    2877             :     __m256i v[8];
    2878           0 :     v[0] = _mm256_add_epi32(in[0], in[0]);
    2879           0 :     v[1] = _mm256_add_epi32(in[1], in[1]);
    2880           0 :     v[2] = _mm256_add_epi32(in[2], in[2]);
    2881           0 :     v[3] = _mm256_add_epi32(in[3], in[3]);
    2882           0 :     v[4] = _mm256_add_epi32(in[4], in[4]);
    2883           0 :     v[5] = _mm256_add_epi32(in[5], in[5]);
    2884           0 :     v[6] = _mm256_add_epi32(in[6], in[6]);
    2885           0 :     v[7] = _mm256_add_epi32(in[7], in[7]);
    2886             : 
    2887           0 :     if (!do_cols) {
    2888           0 :         const int32_t log_range_out = AOMMAX(16, bd + 6);
    2889           0 :         const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    2890             :             -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    2891           0 :         const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    2892             :             (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    2893             : 
    2894           0 :         shift_avx2(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 8);
    2895             :     }
    2896             :     else
    2897           0 :         highbd_clamp_epi32_avx2(v, out, &clamp_lo, &clamp_hi, 8);
    2898           0 : }
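// idct16_low1_avx2: 16-point inverse DCT when only the DC coefficient in[0]
// is non-zero; the transform collapses to a single cospi32 scaling whose
// result is broadcast to all 16 outputs.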
    2899           0 : static void idct16_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    2900             :     int32_t bd, int32_t out_shift) {
    2901           0 :     const int32_t *cospi = cospi_arr(bit);
    2902           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    2903           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    2904           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    2905           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    2906           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    2907             : 
    2908             :     {
    2909             :         // stage 0
    2910             :         // stage 1
    2911             :         // stage 2
    2912             :         // stage 3
    2913             :         // stage 4
    2914           0 :         in[0] = _mm256_mullo_epi32(in[0], cospi32);
    2915           0 :         in[0] = _mm256_add_epi32(in[0], rnding);
    2916           0 :         in[0] = _mm256_srai_epi32(in[0], bit);
    2917             : 
    2918             :         // stage 5
    2919             :         // stage 6
    2920             :         // stage 7
    2921           0 :         if (do_cols) {
    2922           0 :             in[0] = _mm256_max_epi32(in[0], clamp_lo);
    2923           0 :             in[0] = _mm256_min_epi32(in[0], clamp_hi);
    2924             :         }
    2925             :         else {
    2926           0 :             const int32_t log_range_out = AOMMAX(16, bd + 6);
    2927           0 :             const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    2928             :                 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    2929           0 :             const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    2930             :                 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    2931           0 :             __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
    2932           0 :             in[0] = _mm256_add_epi32(in[0], offset);
    2933           0 :             in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
    2934           0 :             in[0] = _mm256_max_epi32(in[0], clamp_lo_out);
    2935           0 :             in[0] = _mm256_min_epi32(in[0], clamp_hi_out);
    2936             :         }
    2937             : 
    2938           0 :         out[0] = in[0];
    2939           0 :         out[1] = in[0];
    2940           0 :         out[2] = in[0];
    2941           0 :         out[3] = in[0];
    2942           0 :         out[4] = in[0];
    2943           0 :         out[5] = in[0];
    2944           0 :         out[6] = in[0];
    2945           0 :         out[7] = in[0];
    2946           0 :         out[8] = in[0];
    2947           0 :         out[9] = in[0];
    2948           0 :         out[10] = in[0];
    2949           0 :         out[11] = in[0];
    2950           0 :         out[12] = in[0];
    2951           0 :         out[13] = in[0];
    2952           0 :         out[14] = in[0];
    2953           0 :         out[15] = in[0];
    2954             :     }
    2955           0 : }
    2956             : 
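// idct16_low8_avx2: 16-point inverse DCT assuming only in[0..7] are non-zero.
// The inputs are placed into the even butterfly slots of the usual idct16
// ordering, so the stage-2/3 rotations reduce to single-input
// half_btf_0_avx2 scalings.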
    2957           0 : static void idct16_low8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    2958             :     int32_t bd, int32_t out_shift) {
    2959           0 :     const int32_t *cospi = cospi_arr(bit);
    2960           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    2961           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    2962           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    2963           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    2964           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    2965           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    2966           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    2967           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    2968           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    2969           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    2970           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    2971           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    2972           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    2973           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    2974           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    2975           0 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
    2976           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    2977           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    2978           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    2979           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    2980           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    2981             :     __m256i u[16], x, y;
    2982             : 
    2983             :     {
    2984             :         // stage 0
    2985             :         // stage 1
    2986           0 :         u[0] = in[0];
    2987           0 :         u[2] = in[4];
    2988           0 :         u[4] = in[2];
    2989           0 :         u[6] = in[6];
    2990           0 :         u[8] = in[1];
    2991           0 :         u[10] = in[5];
    2992           0 :         u[12] = in[3];
    2993           0 :         u[14] = in[7];
    2994             : 
    2995             :         // stage 2
    2996           0 :         u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
    2997           0 :         u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
    2998             : 
    2999           0 :         u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
    3000           0 :         u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
    3001             : 
    3002           0 :         u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
    3003           0 :         u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
    3004             : 
    3005           0 :         u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
    3006           0 :         u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
    3007             : 
    3008             :         // stage 3
    3009           0 :         u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
    3010           0 :         u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
    3011           0 :         u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
    3012           0 :         u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
    3013             : 
    3014           0 :         addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
    3015           0 :         addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
    3016           0 :         addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
    3017           0 :         addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
    3018             : 
    3019             :         // stage 4
    3020           0 :         x = _mm256_mullo_epi32(u[0], cospi32);
    3021           0 :         u[0] = _mm256_add_epi32(x, rnding);
    3022           0 :         u[0] = _mm256_srai_epi32(u[0], bit);
    3023           0 :         u[1] = u[0];
    3024             : 
    3025           0 :         u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
    3026           0 :         u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
    3027             : 
    3028           0 :         addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
    3029           0 :         addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
    3030             : 
    3031           0 :         x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    3032           0 :         u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    3033           0 :         u[9] = x;
    3034           0 :         y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    3035           0 :         u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    3036           0 :         u[10] = y;
    3037             : 
    3038             :         // stage 5
    3039           0 :         addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    3040           0 :         addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
    3041             : 
    3042           0 :         x = _mm256_mullo_epi32(u[5], cospi32);
    3043           0 :         y = _mm256_mullo_epi32(u[6], cospi32);
    3044           0 :         u[5] = _mm256_sub_epi32(y, x);
    3045           0 :         u[5] = _mm256_add_epi32(u[5], rnding);
    3046           0 :         u[5] = _mm256_srai_epi32(u[5], bit);
    3047             : 
    3048           0 :         u[6] = _mm256_add_epi32(y, x);
    3049           0 :         u[6] = _mm256_add_epi32(u[6], rnding);
    3050           0 :         u[6] = _mm256_srai_epi32(u[6], bit);
    3051             : 
    3052           0 :         addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    3053           0 :         addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    3054           0 :         addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    3055           0 :         addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
    3056             : 
    3057             :         // stage 6
    3058           0 :         addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
    3059           0 :         addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
    3060           0 :         addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
    3061           0 :         addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
    3062             : 
    3063           0 :         x = _mm256_mullo_epi32(u[10], cospi32);
    3064           0 :         y = _mm256_mullo_epi32(u[13], cospi32);
    3065           0 :         u[10] = _mm256_sub_epi32(y, x);
    3066           0 :         u[10] = _mm256_add_epi32(u[10], rnding);
    3067           0 :         u[10] = _mm256_srai_epi32(u[10], bit);
    3068             : 
    3069           0 :         u[13] = _mm256_add_epi32(x, y);
    3070           0 :         u[13] = _mm256_add_epi32(u[13], rnding);
    3071           0 :         u[13] = _mm256_srai_epi32(u[13], bit);
    3072             : 
    3073           0 :         x = _mm256_mullo_epi32(u[11], cospi32);
    3074           0 :         y = _mm256_mullo_epi32(u[12], cospi32);
    3075           0 :         u[11] = _mm256_sub_epi32(y, x);
    3076           0 :         u[11] = _mm256_add_epi32(u[11], rnding);
    3077           0 :         u[11] = _mm256_srai_epi32(u[11], bit);
    3078             : 
    3079           0 :         u[12] = _mm256_add_epi32(x, y);
    3080           0 :         u[12] = _mm256_add_epi32(u[12], rnding);
    3081           0 :         u[12] = _mm256_srai_epi32(u[12], bit);
    3082             :         // stage 7
    3083           0 :         if (do_cols) {
    3084           0 :             addsub_no_clamp_avx2(u[0], u[15], out + 0, out + 15);
    3085           0 :             addsub_no_clamp_avx2(u[1], u[14], out + 1, out + 14);
    3086           0 :             addsub_no_clamp_avx2(u[2], u[13], out + 2, out + 13);
    3087           0 :             addsub_no_clamp_avx2(u[3], u[12], out + 3, out + 12);
    3088           0 :             addsub_no_clamp_avx2(u[4], u[11], out + 4, out + 11);
    3089           0 :             addsub_no_clamp_avx2(u[5], u[10], out + 5, out + 10);
    3090           0 :             addsub_no_clamp_avx2(u[6], u[9], out + 6, out + 9);
    3091           0 :             addsub_no_clamp_avx2(u[7], u[8], out + 7, out + 8);
    3092             :         }
    3093             :         else {
    3094           0 :             const int32_t log_range_out = AOMMAX(16, bd + 6);
    3095           0 :             const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    3096             :                 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    3097           0 :             const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    3098             :                 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    3099             : 
    3100           0 :             addsub_shift_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
    3101             :                 &clamp_hi_out, out_shift);
    3102           0 :             addsub_shift_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
    3103             :                 &clamp_hi_out, out_shift);
    3104           0 :             addsub_shift_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
    3105             :                 &clamp_hi_out, out_shift);
    3106           0 :             addsub_shift_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
    3107             :                 &clamp_hi_out, out_shift);
    3108           0 :             addsub_shift_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
    3109             :                 &clamp_hi_out, out_shift);
    3110           0 :             addsub_shift_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
    3111             :                 &clamp_hi_out, out_shift);
    3112           0 :             addsub_shift_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
    3113             :                 &clamp_hi_out, out_shift);
    3114           0 :             addsub_shift_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
    3115             :                 &clamp_hi_out, out_shift);
    3116             :         }
    3117             :     }
    3118           0 : }
    3119             : 
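// idct16_avx2: full 16-point inverse DCT. Inputs are reordered into butterfly
// order (u[0]=in[0], u[1]=in[8], u[2]=in[4], ...), then seven stages
// alternate half_btf_avx2 rotations with clamped add/sub pairs before the
// final do_cols / shifted-output split.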
    3120           0 : static void idct16_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols, int32_t bd,
    3121             :     int32_t out_shift) {
    3122           0 :     const int32_t *cospi = cospi_arr(bit);
    3123           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    3124           0 :     const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
    3125           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    3126           0 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
    3127           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    3128           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    3129           0 :     const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
    3130           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    3131           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    3132           0 :     const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
    3133           0 :     const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
    3134           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    3135           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    3136           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    3137           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    3138           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    3139           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    3140           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    3141           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    3142           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    3143           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    3144           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    3145           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    3146           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    3147           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    3148           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    3149           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    3150             :     __m256i u[16], v[16], x, y;
    3151             : 
    3152             :     {
    3153             :         // stage 0
    3154             :         // stage 1
    3155           0 :         u[0] = in[0];
    3156           0 :         u[1] = in[8];
    3157           0 :         u[2] = in[4];
    3158           0 :         u[3] = in[12];
    3159           0 :         u[4] = in[2];
    3160           0 :         u[5] = in[10];
    3161           0 :         u[6] = in[6];
    3162           0 :         u[7] = in[14];
    3163           0 :         u[8] = in[1];
    3164           0 :         u[9] = in[9];
    3165           0 :         u[10] = in[5];
    3166           0 :         u[11] = in[13];
    3167           0 :         u[12] = in[3];
    3168           0 :         u[13] = in[11];
    3169           0 :         u[14] = in[7];
    3170           0 :         u[15] = in[15];
    3171             : 
    3172             :         // stage 2
    3173           0 :         v[0] = u[0];
    3174           0 :         v[1] = u[1];
    3175           0 :         v[2] = u[2];
    3176           0 :         v[3] = u[3];
    3177           0 :         v[4] = u[4];
    3178           0 :         v[5] = u[5];
    3179           0 :         v[6] = u[6];
    3180           0 :         v[7] = u[7];
    3181             : 
    3182           0 :         v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
    3183           0 :         v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
    3184           0 :         v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
    3185           0 :         v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
    3186           0 :         v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
    3187           0 :         v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
    3188           0 :         v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
    3189           0 :         v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
    3190             : 
    3191             :         // stage 3
    3192           0 :         u[0] = v[0];
    3193           0 :         u[1] = v[1];
    3194           0 :         u[2] = v[2];
    3195           0 :         u[3] = v[3];
    3196           0 :         u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
    3197           0 :         u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
    3198           0 :         u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
    3199           0 :         u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
    3200           0 :         addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
    3201           0 :         addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
    3202           0 :         addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
    3203           0 :         addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
    3204             : 
    3205             :         // stage 4
    3206           0 :         x = _mm256_mullo_epi32(u[0], cospi32);
    3207           0 :         y = _mm256_mullo_epi32(u[1], cospi32);
    3208           0 :         v[0] = _mm256_add_epi32(x, y);
    3209           0 :         v[0] = _mm256_add_epi32(v[0], rnding);
    3210           0 :         v[0] = _mm256_srai_epi32(v[0], bit);
    3211             : 
    3212           0 :         v[1] = _mm256_sub_epi32(x, y);
    3213           0 :         v[1] = _mm256_add_epi32(v[1], rnding);
    3214           0 :         v[1] = _mm256_srai_epi32(v[1], bit);
    3215             : 
    3216           0 :         v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
    3217           0 :         v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
    3218           0 :         addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    3219           0 :         addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
    3220           0 :         v[8] = u[8];
    3221           0 :         v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    3222           0 :         v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    3223           0 :         v[11] = u[11];
    3224           0 :         v[12] = u[12];
    3225           0 :         v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    3226           0 :         v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    3227           0 :         v[15] = u[15];
    3228             : 
    3229             :         // stage 5
    3230           0 :         addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    3231           0 :         addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
    3232           0 :         u[4] = v[4];
    3233             : 
    3234           0 :         x = _mm256_mullo_epi32(v[5], cospi32);
    3235           0 :         y = _mm256_mullo_epi32(v[6], cospi32);
    3236           0 :         u[5] = _mm256_sub_epi32(y, x);
    3237           0 :         u[5] = _mm256_add_epi32(u[5], rnding);
    3238           0 :         u[5] = _mm256_srai_epi32(u[5], bit);
    3239             : 
    3240           0 :         u[6] = _mm256_add_epi32(y, x);
    3241           0 :         u[6] = _mm256_add_epi32(u[6], rnding);
    3242           0 :         u[6] = _mm256_srai_epi32(u[6], bit);
    3243             : 
    3244           0 :         u[7] = v[7];
    3245           0 :         addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    3246           0 :         addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    3247           0 :         addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    3248           0 :         addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
    3249             : 
    3250             :         // stage 6
    3251           0 :         addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
    3252           0 :         addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
    3253           0 :         addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
    3254           0 :         addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
    3255           0 :         v[8] = u[8];
    3256           0 :         v[9] = u[9];
    3257             : 
    3258           0 :         x = _mm256_mullo_epi32(u[10], cospi32);
    3259           0 :         y = _mm256_mullo_epi32(u[13], cospi32);
    3260           0 :         v[10] = _mm256_sub_epi32(y, x);
    3261           0 :         v[10] = _mm256_add_epi32(v[10], rnding);
    3262           0 :         v[10] = _mm256_srai_epi32(v[10], bit);
    3263             : 
    3264           0 :         v[13] = _mm256_add_epi32(x, y);
    3265           0 :         v[13] = _mm256_add_epi32(v[13], rnding);
    3266           0 :         v[13] = _mm256_srai_epi32(v[13], bit);
    3267             : 
    3268           0 :         x = _mm256_mullo_epi32(u[11], cospi32);
    3269           0 :         y = _mm256_mullo_epi32(u[12], cospi32);
    3270           0 :         v[11] = _mm256_sub_epi32(y, x);
    3271           0 :         v[11] = _mm256_add_epi32(v[11], rnding);
    3272           0 :         v[11] = _mm256_srai_epi32(v[11], bit);
    3273             : 
    3274           0 :         v[12] = _mm256_add_epi32(x, y);
    3275           0 :         v[12] = _mm256_add_epi32(v[12], rnding);
    3276           0 :         v[12] = _mm256_srai_epi32(v[12], bit);
    3277             : 
    3278           0 :         v[14] = u[14];
    3279           0 :         v[15] = u[15];
    3280             : 
    3281             :         // stage 7
    3282           0 :         if (do_cols) {
    3283           0 :             addsub_no_clamp_avx2(v[0], v[15], out + 0, out + 15);
    3284           0 :             addsub_no_clamp_avx2(v[1], v[14], out + 1, out + 14);
    3285           0 :             addsub_no_clamp_avx2(v[2], v[13], out + 2, out + 13);
    3286           0 :             addsub_no_clamp_avx2(v[3], v[12], out + 3, out + 12);
    3287           0 :             addsub_no_clamp_avx2(v[4], v[11], out + 4, out + 11);
    3288           0 :             addsub_no_clamp_avx2(v[5], v[10], out + 5, out + 10);
    3289           0 :             addsub_no_clamp_avx2(v[6], v[9], out + 6, out + 9);
    3290           0 :             addsub_no_clamp_avx2(v[7], v[8], out + 7, out + 8);
    3291             :         }
    3292             :         else {
    3293           0 :             const int32_t log_range_out = AOMMAX(16, bd + 6);
    3294           0 :             const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    3295             :                 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    3296           0 :             const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    3297             :                 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    3298             : 
    3299           0 :             addsub_shift_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
    3300             :                 &clamp_hi_out, out_shift);
    3301           0 :             addsub_shift_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
    3302             :                 &clamp_hi_out, out_shift);
    3303           0 :             addsub_shift_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
    3304             :                 &clamp_hi_out, out_shift);
    3305           0 :             addsub_shift_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
    3306             :                 &clamp_hi_out, out_shift);
    3307           0 :             addsub_shift_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
    3308             :                 &clamp_hi_out, out_shift);
    3309           0 :             addsub_shift_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
    3310             :                 &clamp_hi_out, out_shift);
    3311           0 :             addsub_shift_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
    3312             :                 &clamp_hi_out, out_shift);
    3313           0 :             addsub_shift_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
    3314             :                 &clamp_hi_out, out_shift);
    3315             :         }
    3316             :     }
    3317           0 : }
    3318             : 
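// iadst16_low1_avx2: 16-point inverse ADST where only in[0] is assumed
// non-zero; stage 2 produces just two values (v[0], v[1]) and later stages
// copy or rotate those, so no intermediate range clamping is performed.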
    3319           0 : static void iadst16_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    3320             :     int32_t bd, int32_t out_shift) {
    3321           0 :     const int32_t *cospi = cospi_arr(bit);
    3322           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    3323           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    3324           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    3325           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    3326           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    3327           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    3328           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    3329           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    3330           0 :     const __m256i zero = _mm256_setzero_si256();
    3331             :     __m256i v[16], x, y, temp1, temp2;
    3332             : 
    3333             :     // Calculate the eight columns held in each __m256i register
    3334             :     {
    3335             :         // stage 0
    3336             :         // stage 1
    3337             :         // stage 2
    3338           0 :         x = _mm256_mullo_epi32(in[0], cospi62);
    3339           0 :         v[0] = _mm256_add_epi32(x, rnding);
    3340           0 :         v[0] = _mm256_srai_epi32(v[0], bit);
    3341             : 
    3342           0 :         x = _mm256_mullo_epi32(in[0], cospi2);
    3343           0 :         v[1] = _mm256_sub_epi32(zero, x);
    3344           0 :         v[1] = _mm256_add_epi32(v[1], rnding);
    3345           0 :         v[1] = _mm256_srai_epi32(v[1], bit);
    3346             : 
    3347             :         // stage 3
    3348           0 :         v[8] = v[0];
    3349           0 :         v[9] = v[1];
    3350             : 
    3351             :         // stage 4
    3352           0 :         temp1 = _mm256_mullo_epi32(v[8], cospi8);
    3353           0 :         x = _mm256_mullo_epi32(v[9], cospi56);
    3354           0 :         temp1 = _mm256_add_epi32(temp1, x);
    3355           0 :         temp1 = _mm256_add_epi32(temp1, rnding);
    3356           0 :         temp1 = _mm256_srai_epi32(temp1, bit);
    3357             : 
    3358           0 :         temp2 = _mm256_mullo_epi32(v[8], cospi56);
    3359           0 :         x = _mm256_mullo_epi32(v[9], cospi8);
    3360           0 :         temp2 = _mm256_sub_epi32(temp2, x);
    3361           0 :         temp2 = _mm256_add_epi32(temp2, rnding);
    3362           0 :         temp2 = _mm256_srai_epi32(temp2, bit);
    3363           0 :         v[8] = temp1;
    3364           0 :         v[9] = temp2;
    3365             : 
    3366             :         // stage 5
    3367           0 :         v[4] = v[0];
    3368           0 :         v[5] = v[1];
    3369           0 :         v[12] = v[8];
    3370           0 :         v[13] = v[9];
    3371             : 
    3372             :         // stage 6
    3373           0 :         temp1 = _mm256_mullo_epi32(v[4], cospi16);
    3374           0 :         x = _mm256_mullo_epi32(v[5], cospi48);
    3375           0 :         temp1 = _mm256_add_epi32(temp1, x);
    3376           0 :         temp1 = _mm256_add_epi32(temp1, rnding);
    3377           0 :         temp1 = _mm256_srai_epi32(temp1, bit);
    3378             : 
    3379           0 :         temp2 = _mm256_mullo_epi32(v[4], cospi48);
    3380           0 :         x = _mm256_mullo_epi32(v[5], cospi16);
    3381           0 :         temp2 = _mm256_sub_epi32(temp2, x);
    3382           0 :         temp2 = _mm256_add_epi32(temp2, rnding);
    3383           0 :         temp2 = _mm256_srai_epi32(temp2, bit);
    3384           0 :         v[4] = temp1;
    3385           0 :         v[5] = temp2;
    3386             : 
    3387           0 :         temp1 = _mm256_mullo_epi32(v[12], cospi16);
    3388           0 :         x = _mm256_mullo_epi32(v[13], cospi48);
    3389           0 :         temp1 = _mm256_add_epi32(temp1, x);
    3390           0 :         temp1 = _mm256_add_epi32(temp1, rnding);
    3391           0 :         temp1 = _mm256_srai_epi32(temp1, bit);
    3392             : 
    3393           0 :         temp2 = _mm256_mullo_epi32(v[12], cospi48);
    3394           0 :         x = _mm256_mullo_epi32(v[13], cospi16);
    3395           0 :         temp2 = _mm256_sub_epi32(temp2, x);
    3396           0 :         temp2 = _mm256_add_epi32(temp2, rnding);
    3397           0 :         temp2 = _mm256_srai_epi32(temp2, bit);
    3398           0 :         v[12] = temp1;
    3399           0 :         v[13] = temp2;
    3400             : 
    3401             :         // stage 7
    3402           0 :         v[2] = v[0];
    3403           0 :         v[3] = v[1];
    3404           0 :         v[6] = v[4];
    3405           0 :         v[7] = v[5];
    3406           0 :         v[10] = v[8];
    3407           0 :         v[11] = v[9];
    3408           0 :         v[14] = v[12];
    3409           0 :         v[15] = v[13];
    3410             : 
    3411             :         // stage 8
    3412           0 :         y = _mm256_mullo_epi32(v[2], cospi32);
    3413           0 :         x = _mm256_mullo_epi32(v[3], cospi32);
    3414           0 :         v[2] = _mm256_add_epi32(y, x);
    3415           0 :         v[2] = _mm256_add_epi32(v[2], rnding);
    3416           0 :         v[2] = _mm256_srai_epi32(v[2], bit);
    3417             : 
    3418           0 :         v[3] = _mm256_sub_epi32(y, x);
    3419           0 :         v[3] = _mm256_add_epi32(v[3], rnding);
    3420           0 :         v[3] = _mm256_srai_epi32(v[3], bit);
    3421             : 
    3422           0 :         y = _mm256_mullo_epi32(v[6], cospi32);
    3423           0 :         x = _mm256_mullo_epi32(v[7], cospi32);
    3424           0 :         v[6] = _mm256_add_epi32(y, x);
    3425           0 :         v[6] = _mm256_add_epi32(v[6], rnding);
    3426           0 :         v[6] = _mm256_srai_epi32(v[6], bit);
    3427             : 
    3428           0 :         v[7] = _mm256_sub_epi32(y, x);
    3429           0 :         v[7] = _mm256_add_epi32(v[7], rnding);
    3430           0 :         v[7] = _mm256_srai_epi32(v[7], bit);
    3431             : 
    3432           0 :         y = _mm256_mullo_epi32(v[10], cospi32);
    3433           0 :         x = _mm256_mullo_epi32(v[11], cospi32);
    3434           0 :         v[10] = _mm256_add_epi32(y, x);
    3435           0 :         v[10] = _mm256_add_epi32(v[10], rnding);
    3436           0 :         v[10] = _mm256_srai_epi32(v[10], bit);
    3437             : 
    3438           0 :         v[11] = _mm256_sub_epi32(y, x);
    3439           0 :         v[11] = _mm256_add_epi32(v[11], rnding);
    3440           0 :         v[11] = _mm256_srai_epi32(v[11], bit);
    3441             : 
    3442           0 :         y = _mm256_mullo_epi32(v[14], cospi32);
    3443           0 :         x = _mm256_mullo_epi32(v[15], cospi32);
    3444           0 :         v[14] = _mm256_add_epi32(y, x);
    3445           0 :         v[14] = _mm256_add_epi32(v[14], rnding);
    3446           0 :         v[14] = _mm256_srai_epi32(v[14], bit);
    3447             : 
    3448           0 :         v[15] = _mm256_sub_epi32(y, x);
    3449           0 :         v[15] = _mm256_add_epi32(v[15], rnding);
    3450           0 :         v[15] = _mm256_srai_epi32(v[15], bit);
    3451             : 
    3452             :         // stage 9
    3453           0 :         if (do_cols) {
    3454           0 :             out[0] = v[0];
    3455           0 :             out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
    3456           0 :             out[2] = v[12];
    3457           0 :             out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
    3458           0 :             out[4] = v[6];
    3459           0 :             out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
    3460           0 :             out[6] = v[10];
    3461           0 :             out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
    3462           0 :             out[8] = v[3];
    3463           0 :             out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
    3464           0 :             out[10] = v[15];
    3465           0 :             out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
    3466           0 :             out[12] = v[5];
    3467           0 :             out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
    3468           0 :             out[14] = v[9];
    3469           0 :             out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
    3470             :         }
    3471             :         else {
    3472           0 :             const int32_t log_range_out = AOMMAX(16, bd + 6);
    3473           0 :             const __m256i clamp_lo_out =
    3474           0 :                 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    3475           0 :             const __m256i clamp_hi_out =
    3476           0 :                 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
    3477             : 
    3478           0 :             neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
    3479             :                 out_shift);
    3480           0 :             neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
    3481             :                 &clamp_hi_out, out_shift);
    3482           0 :             neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
    3483             :                 &clamp_hi_out, out_shift);
    3484           0 :             neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
    3485             :                 &clamp_hi_out, out_shift);
    3486           0 :             neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
    3487             :                 &clamp_hi_out, out_shift);
    3488           0 :             neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
    3489             :                 &clamp_hi_out, out_shift);
    3490           0 :             neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
    3491             :                 &clamp_hi_out, out_shift);
    3492           0 :             neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
    3493             :                 &clamp_hi_out, out_shift);
    3494             :         }
    3495             :     }
    3496           0 : }
    3497             : 
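// iadst16_low8_avx2: 16-point inverse ADST specialized for at most eight
// non-zero input coefficients; stage 2 builds each rotated pair from a single
// input coefficient instead of two.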
    3498           0 : static void iadst16_low8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    3499             :     int32_t bd, int32_t out_shift) {
    3500           0 :     const int32_t *cospi = cospi_arr(bit);
    3501           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    3502           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    3503           0 :     const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
    3504           0 :     const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
    3505           0 :     const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
    3506           0 :     const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
    3507           0 :     const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
    3508           0 :     const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
    3509           0 :     const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
    3510           0 :     const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
    3511           0 :     const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
    3512           0 :     const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
    3513           0 :     const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
    3514           0 :     const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
    3515           0 :     const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
    3516           0 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    3517           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    3518           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    3519           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    3520           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    3521           0 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    3522           0 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    3523           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    3524           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    3525           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    3526           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    3527           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    3528           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    3529           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    3530           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    3531             :     __m256i u[16], x, y;
    3532             : 
    3533             :     {
    3534             :         // stage 0
    3535             :         // stage 1
    3536             :         // stage 2
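                      :         // With only in[0..7] nonzero, each two-input butterfly of the full iadst16
                      :         // collapses to a single product; for u[1], u[3], u[5] and u[7] the surviving
                      :         // product enters with a minus sign, hence the (0 - x) form below.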
    3537           0 :         __m256i zero = _mm256_setzero_si256();
    3538           0 :         x = _mm256_mullo_epi32(in[0], cospi62);
    3539           0 :         u[0] = _mm256_add_epi32(x, rnding);
    3540           0 :         u[0] = _mm256_srai_epi32(u[0], bit);
    3541             : 
    3542           0 :         x = _mm256_mullo_epi32(in[0], cospi2);
    3543           0 :         u[1] = _mm256_sub_epi32(zero, x);
    3544           0 :         u[1] = _mm256_add_epi32(u[1], rnding);
    3545           0 :         u[1] = _mm256_srai_epi32(u[1], bit);
    3546             : 
    3547           0 :         x = _mm256_mullo_epi32(in[2], cospi54);
    3548           0 :         u[2] = _mm256_add_epi32(x, rnding);
    3549           0 :         u[2] = _mm256_srai_epi32(u[2], bit);
    3550             : 
    3551           0 :         x = _mm256_mullo_epi32(in[2], cospi10);
    3552           0 :         u[3] = _mm256_sub_epi32(zero, x);
    3553           0 :         u[3] = _mm256_add_epi32(u[3], rnding);
    3554           0 :         u[3] = _mm256_srai_epi32(u[3], bit);
    3555             : 
    3556           0 :         x = _mm256_mullo_epi32(in[4], cospi46);
    3557           0 :         u[4] = _mm256_add_epi32(x, rnding);
    3558           0 :         u[4] = _mm256_srai_epi32(u[4], bit);
    3559             : 
    3560           0 :         x = _mm256_mullo_epi32(in[4], cospi18);
    3561           0 :         u[5] = _mm256_sub_epi32(zero, x);
    3562           0 :         u[5] = _mm256_add_epi32(u[5], rnding);
    3563           0 :         u[5] = _mm256_srai_epi32(u[5], bit);
    3564             : 
    3565           0 :         x = _mm256_mullo_epi32(in[6], cospi38);
    3566           0 :         u[6] = _mm256_add_epi32(x, rnding);
    3567           0 :         u[6] = _mm256_srai_epi32(u[6], bit);
    3568             : 
    3569           0 :         x = _mm256_mullo_epi32(in[6], cospi26);
    3570           0 :         u[7] = _mm256_sub_epi32(zero, x);
    3571           0 :         u[7] = _mm256_add_epi32(u[7], rnding);
    3572           0 :         u[7] = _mm256_srai_epi32(u[7], bit);
    3573             : 
    3574           0 :         u[8] = _mm256_mullo_epi32(in[7], cospi34);
    3575           0 :         u[8] = _mm256_add_epi32(u[8], rnding);
    3576           0 :         u[8] = _mm256_srai_epi32(u[8], bit);
    3577             : 
    3578           0 :         u[9] = _mm256_mullo_epi32(in[7], cospi30);
    3579           0 :         u[9] = _mm256_add_epi32(u[9], rnding);
    3580           0 :         u[9] = _mm256_srai_epi32(u[9], bit);
    3581             : 
    3582           0 :         u[10] = _mm256_mullo_epi32(in[5], cospi42);
    3583           0 :         u[10] = _mm256_add_epi32(u[10], rnding);
    3584           0 :         u[10] = _mm256_srai_epi32(u[10], bit);
    3585             : 
    3586           0 :         u[11] = _mm256_mullo_epi32(in[5], cospi22);
    3587           0 :         u[11] = _mm256_add_epi32(u[11], rnding);
    3588           0 :         u[11] = _mm256_srai_epi32(u[11], bit);
    3589             : 
    3590           0 :         u[12] = _mm256_mullo_epi32(in[3], cospi50);
    3591           0 :         u[12] = _mm256_add_epi32(u[12], rnding);
    3592           0 :         u[12] = _mm256_srai_epi32(u[12], bit);
    3593             : 
    3594           0 :         u[13] = _mm256_mullo_epi32(in[3], cospi14);
    3595           0 :         u[13] = _mm256_add_epi32(u[13], rnding);
    3596           0 :         u[13] = _mm256_srai_epi32(u[13], bit);
    3597             : 
    3598           0 :         u[14] = _mm256_mullo_epi32(in[1], cospi58);
    3599           0 :         u[14] = _mm256_add_epi32(u[14], rnding);
    3600           0 :         u[14] = _mm256_srai_epi32(u[14], bit);
    3601             : 
    3602           0 :         u[15] = _mm256_mullo_epi32(in[1], cospi6);
    3603           0 :         u[15] = _mm256_add_epi32(u[15], rnding);
    3604           0 :         u[15] = _mm256_srai_epi32(u[15], bit);
    3605             : 
    3606             :         // stage 3
    3607           0 :         addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
    3608           0 :         addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
    3609           0 :         addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
    3610           0 :         addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
    3611           0 :         addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
    3612           0 :         addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
    3613           0 :         addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
    3614           0 :         addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
    3615             : 
    3616             :         // stage 4
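                      :         // Pairs (8,9), (10,11), (12,13), (14,15) are rotated with the cospi8/cospi56
                      :         // and cospi40/cospi24 weight pairs (with a negated leading weight for the last
                      :         // two pairs). Per lane this is the usual half-butterfly; a scalar sketch
                      :         // (names hypothetical):
                      :         //   out0 = (a * w0 + b * w1 + (1 << (bit - 1))) >> bit;
                      :         //   out1 = (a * w1 - b * w0 + (1 << (bit - 1))) >> bit;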
    3617           0 :         y = _mm256_mullo_epi32(u[8], cospi56);
    3618           0 :         x = _mm256_mullo_epi32(u[9], cospi56);
    3619           0 :         u[8] = _mm256_mullo_epi32(u[8], cospi8);
    3620           0 :         u[8] = _mm256_add_epi32(u[8], x);
    3621           0 :         u[8] = _mm256_add_epi32(u[8], rnding);
    3622           0 :         u[8] = _mm256_srai_epi32(u[8], bit);
    3623             : 
    3624           0 :         x = _mm256_mullo_epi32(u[9], cospi8);
    3625           0 :         u[9] = _mm256_sub_epi32(y, x);
    3626           0 :         u[9] = _mm256_add_epi32(u[9], rnding);
    3627           0 :         u[9] = _mm256_srai_epi32(u[9], bit);
    3628             : 
    3629           0 :         x = _mm256_mullo_epi32(u[11], cospi24);
    3630           0 :         y = _mm256_mullo_epi32(u[10], cospi24);
    3631           0 :         u[10] = _mm256_mullo_epi32(u[10], cospi40);
    3632           0 :         u[10] = _mm256_add_epi32(u[10], x);
    3633           0 :         u[10] = _mm256_add_epi32(u[10], rnding);
    3634           0 :         u[10] = _mm256_srai_epi32(u[10], bit);
    3635             : 
    3636           0 :         x = _mm256_mullo_epi32(u[11], cospi40);
    3637           0 :         u[11] = _mm256_sub_epi32(y, x);
    3638           0 :         u[11] = _mm256_add_epi32(u[11], rnding);
    3639           0 :         u[11] = _mm256_srai_epi32(u[11], bit);
    3640             : 
    3641           0 :         x = _mm256_mullo_epi32(u[13], cospi8);
    3642           0 :         y = _mm256_mullo_epi32(u[12], cospi8);
    3643           0 :         u[12] = _mm256_mullo_epi32(u[12], cospim56);
    3644           0 :         u[12] = _mm256_add_epi32(u[12], x);
    3645           0 :         u[12] = _mm256_add_epi32(u[12], rnding);
    3646           0 :         u[12] = _mm256_srai_epi32(u[12], bit);
    3647             : 
    3648           0 :         x = _mm256_mullo_epi32(u[13], cospim56);
    3649           0 :         u[13] = _mm256_sub_epi32(y, x);
    3650           0 :         u[13] = _mm256_add_epi32(u[13], rnding);
    3651           0 :         u[13] = _mm256_srai_epi32(u[13], bit);
    3652             : 
    3653           0 :         x = _mm256_mullo_epi32(u[15], cospi40);
    3654           0 :         y = _mm256_mullo_epi32(u[14], cospi40);
    3655           0 :         u[14] = _mm256_mullo_epi32(u[14], cospim24);
    3656           0 :         u[14] = _mm256_add_epi32(u[14], x);
    3657           0 :         u[14] = _mm256_add_epi32(u[14], rnding);
    3658           0 :         u[14] = _mm256_srai_epi32(u[14], bit);
    3659             : 
    3660           0 :         x = _mm256_mullo_epi32(u[15], cospim24);
    3661           0 :         u[15] = _mm256_sub_epi32(y, x);
    3662           0 :         u[15] = _mm256_add_epi32(u[15], rnding);
    3663           0 :         u[15] = _mm256_srai_epi32(u[15], bit);
    3664             : 
    3665             :         // stage 5
    3666           0 :         addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
    3667           0 :         addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
    3668           0 :         addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
    3669           0 :         addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
    3670           0 :         addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
    3671           0 :         addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
    3672           0 :         addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
    3673           0 :         addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
    3674             : 
    3675             :         // stage 6
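                      :         // Same half-butterfly pattern as stage 4, now on pairs (4,5), (6,7), (12,13)
                      :         // and (14,15) with the cospi16/cospi48 weights (and their negations).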
    3676           0 :         x = _mm256_mullo_epi32(u[5], cospi48);
    3677           0 :         y = _mm256_mullo_epi32(u[4], cospi48);
    3678           0 :         u[4] = _mm256_mullo_epi32(u[4], cospi16);
    3679           0 :         u[4] = _mm256_add_epi32(u[4], x);
    3680           0 :         u[4] = _mm256_add_epi32(u[4], rnding);
    3681           0 :         u[4] = _mm256_srai_epi32(u[4], bit);
    3682             : 
    3683           0 :         x = _mm256_mullo_epi32(u[5], cospi16);
    3684           0 :         u[5] = _mm256_sub_epi32(y, x);
    3685           0 :         u[5] = _mm256_add_epi32(u[5], rnding);
    3686           0 :         u[5] = _mm256_srai_epi32(u[5], bit);
    3687             : 
    3688           0 :         x = _mm256_mullo_epi32(u[7], cospi16);
    3689           0 :         y = _mm256_mullo_epi32(u[6], cospi16);
    3690           0 :         u[6] = _mm256_mullo_epi32(u[6], cospim48);
    3691           0 :         u[6] = _mm256_add_epi32(u[6], x);
    3692           0 :         u[6] = _mm256_add_epi32(u[6], rnding);
    3693           0 :         u[6] = _mm256_srai_epi32(u[6], bit);
    3694             : 
    3695           0 :         x = _mm256_mullo_epi32(u[7], cospim48);
    3696           0 :         u[7] = _mm256_sub_epi32(y, x);
    3697           0 :         u[7] = _mm256_add_epi32(u[7], rnding);
    3698           0 :         u[7] = _mm256_srai_epi32(u[7], bit);
    3699             : 
    3700           0 :         x = _mm256_mullo_epi32(u[13], cospi48);
    3701           0 :         y = _mm256_mullo_epi32(u[12], cospi48);
    3702           0 :         u[12] = _mm256_mullo_epi32(u[12], cospi16);
    3703           0 :         u[12] = _mm256_add_epi32(u[12], x);
    3704           0 :         u[12] = _mm256_add_epi32(u[12], rnding);
    3705           0 :         u[12] = _mm256_srai_epi32(u[12], bit);
    3706             : 
    3707           0 :         x = _mm256_mullo_epi32(u[13], cospi16);
    3708           0 :         u[13] = _mm256_sub_epi32(y, x);
    3709           0 :         u[13] = _mm256_add_epi32(u[13], rnding);
    3710           0 :         u[13] = _mm256_srai_epi32(u[13], bit);
    3711             : 
    3712           0 :         x = _mm256_mullo_epi32(u[15], cospi16);
    3713           0 :         y = _mm256_mullo_epi32(u[14], cospi16);
    3714           0 :         u[14] = _mm256_mullo_epi32(u[14], cospim48);
    3715           0 :         u[14] = _mm256_add_epi32(u[14], x);
    3716           0 :         u[14] = _mm256_add_epi32(u[14], rnding);
    3717           0 :         u[14] = _mm256_srai_epi32(u[14], bit);
    3718             : 
    3719           0 :         x = _mm256_mullo_epi32(u[15], cospim48);
    3720           0 :         u[15] = _mm256_sub_epi32(y, x);
    3721           0 :         u[15] = _mm256_add_epi32(u[15], rnding);
    3722           0 :         u[15] = _mm256_srai_epi32(u[15], bit);
    3723             : 
    3724             :         // stage 7
    3725           0 :         addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
    3726           0 :         addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
    3727           0 :         addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
    3728           0 :         addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
    3729           0 :         addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
    3730           0 :         addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
    3731           0 :         addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
    3732           0 :         addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
    3733             : 
    3734             :         // stage 8
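                      :         // Pairs (2,3), (6,7), (10,11) and (14,15) are combined with equal weights
                      :         // cospi32 (sum and difference), then rounded and shifted back down by 'bit'.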
    3735           0 :         y = _mm256_mullo_epi32(u[2], cospi32);
    3736           0 :         x = _mm256_mullo_epi32(u[3], cospi32);
    3737           0 :         u[2] = _mm256_add_epi32(y, x);
    3738           0 :         u[2] = _mm256_add_epi32(u[2], rnding);
    3739           0 :         u[2] = _mm256_srai_epi32(u[2], bit);
    3740             : 
    3741           0 :         u[3] = _mm256_sub_epi32(y, x);
    3742           0 :         u[3] = _mm256_add_epi32(u[3], rnding);
    3743           0 :         u[3] = _mm256_srai_epi32(u[3], bit);
    3744           0 :         y = _mm256_mullo_epi32(u[6], cospi32);
    3745           0 :         x = _mm256_mullo_epi32(u[7], cospi32);
    3746           0 :         u[6] = _mm256_add_epi32(y, x);
    3747           0 :         u[6] = _mm256_add_epi32(u[6], rnding);
    3748           0 :         u[6] = _mm256_srai_epi32(u[6], bit);
    3749             : 
    3750           0 :         u[7] = _mm256_sub_epi32(y, x);
    3751           0 :         u[7] = _mm256_add_epi32(u[7], rnding);
    3752           0 :         u[7] = _mm256_srai_epi32(u[7], bit);
    3753             : 
    3754           0 :         y = _mm256_mullo_epi32(u[10], cospi32);
    3755           0 :         x = _mm256_mullo_epi32(u[11], cospi32);
    3756           0 :         u[10] = _mm256_add_epi32(y, x);
    3757           0 :         u[10] = _mm256_add_epi32(u[10], rnding);
    3758           0 :         u[10] = _mm256_srai_epi32(u[10], bit);
    3759             : 
    3760           0 :         u[11] = _mm256_sub_epi32(y, x);
    3761           0 :         u[11] = _mm256_add_epi32(u[11], rnding);
    3762           0 :         u[11] = _mm256_srai_epi32(u[11], bit);
    3763             : 
    3764           0 :         y = _mm256_mullo_epi32(u[14], cospi32);
    3765           0 :         x = _mm256_mullo_epi32(u[15], cospi32);
    3766           0 :         u[14] = _mm256_add_epi32(y, x);
    3767           0 :         u[14] = _mm256_add_epi32(u[14], rnding);
    3768           0 :         u[14] = _mm256_srai_epi32(u[14], bit);
    3769             : 
    3770           0 :         u[15] = _mm256_sub_epi32(y, x);
    3771           0 :         u[15] = _mm256_add_epi32(u[15], rnding);
    3772           0 :         u[15] = _mm256_srai_epi32(u[15], bit);
    3773             : 
    3774             :         // stage 9
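                      :         // Column pass (do_cols): write the permuted, sign-alternated results directly.
                      :         // Row pass: neg_shift_avx2 negates the second input of each pair, applies the
                      :         // rounding shift by out_shift and clamps to the inter-pass output range.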
    3775           0 :         if (do_cols) {
    3776           0 :             out[0] = u[0];
    3777           0 :             out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
    3778           0 :             out[2] = u[12];
    3779           0 :             out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
    3780           0 :             out[4] = u[6];
    3781           0 :             out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
    3782           0 :             out[6] = u[10];
    3783           0 :             out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
    3784           0 :             out[8] = u[3];
    3785           0 :             out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
    3786           0 :             out[10] = u[15];
    3787           0 :             out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
    3788           0 :             out[12] = u[5];
    3789           0 :             out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
    3790           0 :             out[14] = u[9];
    3791           0 :             out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
    3792             :         }
    3793             :         else {
    3794           0 :             const int32_t log_range_out = AOMMAX(16, bd + 6);
    3795           0 :             const __m256i clamp_lo_out =
    3796           0 :                 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    3797           0 :             const __m256i clamp_hi_out =
    3798           0 :                 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
    3799             : 
    3800           0 :             neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
    3801             :                 out_shift);
    3802           0 :             neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
    3803             :                 &clamp_hi_out, out_shift);
    3804           0 :             neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
    3805             :                 &clamp_hi_out, out_shift);
    3806           0 :             neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
    3807             :                 &clamp_hi_out, out_shift);
    3808           0 :             neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
    3809             :                 &clamp_hi_out, out_shift);
    3810           0 :             neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
    3811             :                 &clamp_hi_out, out_shift);
    3812           0 :             neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
    3813             :                 &clamp_hi_out, out_shift);
    3814           0 :             neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
    3815             :                 &clamp_hi_out, out_shift);
    3816             :         }
    3817             :     }
    3818           0 : }
    3819             : 
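                      : /* iadst16_avx2: full 16-point inverse ADST; unlike the low8 variant above, all 16
                      :  * input coefficients contribute, so every stage uses complete two-input
                      :  * butterflies, ping-ponging between the u[] and v[] work arrays. */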
    3820           0 : static void iadst16_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    3821             :     int32_t bd, int32_t out_shift) {
    3822           0 :     const int32_t *cospi = cospi_arr(bit);
    3823           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    3824           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    3825           0 :     const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
    3826           0 :     const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
    3827           0 :     const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
    3828           0 :     const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
    3829           0 :     const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
    3830           0 :     const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
    3831           0 :     const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
    3832           0 :     const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
    3833           0 :     const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
    3834           0 :     const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
    3835           0 :     const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
    3836           0 :     const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
    3837           0 :     const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
    3838           0 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    3839           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    3840           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    3841           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    3842           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    3843           0 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    3844           0 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    3845           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    3846           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    3847           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    3848           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    3849           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    3850           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    3851           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    3852           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    3853             :     __m256i u[16], v[16], x, y;
    3854             : 
    3855             :     {
    3856             :         // stage 0
    3857             :         // stage 1
    3858             :         // stage 2
    3859           0 :         v[0] = _mm256_mullo_epi32(in[15], cospi2);
    3860           0 :         x = _mm256_mullo_epi32(in[0], cospi62);
    3861           0 :         v[0] = _mm256_add_epi32(v[0], x);
    3862           0 :         v[0] = _mm256_add_epi32(v[0], rnding);
    3863           0 :         v[0] = _mm256_srai_epi32(v[0], bit);
    3864             : 
    3865           0 :         v[1] = _mm256_mullo_epi32(in[15], cospi62);
    3866           0 :         x = _mm256_mullo_epi32(in[0], cospi2);
    3867           0 :         v[1] = _mm256_sub_epi32(v[1], x);
    3868           0 :         v[1] = _mm256_add_epi32(v[1], rnding);
    3869           0 :         v[1] = _mm256_srai_epi32(v[1], bit);
    3870             : 
    3871           0 :         v[2] = _mm256_mullo_epi32(in[13], cospi10);
    3872           0 :         x = _mm256_mullo_epi32(in[2], cospi54);
    3873           0 :         v[2] = _mm256_add_epi32(v[2], x);
    3874           0 :         v[2] = _mm256_add_epi32(v[2], rnding);
    3875           0 :         v[2] = _mm256_srai_epi32(v[2], bit);
    3876             : 
    3877           0 :         v[3] = _mm256_mullo_epi32(in[13], cospi54);
    3878           0 :         x = _mm256_mullo_epi32(in[2], cospi10);
    3879           0 :         v[3] = _mm256_sub_epi32(v[3], x);
    3880           0 :         v[3] = _mm256_add_epi32(v[3], rnding);
    3881           0 :         v[3] = _mm256_srai_epi32(v[3], bit);
    3882             : 
    3883           0 :         v[4] = _mm256_mullo_epi32(in[11], cospi18);
    3884           0 :         x = _mm256_mullo_epi32(in[4], cospi46);
    3885           0 :         v[4] = _mm256_add_epi32(v[4], x);
    3886           0 :         v[4] = _mm256_add_epi32(v[4], rnding);
    3887           0 :         v[4] = _mm256_srai_epi32(v[4], bit);
    3888             : 
    3889           0 :         v[5] = _mm256_mullo_epi32(in[11], cospi46);
    3890           0 :         x = _mm256_mullo_epi32(in[4], cospi18);
    3891           0 :         v[5] = _mm256_sub_epi32(v[5], x);
    3892           0 :         v[5] = _mm256_add_epi32(v[5], rnding);
    3893           0 :         v[5] = _mm256_srai_epi32(v[5], bit);
    3894             : 
    3895           0 :         v[6] = _mm256_mullo_epi32(in[9], cospi26);
    3896           0 :         x = _mm256_mullo_epi32(in[6], cospi38);
    3897           0 :         v[6] = _mm256_add_epi32(v[6], x);
    3898           0 :         v[6] = _mm256_add_epi32(v[6], rnding);
    3899           0 :         v[6] = _mm256_srai_epi32(v[6], bit);
    3900             : 
    3901           0 :         v[7] = _mm256_mullo_epi32(in[9], cospi38);
    3902           0 :         x = _mm256_mullo_epi32(in[6], cospi26);
    3903           0 :         v[7] = _mm256_sub_epi32(v[7], x);
    3904           0 :         v[7] = _mm256_add_epi32(v[7], rnding);
    3905           0 :         v[7] = _mm256_srai_epi32(v[7], bit);
    3906             : 
    3907           0 :         v[8] = _mm256_mullo_epi32(in[7], cospi34);
    3908           0 :         x = _mm256_mullo_epi32(in[8], cospi30);
    3909           0 :         v[8] = _mm256_add_epi32(v[8], x);
    3910           0 :         v[8] = _mm256_add_epi32(v[8], rnding);
    3911           0 :         v[8] = _mm256_srai_epi32(v[8], bit);
    3912             : 
    3913           0 :         v[9] = _mm256_mullo_epi32(in[7], cospi30);
    3914           0 :         x = _mm256_mullo_epi32(in[8], cospi34);
    3915           0 :         v[9] = _mm256_sub_epi32(v[9], x);
    3916           0 :         v[9] = _mm256_add_epi32(v[9], rnding);
    3917           0 :         v[9] = _mm256_srai_epi32(v[9], bit);
    3918             : 
    3919           0 :         v[10] = _mm256_mullo_epi32(in[5], cospi42);
    3920           0 :         x = _mm256_mullo_epi32(in[10], cospi22);
    3921           0 :         v[10] = _mm256_add_epi32(v[10], x);
    3922           0 :         v[10] = _mm256_add_epi32(v[10], rnding);
    3923           0 :         v[10] = _mm256_srai_epi32(v[10], bit);
    3924             : 
    3925           0 :         v[11] = _mm256_mullo_epi32(in[5], cospi22);
    3926           0 :         x = _mm256_mullo_epi32(in[10], cospi42);
    3927           0 :         v[11] = _mm256_sub_epi32(v[11], x);
    3928           0 :         v[11] = _mm256_add_epi32(v[11], rnding);
    3929           0 :         v[11] = _mm256_srai_epi32(v[11], bit);
    3930             : 
    3931           0 :         v[12] = _mm256_mullo_epi32(in[3], cospi50);
    3932           0 :         x = _mm256_mullo_epi32(in[12], cospi14);
    3933           0 :         v[12] = _mm256_add_epi32(v[12], x);
    3934           0 :         v[12] = _mm256_add_epi32(v[12], rnding);
    3935           0 :         v[12] = _mm256_srai_epi32(v[12], bit);
    3936             : 
    3937           0 :         v[13] = _mm256_mullo_epi32(in[3], cospi14);
    3938           0 :         x = _mm256_mullo_epi32(in[12], cospi50);
    3939           0 :         v[13] = _mm256_sub_epi32(v[13], x);
    3940           0 :         v[13] = _mm256_add_epi32(v[13], rnding);
    3941           0 :         v[13] = _mm256_srai_epi32(v[13], bit);
    3942             : 
    3943           0 :         v[14] = _mm256_mullo_epi32(in[1], cospi58);
    3944           0 :         x = _mm256_mullo_epi32(in[14], cospi6);
    3945           0 :         v[14] = _mm256_add_epi32(v[14], x);
    3946           0 :         v[14] = _mm256_add_epi32(v[14], rnding);
    3947           0 :         v[14] = _mm256_srai_epi32(v[14], bit);
    3948             : 
    3949           0 :         v[15] = _mm256_mullo_epi32(in[1], cospi6);
    3950           0 :         x = _mm256_mullo_epi32(in[14], cospi58);
    3951           0 :         v[15] = _mm256_sub_epi32(v[15], x);
    3952           0 :         v[15] = _mm256_add_epi32(v[15], rnding);
    3953           0 :         v[15] = _mm256_srai_epi32(v[15], bit);
    3954             : 
    3955             :         // stage 3
    3956           0 :         addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
    3957           0 :         addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
    3958           0 :         addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
    3959           0 :         addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
    3960           0 :         addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
    3961           0 :         addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
    3962           0 :         addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
    3963           0 :         addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
    3964             : 
    3965             :         // stage 4
    3966           0 :         v[0] = u[0];
    3967           0 :         v[1] = u[1];
    3968           0 :         v[2] = u[2];
    3969           0 :         v[3] = u[3];
    3970           0 :         v[4] = u[4];
    3971           0 :         v[5] = u[5];
    3972           0 :         v[6] = u[6];
    3973           0 :         v[7] = u[7];
    3974             : 
    3975           0 :         v[8] = _mm256_mullo_epi32(u[8], cospi8);
    3976           0 :         x = _mm256_mullo_epi32(u[9], cospi56);
    3977           0 :         v[8] = _mm256_add_epi32(v[8], x);
    3978           0 :         v[8] = _mm256_add_epi32(v[8], rnding);
    3979           0 :         v[8] = _mm256_srai_epi32(v[8], bit);
    3980             : 
    3981           0 :         v[9] = _mm256_mullo_epi32(u[8], cospi56);
    3982           0 :         x = _mm256_mullo_epi32(u[9], cospi8);
    3983           0 :         v[9] = _mm256_sub_epi32(v[9], x);
    3984           0 :         v[9] = _mm256_add_epi32(v[9], rnding);
    3985           0 :         v[9] = _mm256_srai_epi32(v[9], bit);
    3986             : 
    3987           0 :         v[10] = _mm256_mullo_epi32(u[10], cospi40);
    3988           0 :         x = _mm256_mullo_epi32(u[11], cospi24);
    3989           0 :         v[10] = _mm256_add_epi32(v[10], x);
    3990           0 :         v[10] = _mm256_add_epi32(v[10], rnding);
    3991           0 :         v[10] = _mm256_srai_epi32(v[10], bit);
    3992             : 
    3993           0 :         v[11] = _mm256_mullo_epi32(u[10], cospi24);
    3994           0 :         x = _mm256_mullo_epi32(u[11], cospi40);
    3995           0 :         v[11] = _mm256_sub_epi32(v[11], x);
    3996           0 :         v[11] = _mm256_add_epi32(v[11], rnding);
    3997           0 :         v[11] = _mm256_srai_epi32(v[11], bit);
    3998             : 
    3999           0 :         v[12] = _mm256_mullo_epi32(u[12], cospim56);
    4000           0 :         x = _mm256_mullo_epi32(u[13], cospi8);
    4001           0 :         v[12] = _mm256_add_epi32(v[12], x);
    4002           0 :         v[12] = _mm256_add_epi32(v[12], rnding);
    4003           0 :         v[12] = _mm256_srai_epi32(v[12], bit);
    4004             : 
    4005           0 :         v[13] = _mm256_mullo_epi32(u[12], cospi8);
    4006           0 :         x = _mm256_mullo_epi32(u[13], cospim56);
    4007           0 :         v[13] = _mm256_sub_epi32(v[13], x);
    4008           0 :         v[13] = _mm256_add_epi32(v[13], rnding);
    4009           0 :         v[13] = _mm256_srai_epi32(v[13], bit);
    4010             : 
    4011           0 :         v[14] = _mm256_mullo_epi32(u[14], cospim24);
    4012           0 :         x = _mm256_mullo_epi32(u[15], cospi40);
    4013           0 :         v[14] = _mm256_add_epi32(v[14], x);
    4014           0 :         v[14] = _mm256_add_epi32(v[14], rnding);
    4015           0 :         v[14] = _mm256_srai_epi32(v[14], bit);
    4016             : 
    4017           0 :         v[15] = _mm256_mullo_epi32(u[14], cospi40);
    4018           0 :         x = _mm256_mullo_epi32(u[15], cospim24);
    4019           0 :         v[15] = _mm256_sub_epi32(v[15], x);
    4020           0 :         v[15] = _mm256_add_epi32(v[15], rnding);
    4021           0 :         v[15] = _mm256_srai_epi32(v[15], bit);
    4022             : 
    4023             :         // stage 5
    4024           0 :         addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
    4025           0 :         addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
    4026           0 :         addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
    4027           0 :         addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
    4028           0 :         addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
    4029           0 :         addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
    4030           0 :         addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
    4031           0 :         addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
    4032             : 
    4033             :         // stage 6
    4034           0 :         v[0] = u[0];
    4035           0 :         v[1] = u[1];
    4036           0 :         v[2] = u[2];
    4037           0 :         v[3] = u[3];
    4038             : 
    4039           0 :         v[4] = _mm256_mullo_epi32(u[4], cospi16);
    4040           0 :         x = _mm256_mullo_epi32(u[5], cospi48);
    4041           0 :         v[4] = _mm256_add_epi32(v[4], x);
    4042           0 :         v[4] = _mm256_add_epi32(v[4], rnding);
    4043           0 :         v[4] = _mm256_srai_epi32(v[4], bit);
    4044             : 
    4045           0 :         v[5] = _mm256_mullo_epi32(u[4], cospi48);
    4046           0 :         x = _mm256_mullo_epi32(u[5], cospi16);
    4047           0 :         v[5] = _mm256_sub_epi32(v[5], x);
    4048           0 :         v[5] = _mm256_add_epi32(v[5], rnding);
    4049           0 :         v[5] = _mm256_srai_epi32(v[5], bit);
    4050             : 
    4051           0 :         v[6] = _mm256_mullo_epi32(u[6], cospim48);
    4052           0 :         x = _mm256_mullo_epi32(u[7], cospi16);
    4053           0 :         v[6] = _mm256_add_epi32(v[6], x);
    4054           0 :         v[6] = _mm256_add_epi32(v[6], rnding);
    4055           0 :         v[6] = _mm256_srai_epi32(v[6], bit);
    4056             : 
    4057           0 :         v[7] = _mm256_mullo_epi32(u[6], cospi16);
    4058           0 :         x = _mm256_mullo_epi32(u[7], cospim48);
    4059           0 :         v[7] = _mm256_sub_epi32(v[7], x);
    4060           0 :         v[7] = _mm256_add_epi32(v[7], rnding);
    4061           0 :         v[7] = _mm256_srai_epi32(v[7], bit);
    4062             : 
    4063           0 :         v[8] = u[8];
    4064           0 :         v[9] = u[9];
    4065           0 :         v[10] = u[10];
    4066           0 :         v[11] = u[11];
    4067             : 
    4068           0 :         v[12] = _mm256_mullo_epi32(u[12], cospi16);
    4069           0 :         x = _mm256_mullo_epi32(u[13], cospi48);
    4070           0 :         v[12] = _mm256_add_epi32(v[12], x);
    4071           0 :         v[12] = _mm256_add_epi32(v[12], rnding);
    4072           0 :         v[12] = _mm256_srai_epi32(v[12], bit);
    4073             : 
    4074           0 :         v[13] = _mm256_mullo_epi32(u[12], cospi48);
    4075           0 :         x = _mm256_mullo_epi32(u[13], cospi16);
    4076           0 :         v[13] = _mm256_sub_epi32(v[13], x);
    4077           0 :         v[13] = _mm256_add_epi32(v[13], rnding);
    4078           0 :         v[13] = _mm256_srai_epi32(v[13], bit);
    4079             : 
    4080           0 :         v[14] = _mm256_mullo_epi32(u[14], cospim48);
    4081           0 :         x = _mm256_mullo_epi32(u[15], cospi16);
    4082           0 :         v[14] = _mm256_add_epi32(v[14], x);
    4083           0 :         v[14] = _mm256_add_epi32(v[14], rnding);
    4084           0 :         v[14] = _mm256_srai_epi32(v[14], bit);
    4085             : 
    4086           0 :         v[15] = _mm256_mullo_epi32(u[14], cospi16);
    4087           0 :         x = _mm256_mullo_epi32(u[15], cospim48);
    4088           0 :         v[15] = _mm256_sub_epi32(v[15], x);
    4089           0 :         v[15] = _mm256_add_epi32(v[15], rnding);
    4090           0 :         v[15] = _mm256_srai_epi32(v[15], bit);
    4091             : 
    4092             :         // stage 7
    4093           0 :         addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
    4094           0 :         addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
    4095           0 :         addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
    4096           0 :         addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
    4097           0 :         addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
    4098           0 :         addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
    4099           0 :         addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
    4100           0 :         addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
    4101             : 
    4102             :         // stage 8
    4103           0 :         v[0] = u[0];
    4104           0 :         v[1] = u[1];
    4105             : 
    4106           0 :         y = _mm256_mullo_epi32(u[2], cospi32);
    4107           0 :         x = _mm256_mullo_epi32(u[3], cospi32);
    4108           0 :         v[2] = _mm256_add_epi32(y, x);
    4109           0 :         v[2] = _mm256_add_epi32(v[2], rnding);
    4110           0 :         v[2] = _mm256_srai_epi32(v[2], bit);
    4111             : 
    4112           0 :         v[3] = _mm256_sub_epi32(y, x);
    4113           0 :         v[3] = _mm256_add_epi32(v[3], rnding);
    4114           0 :         v[3] = _mm256_srai_epi32(v[3], bit);
    4115             : 
    4116           0 :         v[4] = u[4];
    4117           0 :         v[5] = u[5];
    4118             : 
    4119           0 :         y = _mm256_mullo_epi32(u[6], cospi32);
    4120           0 :         x = _mm256_mullo_epi32(u[7], cospi32);
    4121           0 :         v[6] = _mm256_add_epi32(y, x);
    4122           0 :         v[6] = _mm256_add_epi32(v[6], rnding);
    4123           0 :         v[6] = _mm256_srai_epi32(v[6], bit);
    4124             : 
    4125           0 :         v[7] = _mm256_sub_epi32(y, x);
    4126           0 :         v[7] = _mm256_add_epi32(v[7], rnding);
    4127           0 :         v[7] = _mm256_srai_epi32(v[7], bit);
    4128             : 
    4129           0 :         v[8] = u[8];
    4130           0 :         v[9] = u[9];
    4131             : 
    4132           0 :         y = _mm256_mullo_epi32(u[10], cospi32);
    4133           0 :         x = _mm256_mullo_epi32(u[11], cospi32);
    4134           0 :         v[10] = _mm256_add_epi32(y, x);
    4135           0 :         v[10] = _mm256_add_epi32(v[10], rnding);
    4136           0 :         v[10] = _mm256_srai_epi32(v[10], bit);
    4137             : 
    4138           0 :         v[11] = _mm256_sub_epi32(y, x);
    4139           0 :         v[11] = _mm256_add_epi32(v[11], rnding);
    4140           0 :         v[11] = _mm256_srai_epi32(v[11], bit);
    4141             : 
    4142           0 :         v[12] = u[12];
    4143           0 :         v[13] = u[13];
    4144             : 
    4145           0 :         y = _mm256_mullo_epi32(u[14], cospi32);
    4146           0 :         x = _mm256_mullo_epi32(u[15], cospi32);
    4147           0 :         v[14] = _mm256_add_epi32(y, x);
    4148           0 :         v[14] = _mm256_add_epi32(v[14], rnding);
    4149           0 :         v[14] = _mm256_srai_epi32(v[14], bit);
    4150             : 
    4151           0 :         v[15] = _mm256_sub_epi32(y, x);
    4152           0 :         v[15] = _mm256_add_epi32(v[15], rnding);
    4153           0 :         v[15] = _mm256_srai_epi32(v[15], bit);
    4154             : 
    4155             :         // stage 9
    4156           0 :         if (do_cols) {
    4157           0 :             out[0] = v[0];
    4158           0 :             out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
    4159           0 :             out[2] = v[12];
    4160           0 :             out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
    4161           0 :             out[4] = v[6];
    4162           0 :             out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
    4163           0 :             out[6] = v[10];
    4164           0 :             out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
    4165           0 :             out[8] = v[3];
    4166           0 :             out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
    4167           0 :             out[10] = v[15];
    4168           0 :             out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
    4169           0 :             out[12] = v[5];
    4170           0 :             out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
    4171           0 :             out[14] = v[9];
    4172           0 :             out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
    4173             :         }
    4174             :         else {
    4175           0 :             const int32_t log_range_out = AOMMAX(16, bd + 6);
    4176           0 :             const __m256i clamp_lo_out =
    4177           0 :                 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    4178           0 :             const __m256i clamp_hi_out =
    4179           0 :                 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
    4180             : 
    4181           0 :             neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
    4182             :                 out_shift);
    4183           0 :             neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
    4184             :                 &clamp_hi_out, out_shift);
    4185           0 :             neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
    4186             :                 &clamp_hi_out, out_shift);
    4187           0 :             neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
    4188             :                 &clamp_hi_out, out_shift);
    4189           0 :             neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
    4190             :                 &clamp_hi_out, out_shift);
    4191           0 :             neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
    4192             :                 &clamp_hi_out, out_shift);
    4193           0 :             neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
    4194             :                 &clamp_hi_out, out_shift);
    4195           0 :             neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
    4196             :                 &clamp_hi_out, out_shift);
    4197             :         }
    4198             :     }
    4199           0 : }
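                      : 
                      : /* iidentity16_avx2: 16-point identity transform. Each coefficient is scaled by
                      :  * 2 * NewSqrt2 (2 * sqrt(2) in Q(NewSqrt2Bits) fixed point, per the AV1 transform
                      :  * convention), rounded and shifted, then either round-shifted for the row pass or
                      :  * clamped for the column pass. */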
    4200           0 : static void iidentity16_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    4201             :     int32_t bd, int32_t out_shift) {
    4202             :     (void)bit;
    4203           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    4204           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    4205           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    4206             :     __m256i v[16];
    4207           0 :     __m256i fact = _mm256_set1_epi32(2 * NewSqrt2);
    4208           0 :     __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
    4209             :     __m256i a0, a1, a2, a3;
    4210             : 
    4211           0 :     for (int32_t i = 0; i < 16; i += 8) {
    4212           0 :         a0 = _mm256_mullo_epi32(in[i], fact);
    4213           0 :         a1 = _mm256_mullo_epi32(in[i + 1], fact);
    4214           0 :         a0 = _mm256_add_epi32(a0, offset);
    4215           0 :         a1 = _mm256_add_epi32(a1, offset);
    4216           0 :         v[i] = _mm256_srai_epi32(a0, NewSqrt2Bits);
    4217           0 :         v[i + 1] = _mm256_srai_epi32(a1, NewSqrt2Bits);
    4218             : 
    4219           0 :         a2 = _mm256_mullo_epi32(in[i + 2], fact);
    4220           0 :         a3 = _mm256_mullo_epi32(in[i + 3], fact);
    4221           0 :         a2 = _mm256_add_epi32(a2, offset);
    4222           0 :         a3 = _mm256_add_epi32(a3, offset);
    4223           0 :         v[i + 2] = _mm256_srai_epi32(a2, NewSqrt2Bits);
    4224           0 :         v[i + 3] = _mm256_srai_epi32(a3, NewSqrt2Bits);
    4225             : 
    4226           0 :         a0 = _mm256_mullo_epi32(in[i + 4], fact);
    4227           0 :         a1 = _mm256_mullo_epi32(in[i + 5], fact);
    4228           0 :         a0 = _mm256_add_epi32(a0, offset);
    4229           0 :         a1 = _mm256_add_epi32(a1, offset);
    4230           0 :         v[i + 4] = _mm256_srai_epi32(a0, NewSqrt2Bits);
    4231           0 :         v[i + 5] = _mm256_srai_epi32(a1, NewSqrt2Bits);
    4232             : 
    4233           0 :         a2 = _mm256_mullo_epi32(in[i + 6], fact);
    4234           0 :         a3 = _mm256_mullo_epi32(in[i + 7], fact);
    4235           0 :         a2 = _mm256_add_epi32(a2, offset);
    4236           0 :         a3 = _mm256_add_epi32(a3, offset);
    4237           0 :         v[i + 6] = _mm256_srai_epi32(a2, NewSqrt2Bits);
    4238           0 :         v[i + 7] = _mm256_srai_epi32(a3, NewSqrt2Bits);
    4239             :     }
    4240             : 
    4241           0 :     if (!do_cols) {
    4242           0 :         const int32_t log_range_out = AOMMAX(16, bd + 6);
    4243           0 :         const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    4244             :             -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    4245           0 :         const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    4246             :             (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    4247             : 
    4248           0 :         shift_avx2(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 16);
    4249             :     }
    4250             :     else
    4251           0 :         highbd_clamp_epi32_avx2(v, out, &clamp_lo, &clamp_hi, 16);
    4252           0 : }
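                      : 
                      : /* idct32_low1_avx2: DC-only 32-point inverse DCT. Only in[0] is nonzero, so every
                      :  * butterfly stage collapses to a single cospi32 scaling; the result is clamped
                      :  * (and round-shifted for the row pass) and replicated to all 32 outputs. */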
    4253           0 : static void idct32_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    4254             :     int32_t bd, int32_t out_shift) {
    4255           0 :     const int32_t *cospi = cospi_arr(bit);
    4256           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    4257           0 :     const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
    4258           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    4259           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    4260           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    4261             :     __m256i x;
    4262             :     // stage 0
    4263             :     // stage 1
    4264             :     // stage 2
    4265             :     // stage 3
    4266             :     // stage 4
    4267             :     // stage 5
    4268           0 :     x = _mm256_mullo_epi32(in[0], cospi32);
    4269           0 :     x = _mm256_add_epi32(x, rounding);
    4270           0 :     x = _mm256_srai_epi32(x, bit);
    4271             : 
    4272             :     // stage 6
    4273             :     // stage 7
    4274             :     // stage 8
    4275             :     // stage 9
    4276           0 :     if (do_cols) {
    4277           0 :         x = _mm256_max_epi32(x, clamp_lo);
    4278           0 :         x = _mm256_min_epi32(x, clamp_hi);
    4279             :     }
    4280             :     else {
    4281           0 :         const int32_t log_range_out = AOMMAX(16, bd + 6);
    4282           0 :         const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    4283             :             -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    4284           0 :         const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    4285             :             (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    4286           0 :         __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
    4287           0 :         x = _mm256_add_epi32(offset, x);
    4288           0 :         x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
    4289           0 :         x = _mm256_max_epi32(x, clamp_lo_out);
    4290           0 :         x = _mm256_min_epi32(x, clamp_hi_out);
    4291             :     }
    4292             : 
    4293           0 :     out[0] = x;
    4294           0 :     out[1] = x;
    4295           0 :     out[2] = x;
    4296           0 :     out[3] = x;
    4297           0 :     out[4] = x;
    4298           0 :     out[5] = x;
    4299           0 :     out[6] = x;
    4300           0 :     out[7] = x;
    4301           0 :     out[8] = x;
    4302           0 :     out[9] = x;
    4303           0 :     out[10] = x;
    4304           0 :     out[11] = x;
    4305           0 :     out[12] = x;
    4306           0 :     out[13] = x;
    4307           0 :     out[14] = x;
    4308           0 :     out[15] = x;
    4309           0 :     out[16] = x;
    4310           0 :     out[17] = x;
    4311           0 :     out[18] = x;
    4312           0 :     out[19] = x;
    4313           0 :     out[20] = x;
    4314           0 :     out[21] = x;
    4315           0 :     out[22] = x;
    4316           0 :     out[23] = x;
    4317           0 :     out[24] = x;
    4318           0 :     out[25] = x;
    4319           0 :     out[26] = x;
    4320           0 :     out[27] = x;
    4321           0 :     out[28] = x;
    4322           0 :     out[29] = x;
    4323           0 :     out[30] = x;
    4324           0 :     out[31] = x;
    4325           0 : }
    4326             : 
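                      : /* idct32_low8_avx2: 32-point inverse DCT assuming only the first 8 coefficients
                      :  * are nonzero. The early stages use half_btf_0_avx2 (single-input butterflies);
                      :  * stages 4-9 are shared with the other idct32 variants via the idct32_stageN_avx2
                      :  * helpers. */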
    4327           0 : static void idct32_low8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    4328             :     int32_t bd, int32_t out_shift) {
    4329           0 :     const int32_t *cospi = cospi_arr(bit);
    4330           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    4331           0 :     const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
    4332           0 :     const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
    4333           0 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    4334           0 :     const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
    4335           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    4336           0 :     const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
    4337           0 :     const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
    4338           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    4339           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    4340           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    4341           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    4342           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    4343           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    4344           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    4345           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    4346           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    4347           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    4348           0 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    4349           0 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    4350           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    4351           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    4352           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    4353           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    4354           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    4355           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    4356           0 :     const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
    4357           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    4358           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    4359           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    4360             :     __m256i bf1[32];
    4361             : 
    4362             :     {
    4363             :         // stage 0
    4364             :         // stage 1
    4365           0 :         bf1[0] = in[0];
    4366           0 :         bf1[4] = in[4];
    4367           0 :         bf1[8] = in[2];
    4368           0 :         bf1[12] = in[6];
    4369           0 :         bf1[16] = in[1];
    4370           0 :         bf1[20] = in[5];
    4371           0 :         bf1[24] = in[3];
    4372           0 :         bf1[28] = in[7];
    4373             : 
    4374             :         // stage 2
    4375           0 :         bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
    4376           0 :         bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
    4377           0 :         bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
    4378           0 :         bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
    4379           0 :         bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
    4380           0 :         bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
    4381           0 :         bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
    4382           0 :         bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
    4383             : 
    4384             :         // stage 3
    4385           0 :         bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
    4386           0 :         bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
    4387             : 
    4388           0 :         bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
    4389           0 :         bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
    4390           0 :         bf1[17] = bf1[16];
    4391           0 :         bf1[18] = bf1[19];
    4392           0 :         bf1[21] = bf1[20];
    4393           0 :         bf1[22] = bf1[23];
    4394           0 :         bf1[25] = bf1[24];
    4395           0 :         bf1[26] = bf1[27];
    4396           0 :         bf1[29] = bf1[28];
    4397           0 :         bf1[30] = bf1[31];
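                      :         // The copies above stand in for stage-3 add/sub pairs whose other
                      :         // operand is zero in this reduced path.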
    4398             : 
    4399             :         // stage 4
    4400           0 :         bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
    4401           0 :         bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
    4402             : 
    4403           0 :         bf1[9] = bf1[8];
    4404           0 :         bf1[10] = bf1[11];
    4405           0 :         bf1[13] = bf1[12];
    4406           0 :         bf1[14] = bf1[15];
    4407             : 
    4408           0 :         idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
    4409             :             &cospi24, &cospi40, &cospim24, &rounding, bit);
    4410             : 
    4411             :         // stage 5
    4412           0 :         bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
    4413           0 :         bf1[1] = bf1[0];
    4414           0 :         bf1[5] = bf1[4];
    4415           0 :         bf1[6] = bf1[7];
    4416             : 
    4417           0 :         idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
    4418             :             &clamp_hi, &rounding, bit);
    4419             : 
    4420             :         // stage 6
    4421           0 :         bf1[3] = bf1[0];
    4422           0 :         bf1[2] = bf1[1];
    4423             : 
    4424           0 :         idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
    4425             :             &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
    4426             : 
    4427             :         // stage 7
    4428           0 :         idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
    4429             :             &rounding, bit);
    4430             : 
    4431             :         // stage 8
    4432           0 :         idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
    4433             :             &rounding, bit);
    4434             : 
    4435             :         // stage 9
    4436           0 :         idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
    4437             :     }
    4438           0 : }
    4439             : 
    4440           0 : static void idct32_low16_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    4441             :     int32_t bd, int32_t out_shift) {
    4442           0 :     const int32_t *cospi = cospi_arr(bit);
    4443           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    4444           0 :     const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
    4445           0 :     const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
    4446           0 :     const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
    4447           0 :     const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
    4448           0 :     const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
    4449           0 :     const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
    4450           0 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    4451           0 :     const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
    4452           0 :     const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
    4453           0 :     const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
    4454           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    4455           0 :     const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
    4456           0 :     const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
    4457           0 :     const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
    4458           0 :     const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
    4459           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    4460           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    4461           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    4462           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    4463           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    4464           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    4465           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    4466           0 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
    4467           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    4468           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    4469           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    4470           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    4471           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    4472           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    4473           0 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    4474           0 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    4475           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    4476           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    4477           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    4478           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    4479           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    4480           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    4481           0 :     const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
    4482           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    4483           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    4484           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    4485             :     __m256i bf1[32];
    4486             : 
    4487             :     {
    4488             :         // stage 0
    4489             :         // stage 1
    4490           0 :         bf1[0] = in[0];
    4491           0 :         bf1[2] = in[8];
    4492           0 :         bf1[4] = in[4];
    4493           0 :         bf1[6] = in[12];
    4494           0 :         bf1[8] = in[2];
    4495           0 :         bf1[10] = in[10];
    4496           0 :         bf1[12] = in[6];
    4497           0 :         bf1[14] = in[14];
    4498           0 :         bf1[16] = in[1];
    4499           0 :         bf1[18] = in[9];
    4500           0 :         bf1[20] = in[5];
    4501           0 :         bf1[22] = in[13];
    4502           0 :         bf1[24] = in[3];
    4503           0 :         bf1[26] = in[11];
    4504           0 :         bf1[28] = in[7];
    4505           0 :         bf1[30] = in[15];
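                      :         // The 16 available coefficients are scattered to the even bf1 slots in
                      :         // bit-reversed order; the odd slots are implicitly zero, which is why
                      :         // stage 2 below can use the single-input butterflies.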
    4506             : 
    4507             :         // stage 2
    4508           0 :         bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
    4509           0 :         bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
    4510           0 :         bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
    4511           0 :         bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
    4512           0 :         bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
    4513           0 :         bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
    4514           0 :         bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
    4515           0 :         bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
    4516           0 :         bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
    4517           0 :         bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
    4518           0 :         bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
    4519           0 :         bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
    4520           0 :         bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
    4521           0 :         bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
    4522           0 :         bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
    4523           0 :         bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
    4524             : 
    4525             :         // stage 3
    4526           0 :         bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
    4527           0 :         bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
    4528           0 :         bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
    4529           0 :         bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
    4530           0 :         bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
    4531           0 :         bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
    4532           0 :         bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
    4533           0 :         bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
    4534             : 
    4535           0 :         addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
    4536           0 :         addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
    4537           0 :         addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
    4538           0 :         addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
    4539           0 :         addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
    4540           0 :         addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
    4541           0 :         addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
    4542           0 :         addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
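                      :         // addsub_avx2() forms the butterfly sum/difference of its two inputs
                      :         // and keeps both results inside [clamp_lo, clamp_hi].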
    4543             : 
    4544             :         // stage 4
    4545           0 :         bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
    4546           0 :         bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
    4547           0 :         bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
    4548           0 :         bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
    4549             : 
    4550           0 :         addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
    4551           0 :         addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
    4552           0 :         addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
    4553           0 :         addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
    4554             : 
    4555           0 :         idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
    4556             :             &cospi24, &cospi40, &cospim24, &rounding, bit);
    4557             : 
    4558             :         // stage 5
    4559           0 :         bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
    4560           0 :         bf1[1] = bf1[0];
    4561           0 :         bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
    4562           0 :         bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
    4563             : 
    4564           0 :         addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
    4565           0 :         addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
    4566             : 
    4567           0 :         idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
    4568             :             &clamp_hi, &rounding, bit);
    4569             : 
    4570             :         // stage 6
    4571           0 :         addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
    4572           0 :         addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
    4573             : 
    4574           0 :         idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
    4575             :             &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
    4576             : 
    4577             :         // stage 7
    4578           0 :         idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
    4579             :             &rounding, bit);
    4580             : 
    4581             :         // stage 8
    4582           0 :         idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
    4583             :             &rounding, bit);
    4584             : 
    4585             :         // stage 9
    4586           0 :         idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
    4587             :     }
    4588           0 : }
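                      : // Note: idct32_low8/low16 above are reduced kernels for blocks in which only
                      : // the first 8 or 16 input coefficients can be nonzero; idct32_avx2_new below
                      : // is the full 32-point kernel. The caller presumably selects among them from
                      : // the eob of the block.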
    4589             : 
    4590           0 : static void idct32_avx2_new(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols, int32_t bd,
    4591             :     int32_t out_shift) {
    4592           0 :     const int32_t *cospi = cospi_arr(bit);
    4593           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    4594           0 :     const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
    4595           0 :     const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
    4596           0 :     const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
    4597           0 :     const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
    4598           0 :     const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
    4599           0 :     const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
    4600           0 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    4601           0 :     const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
    4602           0 :     const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
    4603           0 :     const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
    4604           0 :     const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
    4605           0 :     const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
    4606           0 :     const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
    4607           0 :     const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
    4608           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    4609           0 :     const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
    4610           0 :     const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
    4611           0 :     const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
    4612           0 :     const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
    4613           0 :     const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
    4614           0 :     const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
    4615           0 :     const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
    4616           0 :     const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
    4617           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    4618           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    4619           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    4620           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    4621           0 :     const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
    4622           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    4623           0 :     const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
    4624           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    4625           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    4626           0 :     const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
    4627           0 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
    4628           0 :     const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
    4629           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    4630           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    4631           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    4632           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    4633           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    4634           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    4635           0 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    4636           0 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    4637           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    4638           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    4639           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    4640           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    4641           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    4642           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    4643           0 :     const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
    4644           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    4645           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    4646           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    4647             :     __m256i bf1[32], bf0[32];
    4648             : 
    4649             :     {
    4650             :         // stage 0
    4651             :         // stage 1
    4652           0 :         bf1[0] = in[0];
    4653           0 :         bf1[1] = in[16];
    4654           0 :         bf1[2] = in[8];
    4655           0 :         bf1[3] = in[24];
    4656           0 :         bf1[4] = in[4];
    4657           0 :         bf1[5] = in[20];
    4658           0 :         bf1[6] = in[12];
    4659           0 :         bf1[7] = in[28];
    4660           0 :         bf1[8] = in[2];
    4661           0 :         bf1[9] = in[18];
    4662           0 :         bf1[10] = in[10];
    4663           0 :         bf1[11] = in[26];
    4664           0 :         bf1[12] = in[6];
    4665           0 :         bf1[13] = in[22];
    4666           0 :         bf1[14] = in[14];
    4667           0 :         bf1[15] = in[30];
    4668           0 :         bf1[16] = in[1];
    4669           0 :         bf1[17] = in[17];
    4670           0 :         bf1[18] = in[9];
    4671           0 :         bf1[19] = in[25];
    4672           0 :         bf1[20] = in[5];
    4673           0 :         bf1[21] = in[21];
    4674           0 :         bf1[22] = in[13];
    4675           0 :         bf1[23] = in[29];
    4676           0 :         bf1[24] = in[3];
    4677           0 :         bf1[25] = in[19];
    4678           0 :         bf1[26] = in[11];
    4679           0 :         bf1[27] = in[27];
    4680           0 :         bf1[28] = in[7];
    4681           0 :         bf1[29] = in[23];
    4682           0 :         bf1[30] = in[15];
    4683           0 :         bf1[31] = in[31];
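                      :         // Full 32-point path: all coefficients are loaded in bit-reversed
                      :         // order, and the stages below ping-pong between bf1 and bf0.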
    4684             : 
    4685             :         // stage 2
    4686           0 :         bf0[0] = bf1[0];
    4687           0 :         bf0[1] = bf1[1];
    4688           0 :         bf0[2] = bf1[2];
    4689           0 :         bf0[3] = bf1[3];
    4690           0 :         bf0[4] = bf1[4];
    4691           0 :         bf0[5] = bf1[5];
    4692           0 :         bf0[6] = bf1[6];
    4693           0 :         bf0[7] = bf1[7];
    4694           0 :         bf0[8] = bf1[8];
    4695           0 :         bf0[9] = bf1[9];
    4696           0 :         bf0[10] = bf1[10];
    4697           0 :         bf0[11] = bf1[11];
    4698           0 :         bf0[12] = bf1[12];
    4699           0 :         bf0[13] = bf1[13];
    4700           0 :         bf0[14] = bf1[14];
    4701           0 :         bf0[15] = bf1[15];
    4702           0 :         bf0[16] =
    4703           0 :             half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
    4704           0 :         bf0[17] =
    4705           0 :             half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
    4706           0 :         bf0[18] =
    4707           0 :             half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
    4708           0 :         bf0[19] =
    4709           0 :             half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
    4710           0 :         bf0[20] =
    4711           0 :             half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
    4712           0 :         bf0[21] =
    4713           0 :             half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
    4714           0 :         bf0[22] =
    4715           0 :             half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
    4716           0 :         bf0[23] =
    4717           0 :             half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
    4718           0 :         bf0[24] =
    4719           0 :             half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
    4720           0 :         bf0[25] =
    4721           0 :             half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
    4722           0 :         bf0[26] =
    4723           0 :             half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
    4724           0 :         bf0[27] =
    4725           0 :             half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
    4726           0 :         bf0[28] =
    4727           0 :             half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
    4728           0 :         bf0[29] =
    4729           0 :             half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
    4730           0 :         bf0[30] =
    4731           0 :             half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
    4732           0 :         bf0[31] =
    4733           0 :             half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
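                      :         // Each (bf0[k], bf0[31 - k]) pair above is a planar rotation with
                      :         // rounding, roughly (x, y) -> (c*x - s*y, s*x + c*y); e.g. bf0[16] and
                      :         // bf0[31] use (c, s) = (cospi62, cospi2).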
    4734             : 
    4735             :         // stage 3
    4736           0 :         bf1[0] = bf0[0];
    4737           0 :         bf1[1] = bf0[1];
    4738           0 :         bf1[2] = bf0[2];
    4739           0 :         bf1[3] = bf0[3];
    4740           0 :         bf1[4] = bf0[4];
    4741           0 :         bf1[5] = bf0[5];
    4742           0 :         bf1[6] = bf0[6];
    4743           0 :         bf1[7] = bf0[7];
    4744           0 :         bf1[8] =
    4745           0 :             half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
    4746           0 :         bf1[9] =
    4747           0 :             half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
    4748           0 :         bf1[10] =
    4749           0 :             half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
    4750           0 :         bf1[11] =
    4751           0 :             half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
    4752           0 :         bf1[12] =
    4753           0 :             half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
    4754           0 :         bf1[13] =
    4755           0 :             half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
    4756           0 :         bf1[14] =
    4757           0 :             half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
    4758           0 :         bf1[15] =
    4759           0 :             half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
    4760             : 
    4761           0 :         addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
    4762           0 :         addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
    4763           0 :         addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
    4764           0 :         addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
    4765           0 :         addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
    4766           0 :         addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
    4767           0 :         addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
    4768           0 :         addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
    4769             : 
    4770             :         // stage 4
    4771           0 :         bf0[0] = bf1[0];
    4772           0 :         bf0[1] = bf1[1];
    4773           0 :         bf0[2] = bf1[2];
    4774           0 :         bf0[3] = bf1[3];
    4775           0 :         bf0[4] =
    4776           0 :             half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
    4777           0 :         bf0[5] =
    4778           0 :             half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
    4779           0 :         bf0[6] =
    4780           0 :             half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
    4781           0 :         bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
    4782             : 
    4783           0 :         addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
    4784           0 :         addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
    4785           0 :         addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
    4786           0 :         addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
    4787             : 
    4788           0 :         bf0[16] = bf1[16];
    4789           0 :         bf0[17] =
    4790           0 :             half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
    4791           0 :         bf0[18] =
    4792           0 :             half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
    4793           0 :         bf0[19] = bf1[19];
    4794           0 :         bf0[20] = bf1[20];
    4795           0 :         bf0[21] =
    4796           0 :             half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
    4797           0 :         bf0[22] =
    4798           0 :             half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
    4799           0 :         bf0[23] = bf1[23];
    4800           0 :         bf0[24] = bf1[24];
    4801           0 :         bf0[25] =
    4802           0 :             half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
    4803           0 :         bf0[26] =
    4804           0 :             half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
    4805           0 :         bf0[27] = bf1[27];
    4806           0 :         bf0[28] = bf1[28];
    4807           0 :         bf0[29] =
    4808           0 :             half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
    4809           0 :         bf0[30] =
    4810           0 :             half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
    4811           0 :         bf0[31] = bf1[31];
    4812             : 
    4813             :         // stage 5
    4814           0 :         bf1[0] =
    4815           0 :             half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
    4816           0 :         bf1[1] =
    4817           0 :             half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
    4818           0 :         bf1[2] =
    4819           0 :             half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
    4820           0 :         bf1[3] =
    4821           0 :             half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
    4822           0 :         addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
    4823           0 :         addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
    4824           0 :         bf1[8] = bf0[8];
    4825           0 :         bf1[9] =
    4826           0 :             half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
    4827           0 :         bf1[10] =
    4828           0 :             half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
    4829           0 :         bf1[11] = bf0[11];
    4830           0 :         bf1[12] = bf0[12];
    4831           0 :         bf1[13] =
    4832           0 :             half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
    4833           0 :         bf1[14] =
    4834           0 :             half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
    4835           0 :         bf1[15] = bf0[15];
    4836           0 :         addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
    4837           0 :         addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
    4838           0 :         addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
    4839           0 :         addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
    4840           0 :         addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
    4841           0 :         addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
    4842           0 :         addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
    4843           0 :         addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
    4844             : 
    4845             :         // stage 6
    4846           0 :         addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
    4847           0 :         addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
    4848           0 :         bf0[4] = bf1[4];
    4849           0 :         bf0[5] =
    4850           0 :             half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
    4851           0 :         bf0[6] =
    4852           0 :             half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
    4853           0 :         bf0[7] = bf1[7];
    4854           0 :         addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
    4855           0 :         addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
    4856           0 :         addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
    4857           0 :         addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
    4858           0 :         bf0[16] = bf1[16];
    4859           0 :         bf0[17] = bf1[17];
    4860           0 :         bf0[18] =
    4861           0 :             half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
    4862           0 :         bf0[19] =
    4863           0 :             half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
    4864           0 :         bf0[20] =
    4865           0 :             half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
    4866           0 :         bf0[21] =
    4867           0 :             half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
    4868           0 :         bf0[22] = bf1[22];
    4869           0 :         bf0[23] = bf1[23];
    4870           0 :         bf0[24] = bf1[24];
    4871           0 :         bf0[25] = bf1[25];
    4872           0 :         bf0[26] =
    4873           0 :             half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
    4874           0 :         bf0[27] =
    4875           0 :             half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
    4876           0 :         bf0[28] =
    4877           0 :             half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
    4878           0 :         bf0[29] =
    4879           0 :             half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
    4880           0 :         bf0[30] = bf1[30];
    4881           0 :         bf0[31] = bf1[31];
    4882             : 
    4883             :         // stage 7
    4884           0 :         addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
    4885           0 :         addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
    4886           0 :         addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
    4887           0 :         addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
    4888           0 :         bf1[8] = bf0[8];
    4889           0 :         bf1[9] = bf0[9];
    4890           0 :         bf1[10] =
    4891           0 :             half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
    4892           0 :         bf1[11] =
    4893           0 :             half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
    4894           0 :         bf1[12] =
    4895           0 :             half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
    4896           0 :         bf1[13] =
    4897           0 :             half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
    4898           0 :         bf1[14] = bf0[14];
    4899           0 :         bf1[15] = bf0[15];
    4900           0 :         addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
    4901           0 :         addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
    4902           0 :         addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
    4903           0 :         addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
    4904           0 :         addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
    4905           0 :         addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
    4906           0 :         addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
    4907           0 :         addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
    4908             : 
    4909             :         // stage 8
    4910           0 :         addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
    4911           0 :         addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
    4912           0 :         addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
    4913           0 :         addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
    4914           0 :         addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
    4915           0 :         addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
    4916           0 :         addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
    4917           0 :         addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
    4918           0 :         bf0[16] = bf1[16];
    4919           0 :         bf0[17] = bf1[17];
    4920           0 :         bf0[18] = bf1[18];
    4921           0 :         bf0[19] = bf1[19];
    4922           0 :         bf0[20] =
    4923           0 :             half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
    4924           0 :         bf0[21] =
    4925           0 :             half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
    4926           0 :         bf0[22] =
    4927           0 :             half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
    4928           0 :         bf0[23] =
    4929           0 :             half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
    4930           0 :         bf0[24] =
    4931           0 :             half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
    4932           0 :         bf0[25] =
    4933           0 :             half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
    4934           0 :         bf0[26] =
    4935           0 :             half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
    4936           0 :         bf0[27] =
    4937           0 :             half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
    4938           0 :         bf0[28] = bf1[28];
    4939           0 :         bf0[29] = bf1[29];
    4940           0 :         bf0[30] = bf1[30];
    4941           0 :         bf0[31] = bf1[31];
    4942             : 
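                      :         // Stage 9: final butterflies. When do_cols is set the adds and subs are
                      :         // done without clamping; otherwise the results are shifted right by
                      :         // out_shift and clamped to the narrower output range computed below.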
    4943             :         // stage 9
    4944           0 :         if (do_cols) {
    4945           0 :             addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
    4946           0 :             addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
    4947           0 :             addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
    4948           0 :             addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
    4949           0 :             addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
    4950           0 :             addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
    4951           0 :             addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
    4952           0 :             addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
    4953           0 :             addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
    4954           0 :             addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
    4955           0 :             addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
    4956           0 :             addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
    4957           0 :             addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
    4958           0 :             addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
    4959           0 :             addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
    4960           0 :             addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
    4961             :         }
    4962             :         else {
    4963           0 :             const int32_t log_range_out = AOMMAX(16, bd + 6);
    4964           0 :             const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    4965             :                 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    4966           0 :             const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    4967             :                 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    4968             : 
    4969           0 :             addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
    4970             :                 &clamp_hi_out, out_shift);
    4971           0 :             addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
    4972             :                 &clamp_hi_out, out_shift);
    4973           0 :             addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
    4974             :                 &clamp_hi_out, out_shift);
    4975           0 :             addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
    4976             :                 &clamp_hi_out, out_shift);
    4977           0 :             addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
    4978             :                 &clamp_hi_out, out_shift);
    4979           0 :             addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
    4980             :                 &clamp_hi_out, out_shift);
    4981           0 :             addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
    4982             :                 &clamp_hi_out, out_shift);
    4983           0 :             addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
    4984             :                 &clamp_hi_out, out_shift);
    4985           0 :             addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
    4986             :                 &clamp_hi_out, out_shift);
    4987           0 :             addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
    4988             :                 &clamp_hi_out, out_shift);
    4989           0 :             addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
    4990             :                 &clamp_hi_out, out_shift);
    4991           0 :             addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
    4992             :                 &clamp_hi_out, out_shift);
    4993           0 :             addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
    4994             :                 &clamp_hi_out, out_shift);
    4995           0 :             addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
    4996             :                 &clamp_hi_out, out_shift);
    4997           0 :             addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
    4998             :                 &clamp_hi_out, out_shift);
    4999           0 :             addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
    5000             :                 &clamp_hi_out, out_shift);
    5001             :         }
    5002             :     }
    5003           0 : }
    5004           0 : static void iidentity32_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    5005             :     int32_t bd, int32_t out_shift) {
    5006             :     (void)bit;
    5007           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    5008           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    5009           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    5010             :     __m256i v[64];
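                      :     // The 32-point identity transform scales every coefficient by 4 (a left
                      :     // shift by 2); the loop below is unrolled 16 registers per iteration.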
    5011           0 :     for (int32_t i = 0; i < 32; i += 16) {
    5012           0 :         v[i] = _mm256_slli_epi32(in[i], 2);
    5013           0 :         v[i + 1] = _mm256_slli_epi32(in[i + 1], 2);
    5014           0 :         v[i + 2] = _mm256_slli_epi32(in[i + 2], 2);
    5015           0 :         v[i + 3] = _mm256_slli_epi32(in[i + 3], 2);
    5016           0 :         v[i + 4] = _mm256_slli_epi32(in[i + 4], 2);
    5017           0 :         v[i + 5] = _mm256_slli_epi32(in[i + 5], 2);
    5018           0 :         v[i + 6] = _mm256_slli_epi32(in[i + 6], 2);
    5019           0 :         v[i + 7] = _mm256_slli_epi32(in[i + 7], 2);
    5020           0 :         v[i + 8] = _mm256_slli_epi32(in[i + 8], 2);
    5021           0 :         v[i + 9] = _mm256_slli_epi32(in[i + 9], 2);
    5022           0 :         v[i + 10] = _mm256_slli_epi32(in[i + 10], 2);
    5023           0 :         v[i + 11] = _mm256_slli_epi32(in[i + 11], 2);
    5024           0 :         v[i + 12] = _mm256_slli_epi32(in[i + 12], 2);
    5025           0 :         v[i + 13] = _mm256_slli_epi32(in[i + 13], 2);
    5026           0 :         v[i + 14] = _mm256_slli_epi32(in[i + 14], 2);
    5027           0 :         v[i + 15] = _mm256_slli_epi32(in[i + 15], 2);
    5028             :     }
    5029             : 
    5030           0 :     if (!do_cols) {
    5031           0 :         const int32_t log_range_out = AOMMAX(16, bd + 6);
    5032           0 :         const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    5033             :             -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    5034           0 :         const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    5035             :             (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    5036           0 :         shift_avx2(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 32);
    5037             :     }
    5038             :     else
    5039           0 :         highbd_clamp_epi32_avx2(v, out, &clamp_lo, &clamp_hi, 32);
    5040           0 : }
    5041           0 : static void idct64_low1_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    5042             :     int32_t bd, int32_t out_shift) {
    5043           0 :     const int32_t *cospi = cospi_arr(bit);
    5044           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    5045           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    5046           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    5047           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    5048             : 
    5049           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    5050             : 
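                      :     // DC-only 64-point path: with a single nonzero coefficient the only
                      :     // arithmetic is the cospi32 scaling below; the result is then clamped
                      :     // (and rounded/shifted for the row pass) and broadcast to all 64 outputs.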
    5051             :     {
    5052             :         __m256i x;
    5053             : 
    5054             :         // stage 1
    5055             :         // stage 2
    5056             :         // stage 3
    5057             :         // stage 4
    5058             :         // stage 5
    5059             :         // stage 6
    5060           0 :         x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
    5061             : 
    5062             :         // stage 8
    5063             :         // stage 9
    5064             :         // stage 10
    5065             :         // stage 11
    5066           0 :         if (do_cols) {
    5067           0 :             x = _mm256_max_epi32(x, clamp_lo);
    5068           0 :             x = _mm256_min_epi32(x, clamp_hi);
    5069             :         }
    5070             :         else {
    5071           0 :             const int32_t log_range_out = AOMMAX(16, bd + 6);
    5072           0 :             const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    5073             :                 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    5074           0 :             const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    5075             :                 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    5076             : 
    5077           0 :             __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
    5078           0 :             x = _mm256_add_epi32(x, offset);
    5079           0 :             x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
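                      :             // Round-to-nearest shift: x = (x + (1 << out_shift) / 2) >> out_shift.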
    5080             : 
    5081           0 :             x = _mm256_max_epi32(x, clamp_lo_out);
    5082           0 :             x = _mm256_min_epi32(x, clamp_hi_out);
    5083             :         }
    5084             : 
    5085           0 :         out[0] = x;
    5086           0 :         out[1] = x;
    5087           0 :         out[2] = x;
    5088           0 :         out[3] = x;
    5089           0 :         out[4] = x;
    5090           0 :         out[5] = x;
    5091           0 :         out[6] = x;
    5092           0 :         out[7] = x;
    5093           0 :         out[8] = x;
    5094           0 :         out[9] = x;
    5095           0 :         out[10] = x;
    5096           0 :         out[11] = x;
    5097           0 :         out[12] = x;
    5098           0 :         out[13] = x;
    5099           0 :         out[14] = x;
    5100           0 :         out[15] = x;
    5101           0 :         out[16] = x;
    5102           0 :         out[17] = x;
    5103           0 :         out[18] = x;
    5104           0 :         out[19] = x;
    5105           0 :         out[20] = x;
    5106           0 :         out[21] = x;
    5107           0 :         out[22] = x;
    5108           0 :         out[23] = x;
    5109           0 :         out[24] = x;
    5110           0 :         out[25] = x;
    5111           0 :         out[26] = x;
    5112           0 :         out[27] = x;
    5113           0 :         out[28] = x;
    5114           0 :         out[29] = x;
    5115           0 :         out[30] = x;
    5116           0 :         out[31] = x;
    5117           0 :         out[32] = x;
    5118           0 :         out[33] = x;
    5119           0 :         out[34] = x;
    5120           0 :         out[35] = x;
    5121           0 :         out[36] = x;
    5122           0 :         out[37] = x;
    5123           0 :         out[38] = x;
    5124           0 :         out[39] = x;
    5125           0 :         out[40] = x;
    5126           0 :         out[41] = x;
    5127           0 :         out[42] = x;
    5128           0 :         out[43] = x;
    5129           0 :         out[44] = x;
    5130           0 :         out[45] = x;
    5131           0 :         out[46] = x;
    5132           0 :         out[47] = x;
    5133           0 :         out[48] = x;
    5134           0 :         out[49] = x;
    5135           0 :         out[50] = x;
    5136           0 :         out[51] = x;
    5137           0 :         out[52] = x;
    5138           0 :         out[53] = x;
    5139           0 :         out[54] = x;
    5140           0 :         out[55] = x;
    5141           0 :         out[56] = x;
    5142           0 :         out[57] = x;
    5143           0 :         out[58] = x;
    5144           0 :         out[59] = x;
    5145           0 :         out[60] = x;
    5146           0 :         out[61] = x;
    5147           0 :         out[62] = x;
    5148           0 :         out[63] = x;
    5149             :     }
    5150           0 : }
    5151             : 
    5152           0 : static void idct64_low8_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    5153             :     int32_t bd, int32_t out_shift) {
    5154             :     int32_t i, j;
    5155           0 :     const int32_t *cospi = cospi_arr(bit);
    5156           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    5157           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    5158           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    5159           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    5160             : 
    5161           0 :     const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
    5162           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    5163           0 :     const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
    5164           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    5165           0 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    5166           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    5167           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    5168           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    5169           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    5170           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    5171           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    5172           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    5173           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    5174           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    5175           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    5176           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    5177           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    5178           0 :     const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
    5179           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    5180           0 :     const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
    5181           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    5182           0 :     const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
    5183           0 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    5184           0 :     const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
    5185           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    5186           0 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
    5187           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    5188           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    5189           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    5190           0 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    5191           0 :     const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
    5192           0 :     const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
    5193           0 :     const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
    5194           0 :     const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
    5195           0 :     const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
    5196           0 :     const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
    5197           0 :     const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
    5198           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    5199             : 
    5200             :     {
    5201             :         __m256i u[64];
    5202             : 
    5203             :         // stage 1
    5204           0 :         u[0] = in[0];
    5205           0 :         u[8] = in[4];
    5206           0 :         u[16] = in[2];
    5207           0 :         u[24] = in[6];
    5208           0 :         u[32] = in[1];
    5209           0 :         u[40] = in[5];
    5210           0 :         u[48] = in[3];
    5211           0 :         u[56] = in[7];
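                      :         // Reduced 64-point path: only in[0..7] can be nonzero; they are
                      :         // scattered to u[0], u[8], ..., u[56], and the zero partners let
                      :         // stages 2-4 use the single-multiply butterflies.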
    5212             : 
    5213             :         // stage 2
    5214           0 :         u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
    5215           0 :         u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
    5216           0 :         u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
    5217           0 :         u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
    5218           0 :         u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
    5219           0 :         u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
    5220           0 :         u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
    5221           0 :         u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
    5222             : 
    5223             :         // stage 3
    5224           0 :         u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
    5225           0 :         u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
    5226           0 :         u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
    5227           0 :         u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
    5228           0 :         u[33] = u[32];
    5229           0 :         u[38] = u[39];
    5230           0 :         u[41] = u[40];
    5231           0 :         u[46] = u[47];
    5232           0 :         u[49] = u[48];
    5233           0 :         u[54] = u[55];
    5234           0 :         u[57] = u[56];
    5235           0 :         u[62] = u[63];
    5236             : 
    5237             :         // stage 4
    5238             :         __m256i temp1, temp2;
    5239           0 :         u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
    5240           0 :         u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
    5241           0 :         u[17] = u[16];
    5242           0 :         u[22] = u[23];
    5243           0 :         u[25] = u[24];
    5244           0 :         u[30] = u[31];
    5245             : 
    5246           0 :         temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    5247           0 :         u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    5248           0 :         u[33] = temp1;
    5249             : 
    5250           0 :         temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    5251           0 :         u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    5252           0 :         u[57] = temp2;
    5253             : 
    5254           0 :         temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    5255           0 :         u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    5256           0 :         u[41] = temp1;
    5257             : 
    5258           0 :         temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    5259           0 :         u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    5260           0 :         u[46] = temp2;
    5261             : 
    5262             :         // stage 5
    5263           0 :         u[9] = u[8];
    5264           0 :         u[14] = u[15];
    5265             : 
    5266           0 :         temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    5267           0 :         u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    5268           0 :         u[17] = temp1;
    5269             : 
    5270           0 :         temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    5271           0 :         u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    5272           0 :         u[22] = temp2;
    5273             : 
    5274           0 :         u[35] = u[32];
    5275           0 :         u[34] = u[33];
    5276           0 :         u[36] = u[39];
    5277           0 :         u[37] = u[38];
    5278           0 :         u[43] = u[40];
    5279           0 :         u[42] = u[41];
    5280           0 :         u[44] = u[47];
    5281           0 :         u[45] = u[46];
    5282           0 :         u[51] = u[48];
    5283           0 :         u[50] = u[49];
    5284           0 :         u[52] = u[55];
    5285           0 :         u[53] = u[54];
    5286           0 :         u[59] = u[56];
    5287           0 :         u[58] = u[57];
    5288           0 :         u[60] = u[63];
    5289           0 :         u[61] = u[62];
    5290             : 
    5291             :         // stage 6
    5292           0 :         temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    5293           0 :         u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    5294           0 :         u[0] = temp1;
    5295             : 
    5296           0 :         temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    5297           0 :         u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    5298           0 :         u[9] = temp2;
    5299           0 :         u[19] = u[16];
    5300           0 :         u[18] = u[17];
    5301           0 :         u[20] = u[23];
    5302           0 :         u[21] = u[22];
    5303           0 :         u[27] = u[24];
    5304           0 :         u[26] = u[25];
    5305           0 :         u[28] = u[31];
    5306           0 :         u[29] = u[30];
    5307             : 
    5308           0 :         temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    5309           0 :         u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    5310           0 :         u[34] = temp1;
    5311           0 :         temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    5312           0 :         u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    5313           0 :         u[35] = temp2;
    5314           0 :         temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    5315           0 :         u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    5316           0 :         u[36] = temp1;
    5317           0 :         temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    5318           0 :         u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    5319           0 :         u[37] = temp2;
    5320           0 :         temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    5321           0 :         u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    5322           0 :         u[42] = temp1;
    5323           0 :         temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    5324           0 :         u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    5325           0 :         u[43] = temp2;
    5326           0 :         temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    5327           0 :         u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    5328           0 :         u[44] = temp1;
    5329           0 :         temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    5330           0 :         u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    5331           0 :         u[45] = temp2;
    5332             : 
    5333             :         // stage 7
    5334           0 :         u[3] = u[0];
    5335           0 :         u[2] = u[1];
    5336           0 :         u[11] = u[8];
    5337           0 :         u[10] = u[9];
    5338           0 :         u[12] = u[15];
    5339           0 :         u[13] = u[14];
    5340             : 
    5341           0 :         temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    5342           0 :         u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    5343           0 :         u[18] = temp1;
    5344           0 :         temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    5345           0 :         u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    5346           0 :         u[19] = temp2;
    5347           0 :         temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    5348           0 :         u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    5349           0 :         u[20] = temp1;
    5350           0 :         temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    5351           0 :         u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    5352           0 :         u[21] = temp2;
    5353           0 :         for (i = 32; i < 64; i += 16) {
    5354           0 :             for (j = i; j < i + 4; j++) {
    5355           0 :                 addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
    5356           0 :                 addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
    5357             :                     &clamp_hi);
    5358             :             }
    5359             :         }
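        /*
         * The XOR indexing pairs each lane with its mirror inside a group of
         * sixteen: for i = 32 the inner loop expands to
         *   addsub(u[32], u[39]), addsub(u[47], u[40]),
         *   addsub(u[33], u[38]), addsub(u[46], u[41]),
         *   addsub(u[34], u[37]), addsub(u[45], u[42]),
         *   addsub(u[35], u[36]), addsub(u[44], u[43]),
         * i.e. all eight sum/difference butterflies of u[32..47], and likewise
         * for u[48..63] on the second pass.
         */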
    5360             : 
    5361             :         // stage 8
    5362           0 :         u[7] = u[0];
    5363           0 :         u[6] = u[1];
    5364           0 :         u[5] = u[2];
    5365           0 :         u[4] = u[3];
    5366           0 :         u[9] = u[9]; /* redundant self-copy kept from the reference code; no effect */
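        /*
         * Only the DC-side chain is populated in this _low8 path: the
         * coefficients that would feed u[4..7] are zero here, so the usual
         * stage-8 butterflies addsub(u[i], u[7 - i]) collapse to plain copies
         * of u[0..3] into u[7..4].
         */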
    5367             : 
    5368           0 :         idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
    5369             :             &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
    5370             : 
    5371             :         // stage 9
    5372           0 :         idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
    5373             :             bit);
    5374             : 
    5375             :         // stage 10
    5376           0 :         idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
    5377             :             bit);
    5378             : 
    5379             :         // stage 11
    5380           0 :         idct64_stage11_avx2(u, out, do_cols, bd, out_shift, log_range);
    5381             :     }
    5382           0 : }
    5383             : 
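/*
 * Editor's note: the scalar models below are an illustrative sketch added for
 * exposition; they are not part of the original source and nothing here calls
 * them.  Per 32-bit lane, they describe what half_btf_avx2() (defined near the
 * top of this file), its assumed one-input variant half_btf_0_avx2(), and the
 * clamped butterfly addsub_avx2() compute in these idct64 routines.  The
 * vector code works entirely in 32-bit lanes; the 64-bit intermediates below
 * only keep the models themselves free of overflow.
 */
static INLINE int32_t half_btf_scalar_model(int32_t w0, int32_t n0, int32_t w1,
    int32_t n1, int32_t bit) {
    /* round((w0 * n0 + w1 * n1) / 2^bit) */
    const int64_t sum = (int64_t)w0 * n0 + (int64_t)w1 * n1 + (1 << (bit - 1));
    return (int32_t)(sum >> bit);
}

static INLINE int32_t half_btf_0_scalar_model(int32_t w0, int32_t n0,
    int32_t bit) {
    /* one-input form: round(w0 * n0 / 2^bit) */
    return (int32_t)(((int64_t)w0 * n0 + (1 << (bit - 1))) >> bit);
}

static INLINE void addsub_scalar_model(int32_t in0, int32_t in1, int32_t *out0,
    int32_t *out1, int32_t clamp_lo, int32_t clamp_hi) {
    /* clamped sum/difference butterfly: out0 = clip(in0 + in1), out1 = clip(in0 - in1) */
    int32_t a0 = in0 + in1;
    int32_t a1 = in0 - in1;
    a0 = a0 < clamp_lo ? clamp_lo : (a0 > clamp_hi ? clamp_hi : a0);
    a1 = a1 < clamp_lo ? clamp_lo : (a1 > clamp_hi ? clamp_hi : a1);
    *out0 = a0;
    *out1 = a1;
}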
    5384           0 : static void idct64_low16_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols,
    5385             :     int32_t bd, int32_t out_shift) {
    5386             :     int32_t i, j;
    5387           0 :     const int32_t *cospi = cospi_arr(bit);
    5388           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    5389           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    5390           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    5391           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    5392             : 
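        /*
         * log_range caps the intermediate dynamic range: bd + 6 bits for the
         * column pass (do_cols != 0) and bd + 8 bits for the row pass, never
         * less than 16.  clamp_lo/clamp_hi are the corresponding signed
         * saturation bounds applied after every add/sub butterfly.
         */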
    5393           0 :     const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
    5394           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    5395           0 :     const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
    5396           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    5397           0 :     const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
    5398           0 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    5399           0 :     const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
    5400           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    5401           0 :     const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
    5402           0 :     const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
    5403           0 :     const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
    5404           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    5405           0 :     const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
    5406           0 :     const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
    5407           0 :     const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
    5408           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    5409           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    5410           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    5411           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    5412           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    5413           0 :     const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
    5414           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    5415           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    5416           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    5417           0 :     const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
    5418           0 :     const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
    5419           0 :     const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
    5420           0 :     const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
    5421           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    5422           0 :     const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
    5423           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    5424           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    5425           0 :     const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
    5426             : 
    5427           0 :     const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
    5428           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    5429           0 :     const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
    5430           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    5431           0 :     const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
    5432           0 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    5433           0 :     const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
    5434           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    5435           0 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
    5436           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    5437           0 :     const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
    5438           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    5439           0 :     const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
    5440           0 :     const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
    5441           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    5442           0 :     const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
    5443           0 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    5444           0 :     const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
    5445           0 :     const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
    5446           0 :     const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
    5447           0 :     const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
    5448             : 
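        /*
         * The negated constants (cospimNN) exist because half_btf_avx2() only
         * forms the sum w0*n0 + w1*n1 (see its definition near the top of the
         * file); a rotation term that needs a subtraction is expressed by
         * passing a pre-negated cosine weight.
         */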
    5449             :     {
    5450             :         __m256i u[64];
    5451             :         __m256i tmp1, tmp2, tmp3, tmp4;
    5452             :         // stage 1
    5453           0 :         u[0] = in[0];
    5454           0 :         u[32] = in[1];
    5455           0 :         u[36] = in[9];
    5456           0 :         u[40] = in[5];
    5457           0 :         u[44] = in[13];
    5458           0 :         u[48] = in[3];
    5459           0 :         u[52] = in[11];
    5460           0 :         u[56] = in[7];
    5461           0 :         u[60] = in[15];
    5462           0 :         u[16] = in[2];
    5463           0 :         u[20] = in[10];
    5464           0 :         u[24] = in[6];
    5465           0 :         u[28] = in[14];
    5466           0 :         u[4] = in[8];
    5467           0 :         u[8] = in[4];
    5468           0 :         u[12] = in[12];
    5469             : 
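        /*
         * In this _low16 variant only in[0..15] can be non-zero.  Stage 1
         * scatters those sixteen coefficients to their natural slots in u[64];
         * every butterfly term of the full transform that would involve one of
         * the missing (zero) coefficients has been folded away in the stages
         * below, which is what distinguishes this path from idct64_avx2.
         */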
    5470             :         // stage 2
    5471           0 :         u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
    5472           0 :         u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
    5473           0 :         u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
    5474           0 :         u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
    5475           0 :         u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
    5476           0 :         u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
    5477           0 :         u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
    5478           0 :         u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
    5479           0 :         u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
    5480           0 :         u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
    5481           0 :         u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
    5482           0 :         u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
    5483           0 :         u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
    5484           0 :         u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
    5485           0 :         u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
    5486           0 :         u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
    5487             : 
    5488             :         // stage 3
    5489           0 :         u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
    5490           0 :         u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
    5491           0 :         u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
    5492           0 :         u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
    5493           0 :         u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
    5494           0 :         u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
    5495           0 :         u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
    5496           0 :         u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
    5497           0 :         u[33] = u[32];
    5498           0 :         u[34] = u[35];
    5499           0 :         u[37] = u[36];
    5500           0 :         u[38] = u[39];
    5501           0 :         u[41] = u[40];
    5502           0 :         u[42] = u[43];
    5503           0 :         u[45] = u[44];
    5504           0 :         u[46] = u[47];
    5505           0 :         u[49] = u[48];
    5506           0 :         u[50] = u[51];
    5507           0 :         u[53] = u[52];
    5508           0 :         u[54] = u[55];
    5509           0 :         u[57] = u[56];
    5510           0 :         u[58] = u[59];
    5511           0 :         u[61] = u[60];
    5512           0 :         u[62] = u[63];
    5513             : 
    5514             :         // stage 4
    5515           0 :         u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
    5516           0 :         u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
    5517           0 :         u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
    5518           0 :         u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
    5519             : 
    5520           0 :         u[17] = u[16];
    5521           0 :         u[18] = u[19];
    5522           0 :         u[21] = u[20];
    5523           0 :         u[22] = u[23];
    5524           0 :         u[25] = u[24];
    5525           0 :         u[26] = u[27];
    5526           0 :         u[29] = u[28];
    5527           0 :         u[30] = u[31];
    5528             : 
    5529           0 :         tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    5530           0 :         tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    5531           0 :         tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    5532           0 :         tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    5533           0 :         u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    5534           0 :         u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    5535           0 :         u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    5536           0 :         u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    5537           0 :         u[33] = tmp1;
    5538           0 :         u[34] = tmp2;
    5539           0 :         u[37] = tmp3;
    5540           0 :         u[38] = tmp4;
    5541             : 
    5542           0 :         tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    5543           0 :         tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    5544           0 :         tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    5545           0 :         tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    5546           0 :         u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    5547           0 :         u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    5548           0 :         u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    5549           0 :         u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    5550           0 :         u[41] = tmp1;
    5551           0 :         u[42] = tmp2;
    5552           0 :         u[45] = tmp3;
    5553           0 :         u[46] = tmp4;
    5554             : 
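        /*
         * tmp1..tmp4 stage the rotated values: each rotation reads a pair such
         * as (u[33], u[62]) and writes back to the same two slots, so one side
         * is computed into a temporary to avoid clobbering an operand that the
         * partner expression still needs.
         */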
    5555             :         // stage 5
    5556           0 :         u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
    5557           0 :         u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
    5558             : 
    5559           0 :         u[9] = u[8];
    5560           0 :         u[10] = u[11];
    5561           0 :         u[13] = u[12];
    5562           0 :         u[14] = u[15];
    5563             : 
    5564           0 :         tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    5565           0 :         tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
    5566           0 :         tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
    5567           0 :         tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    5568           0 :         u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    5569           0 :         u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
    5570           0 :         u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
    5571           0 :         u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    5572           0 :         u[17] = tmp1;
    5573           0 :         u[18] = tmp2;
    5574           0 :         u[21] = tmp3;
    5575           0 :         u[22] = tmp4;
    5576             : 
    5577           0 :         for (i = 32; i < 64; i += 8) {
    5578           0 :             addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
    5579             :                 &clamp_hi);
    5580           0 :             addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
    5581             :                 &clamp_hi);
    5582             : 
    5583           0 :             addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
    5584             :                 &clamp_hi);
    5585           0 :             addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
    5586             :                 &clamp_hi);
    5587             :         }
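        /*
         * addsub_avx2(a, b, &a, &b) leaves the clamped sum in the first slot
         * and the clamped difference in the second (see the scalar model after
         * idct64_low8_avx2 above).  The loop above folds each group of eight
         * lanes with the mirrored pairs (i+0, i+3), (i+1, i+2), (i+7, i+4) and
         * (i+6, i+5).
         */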
    5588             : 
    5589             :         // stage 6
    5590           0 :         tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    5591           0 :         u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    5592           0 :         u[0] = tmp1;
    5593           0 :         u[5] = u[4];
    5594           0 :         u[6] = u[7];
    5595             : 
    5596           0 :         tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    5597           0 :         u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    5598           0 :         u[9] = tmp1;
    5599           0 :         tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    5600           0 :         u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    5601           0 :         u[10] = tmp2;
    5602             : 
    5603           0 :         for (i = 16; i < 32; i += 8) {
    5604           0 :             addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
    5605             :                 &clamp_hi);
    5606           0 :             addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
    5607             :                 &clamp_hi);
    5608             : 
    5609           0 :             addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
    5610             :                 &clamp_hi);
    5611           0 :             addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
    5612             :                 &clamp_hi);
    5613             :         }
    5614             : 
    5615           0 :         tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    5616           0 :         tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    5617           0 :         tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    5618           0 :         tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    5619           0 :         u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    5620           0 :         u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    5621           0 :         u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    5622           0 :         u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    5623           0 :         u[34] = tmp1;
    5624           0 :         u[35] = tmp2;
    5625           0 :         u[36] = tmp3;
    5626           0 :         u[37] = tmp4;
    5627             : 
    5628           0 :         tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    5629           0 :         tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    5630           0 :         tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    5631           0 :         tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    5632           0 :         u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    5633           0 :         u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    5634           0 :         u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    5635           0 :         u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    5636           0 :         u[42] = tmp1;
    5637           0 :         u[43] = tmp2;
    5638           0 :         u[44] = tmp3;
    5639           0 :         u[45] = tmp4;
    5640             : 
    5641             :         // stage 7
    5642           0 :         u[3] = u[0];
    5643           0 :         u[2] = u[1];
    5644           0 :         tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
    5645           0 :         u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
    5646           0 :         u[5] = tmp1;
    5647           0 :         addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    5648           0 :         addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    5649           0 :         addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    5650           0 :         addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
    5651             : 
    5652           0 :         tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    5653           0 :         tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    5654           0 :         tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    5655           0 :         tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    5656           0 :         u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    5657           0 :         u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    5658           0 :         u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    5659           0 :         u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    5660           0 :         u[18] = tmp1;
    5661           0 :         u[19] = tmp2;
    5662           0 :         u[20] = tmp3;
    5663           0 :         u[21] = tmp4;
    5664             : 
    5665           0 :         for (i = 32; i < 64; i += 16) {
    5666           0 :             for (j = i; j < i + 4; j++) {
    5667           0 :                 addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
    5668           0 :                 addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
    5669             :                     &clamp_hi);
    5670             :             }
    5671             :         }
    5672             : 
    5673             :         // stage 8
    5674           0 :         for (i = 0; i < 4; ++i)
    5675           0 :             addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
    5676           0 :         idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
    5677             :             &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
    5678             : 
    5679             :         // stage 9
    5680           0 :         idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
    5681             :             bit);
    5682             : 
    5683             :         // stage 10
    5684           0 :         idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
    5685             :             bit);
    5686             : 
    5687             :         // stage 11
    5688           0 :         idct64_stage11_avx2(u, out, do_cols, bd, out_shift, log_range);
    5689             :     }
    5690           0 : }
    5691             : 
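/*
 * Editor's note: idct64_low8_avx2 and idct64_low16_avx2 above are reduced
 * forms of the full idct64_avx2 that follows.  They assume that only the
 * first 8 (respectively 16) entries of in[] can be non-zero, so every term of
 * the full transform that would touch a zero coefficient has been dropped.
 * The caller presumably selects the cheapest variant from the position of the
 * last non-zero coefficient (eob); that dispatch logic is not shown here.
 */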
    5692           0 : static void idct64_avx2(__m256i *in, __m256i *out, int32_t bit, int32_t do_cols, int32_t bd,
    5693             :     int32_t out_shift) {
    5694             :     int32_t i, j;
    5695           0 :     const int32_t *cospi = cospi_arr(bit);
    5696           0 :     const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
    5697           0 :     const int32_t log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
    5698           0 :     const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
    5699           0 :     const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
    5700             : 
    5701           0 :     const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
    5702           0 :     const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
    5703           0 :     const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
    5704           0 :     const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
    5705           0 :     const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
    5706           0 :     const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
    5707           0 :     const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
    5708           0 :     const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
    5709           0 :     const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
    5710           0 :     const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
    5711           0 :     const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
    5712           0 :     const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
    5713           0 :     const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
    5714           0 :     const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
    5715           0 :     const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
    5716           0 :     const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
    5717           0 :     const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
    5718           0 :     const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
    5719           0 :     const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
    5720           0 :     const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
    5721           0 :     const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
    5722           0 :     const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
    5723           0 :     const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
    5724           0 :     const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
    5725           0 :     const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
    5726           0 :     const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
    5727           0 :     const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
    5728           0 :     const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
    5729           0 :     const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
    5730           0 :     const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
    5731           0 :     const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
    5732           0 :     const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
    5733           0 :     const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
    5734           0 :     const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
    5735           0 :     const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
    5736           0 :     const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
    5737           0 :     const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
    5738           0 :     const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
    5739           0 :     const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
    5740           0 :     const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
    5741           0 :     const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
    5742           0 :     const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
    5743           0 :     const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
    5744           0 :     const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
    5745           0 :     const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
    5746           0 :     const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
    5747           0 :     const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
    5748           0 :     const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
    5749           0 :     const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
    5750           0 :     const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
    5751           0 :     const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
    5752             : 
    5753           0 :     const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
    5754           0 :     const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
    5755           0 :     const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
    5756           0 :     const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
    5757           0 :     const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
    5758           0 :     const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
    5759           0 :     const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
    5760           0 :     const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
    5761           0 :     const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
    5762           0 :     const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
    5763           0 :     const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
    5764           0 :     const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
    5765           0 :     const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
    5766           0 :     const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
    5767           0 :     const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
    5768           0 :     const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
    5769           0 :     const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
    5770           0 :     const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
    5771           0 :     const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
    5772           0 :     const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
    5773           0 :     const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
    5774           0 :     const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
    5775           0 :     const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
    5776           0 :     const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
    5777           0 :     const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
    5778           0 :     const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
    5779           0 :     const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
    5780             : 
    5781             :     {
    5782             :         __m256i u[64], v[64];
    5783             : 
    5784             :         // stage 1
    5785           0 :         u[32] = in[1];
    5786           0 :         u[34] = in[17];
    5787           0 :         u[36] = in[9];
    5788           0 :         u[38] = in[25];
    5789           0 :         u[40] = in[5];
    5790           0 :         u[42] = in[21];
    5791           0 :         u[44] = in[13];
    5792           0 :         u[46] = in[29];
    5793           0 :         u[48] = in[3];
    5794           0 :         u[50] = in[19];
    5795           0 :         u[52] = in[11];
    5796           0 :         u[54] = in[27];
    5797           0 :         u[56] = in[7];
    5798           0 :         u[58] = in[23];
    5799           0 :         u[60] = in[15];
    5800           0 :         u[62] = in[31];
    5801             : 
    5802           0 :         v[16] = in[2];
    5803           0 :         v[18] = in[18];
    5804           0 :         v[20] = in[10];
    5805           0 :         v[22] = in[26];
    5806           0 :         v[24] = in[6];
    5807           0 :         v[26] = in[22];
    5808           0 :         v[28] = in[14];
    5809           0 :         v[30] = in[30];
    5810             : 
    5811           0 :         u[8] = in[4];
    5812           0 :         u[10] = in[20];
    5813           0 :         u[12] = in[12];
    5814           0 :         u[14] = in[28];
    5815             : 
    5816           0 :         v[4] = in[8];
    5817           0 :         v[6] = in[24];
    5818             : 
    5819           0 :         u[0] = in[0];
    5820           0 :         u[2] = in[16];
    5821             : 
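        /*
         * Editor's note: unlike the reduced paths above, the full transform
         * ping-pongs between the two 64-entry buffers u[] and v[] from stage
         * to stage rather than updating a single array in place with
         * temporaries.  The stage-1 loads follow the radix-2 split of the
         * 64-point transform: odd input indices (in[1], in[17], in[9], ...)
         * seed u[32], u[34], ..., u[62]; indices equal to 2 mod 4 seed
         * v[16..30]; 4 mod 8 seed u[8..14]; 8 mod 16 seed v[4]/v[6]; and
         * in[0]/in[16] seed the 2-point core u[0]/u[2].
         */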
    5822             :         // stage 2
    5823           0 :         v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
    5824           0 :         v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
    5825           0 :         v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
    5826           0 :         v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
    5827           0 :         v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
    5828           0 :         v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
    5829           0 :         v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
    5830           0 :         v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
    5831           0 :         v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
    5832           0 :         v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
    5833           0 :         v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
    5834           0 :         v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
    5835           0 :         v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
    5836           0 :         v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
    5837           0 :         v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
    5838           0 :         v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
    5839           0 :         v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
    5840           0 :         v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
    5841           0 :         v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
    5842           0 :         v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
    5843           0 :         v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
    5844           0 :         v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
    5845           0 :         v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
    5846           0 :         v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
    5847           0 :         v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
    5848           0 :         v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
    5849           0 :         v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
    5850           0 :         v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
    5851           0 :         v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
    5852           0 :         v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
    5853           0 :         v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
    5854           0 :         v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
    5855             : 
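        /*
         * Each coefficient loaded in stage 1 fans out here to a mirrored pair
         * of positions: in[1] (held in u[32]) yields v[32] = cospi63 * x and
         * v[63] = cospi1 * x, in[31] (held in u[62]) yields v[33] and v[62],
         * and so on.  With cospi[n] ~ cos(n * pi / 128) * 2^bit, the weights
         * cospi[m] and cospi[64 - m] of such a pair are the cosine and sine of
         * the same twiddle angle.
         */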
    5856             :         // stage 3
    5857           0 :         u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
    5858           0 :         u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
    5859           0 :         u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
    5860           0 :         u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
    5861           0 :         u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
    5862           0 :         u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
    5863           0 :         u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
    5864           0 :         u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
    5865           0 :         u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
    5866           0 :         u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
    5867           0 :         u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
    5868           0 :         u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
    5869           0 :         u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
    5870           0 :         u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
    5871           0 :         u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
    5872           0 :         u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
    5873             : 
    5874           0 :         for (i = 32; i < 64; i += 4) {
    5875           0 :             addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
    5876             :                 &clamp_hi);
    5877           0 :             addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
    5878             :                 &clamp_hi);
    5879             :         }
    5880             : 
    5881             :         // stage 4
    5882           0 :         v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
    5883           0 :         v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
    5884           0 :         v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
    5885           0 :         v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
    5886           0 :         v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
    5887           0 :         v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
    5888           0 :         v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
    5889           0 :         v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
    5890             : 
    5891           0 :         for (i = 16; i < 32; i += 4) {
    5892           0 :             addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
    5893             :                 &clamp_hi);
    5894           0 :             addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
    5895             :                 &clamp_hi);
    5896             :         }
    5897             : 
    5898           0 :         for (i = 32; i < 64; i += 4) {
    5899           0 :             v[i + 0] = u[i + 0];
    5900           0 :             v[i + 3] = u[i + 3];
    5901             :         }
    5902             : 
    5903           0 :         v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    5904           0 :         v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    5905           0 :         v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    5906           0 :         v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    5907           0 :         v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    5908           0 :         v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    5909           0 :         v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    5910           0 :         v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    5911           0 :         v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    5912           0 :         v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    5913           0 :         v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    5914           0 :         v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    5915           0 :         v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    5916           0 :         v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    5917           0 :         v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    5918           0 :         v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    5919             : 
    5920             :         // stage 5
    5921           0 :         u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
    5922           0 :         u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
    5923           0 :         u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
    5924           0 :         u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
    5925             : 
    5926           0 :         for (i = 8; i < 16; i += 4) {
    5927           0 :             addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
    5928             :                 &clamp_hi);
    5929           0 :             addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
    5930             :                 &clamp_hi);
    5931             :         }
    5932             : 
    5933           0 :         for (i = 16; i < 32; i += 4) {
    5934           0 :             u[i + 0] = v[i + 0];
    5935           0 :             u[i + 3] = v[i + 3];
    5936             :         }
    5937             : 
    5938           0 :         u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
    5939           0 :         u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
    5940           0 :         u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
    5941           0 :         u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
    5942           0 :         u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
    5943           0 :         u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
    5944           0 :         u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
    5945           0 :         u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
    5946             : 
    5947           0 :         for (i = 32; i < 64; i += 8) {
    5948           0 :             addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
    5949             :                 &clamp_hi);
    5950           0 :             addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
    5951             :                 &clamp_hi);
    5952             : 
    5953           0 :             addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
    5954             :                 &clamp_hi);
    5955           0 :             addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
    5956             :                 &clamp_hi);
    5957             :         }
    5958             : 
    5959             :         // stage 6
    5960           0 :         v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    5961           0 :         v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    5962           0 :         v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
    5963           0 :         v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
    5964             : 
    5965           0 :         addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    5966           0 :         addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
    5967             : 
    5968           0 :         for (i = 8; i < 16; i += 4) {
    5969           0 :             v[i + 0] = u[i + 0];
    5970           0 :             v[i + 3] = u[i + 3];
    5971             :         }
    5972             : 
    5973           0 :         v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    5974           0 :         v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    5975           0 :         v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    5976           0 :         v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    5977             : 
    5978           0 :         for (i = 16; i < 32; i += 8) {
    5979           0 :             addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
    5980             :                 &clamp_hi);
    5981           0 :             addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
    5982             :                 &clamp_hi);
    5983             : 
    5984           0 :             addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
    5985             :                 &clamp_hi);
    5986           0 :             addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
    5987             :                 &clamp_hi);
    5988             :         }
    5989             : 
    5990           0 :         for (i = 32; i < 64; i += 8) {
    5991           0 :             v[i + 0] = u[i + 0];
    5992           0 :             v[i + 1] = u[i + 1];
    5993           0 :             v[i + 6] = u[i + 6];
    5994           0 :             v[i + 7] = u[i + 7];
    5995             :         }
    5996             : 
    5997           0 :         v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    5998           0 :         v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    5999           0 :         v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    6000           0 :         v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    6001           0 :         v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    6002           0 :         v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    6003           0 :         v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    6004           0 :         v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    6005           0 :         v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    6006           0 :         v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    6007           0 :         v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    6008           0 :         v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    6009           0 :         v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    6010           0 :         v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    6011           0 :         v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    6012           0 :         v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    6013             : 
    6014             :         // stage 7
    6015           0 :         addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    6016           0 :         addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
    6017             : 
    6018           0 :         u[4] = v[4];
    6019           0 :         u[7] = v[7];
    6020           0 :         u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
    6021           0 :         u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
    6022             : 
    6023           0 :         addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    6024           0 :         addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    6025           0 :         addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    6026           0 :         addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
    6027             : 
    6028           0 :         for (i = 16; i < 32; i += 8) {
    6029           0 :             u[i + 0] = v[i + 0];
    6030           0 :             u[i + 1] = v[i + 1];
    6031           0 :             u[i + 6] = v[i + 6];
    6032           0 :             u[i + 7] = v[i + 7];
    6033             :         }
    6034             : 
    6035           0 :         u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
    6036           0 :         u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
    6037           0 :         u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
    6038           0 :         u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
    6039           0 :         u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
    6040           0 :         u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
    6041           0 :         u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
    6042           0 :         u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
    6043             : 
    6044           0 :         for (i = 32; i < 64; i += 16) {
    6045           0 :             for (j = i; j < i + 4; j++) {
    6046           0 :                 addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
    6047           0 :                 addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
    6048             :                     &clamp_hi);
    6049             :             }
    6050             :         }
    6051             : 
    6052             :         // stage 8
    6053           0 :         for (i = 0; i < 4; ++i)
    6054           0 :             addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
    6055           0 :         v[8] = u[8];
    6056           0 :         v[9] = u[9];
    6057           0 :         v[14] = u[14];
    6058           0 :         v[15] = u[15];
    6059             : 
    6060           0 :         v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
    6061           0 :         v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
    6062           0 :         v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
    6063           0 :         v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
    6064             : 
    6065           0 :         for (i = 16; i < 20; ++i) {
    6066           0 :             addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
    6067           0 :             addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
    6068             :                 &clamp_hi);
    6069             :         }
    6070             : 
    6071           0 :         for (i = 32; i < 36; ++i) {
    6072           0 :             v[i] = u[i];
    6073           0 :             v[i + 12] = u[i + 12];
    6074           0 :             v[i + 16] = u[i + 16];
    6075           0 :             v[i + 28] = u[i + 28];
    6076             :         }
    6077             : 
    6078           0 :         v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
    6079           0 :         v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
    6080           0 :         v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
    6081           0 :         v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
    6082           0 :         v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
    6083           0 :         v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
    6084           0 :         v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
    6085           0 :         v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
    6086           0 :         v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
    6087           0 :         v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
    6088           0 :         v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
    6089           0 :         v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
    6090           0 :         v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
    6091           0 :         v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
    6092           0 :         v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
    6093           0 :         v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
    6094             : 
    6095             :         // stage 9
    6096           0 :         for (i = 0; i < 8; ++i)
    6097           0 :             addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
    6098           0 :         for (i = 16; i < 20; ++i) {
    6099           0 :             u[i] = v[i];
    6100           0 :             u[i + 12] = v[i + 12];
    6101             :         }
    6102             : 
    6103           0 :         u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
    6104           0 :         u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
    6105           0 :         u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
    6106           0 :         u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
    6107           0 :         u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
    6108           0 :         u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
    6109           0 :         u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
    6110           0 :         u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
    6111             : 
    6112           0 :         for (i = 32; i < 40; i++)
    6113           0 :             addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
    6114           0 :         for (i = 48; i < 56; i++)
    6115           0 :             addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
    6116             :         // stage 10
    6117           0 :         for (i = 0; i < 16; i++)
    6118           0 :             addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
    6119           0 :         for (i = 32; i < 40; i++) v[i] = u[i];
    6120             : 
    6121           0 :         v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
    6122           0 :         v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
    6123           0 :         v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
    6124           0 :         v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
    6125           0 :         v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
    6126           0 :         v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
    6127           0 :         v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
    6128           0 :         v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
    6129           0 :         v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
    6130           0 :         v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
    6131           0 :         v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
    6132           0 :         v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
    6133           0 :         v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
    6134           0 :         v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
    6135           0 :         v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
    6136           0 :         v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
    6137             : 
    6138           0 :         for (i = 56; i < 64; i++) v[i] = u[i];
    6139             : 
    6140             :         // stage 11
    6141           0 :         if (do_cols) {
    6142           0 :             for (i = 0; i < 32; i++)
    6143           0 :                 addsub_no_clamp_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
    6144             :         }
    6145             :         else {
    6146           0 :             const int32_t log_range_out = AOMMAX(16, bd + 6);
    6147           0 :             const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
    6148             :                 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    6149           0 :             const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
    6150             :                 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    6151             : 
    6152           0 :             for (i = 0; i < 32; i++) {
    6153           0 :                 addsub_shift_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
    6154             :                     &clamp_lo_out, &clamp_hi_out, out_shift);
    6155             :             }
    6156             :         }
    6157             :     }
    6158           0 : }
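For reference, stage 11 above either adds and subtracts the two symmetric halves directly (the do_cols path) or applies a rounded right shift by out_shift and clamps to a widened intermediate range, with log_range_out = AOMMAX(16, bd + 6). A minimal scalar sketch of that epilogue step, assuming addsub_shift_avx2 performs a rounded arithmetic shift followed by a clamp (the actual helper is an AVX2 routine defined elsewhere in this file):

#include <stdint.h>

/* Scalar model (assumption) of one stage-11 output pair: out0 = a + b and
 * out1 = a - b, each rounded, shifted right by `shift`, then clamped to
 * [lo, hi]. The AVX2 helper does the same for eight 32-bit lanes at once. */
void addsub_shift_scalar(int32_t a, int32_t b, int32_t *out0, int32_t *out1,
                         int32_t lo, int32_t hi, int32_t shift) {
    const int32_t rnd = 1 << (shift - 1); /* rounding offset; shift > 0 */
    int32_t s = (a + rnd + b) >> shift;
    int32_t d = (a + rnd - b) >> shift;
    if (s < lo) s = lo; else if (s > hi) s = hi;
    if (d < lo) d = lo; else if (d > hi) d = hi;
    *out0 = s;
    *out1 = d;
}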
    6159             : 
    6160             : typedef void(*transform_1d_avx2)(__m256i *in, __m256i *out, int32_t bit,
    6161             :     int32_t do_cols, int32_t bd, int32_t out_shift);
    6162             : 
    6163             : static const transform_1d_avx2
    6164             : highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
    6165             :     {
    6166             :         { NULL, NULL, NULL, NULL },
    6167             :         { NULL, NULL, NULL, NULL },
    6168             :         { NULL, NULL, NULL, NULL },
    6169             :     },
    6170             :     {
    6171             :         { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
    6172             :         { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
    6173             :         { iidentity8_avx2, iidentity8_avx2, iidentity8_avx2, iidentity8_avx2 },
    6174             :     },
    6175             :     {
    6176             :         { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
    6177             :         { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
    6178             :         { iidentity16_avx2, iidentity16_avx2, iidentity16_avx2, iidentity16_avx2 },
    6179             :     },
    6180             :     { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2_new },
    6181             :     { NULL, NULL, NULL, NULL },
    6182             :     { iidentity32_avx2, iidentity32_avx2, iidentity32_avx2, iidentity32_avx2 } },
    6183             :     { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
    6184             :     { NULL, NULL, NULL, NULL },
    6185             :     { NULL, NULL, NULL, NULL } }
    6186             : };
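The table above is indexed by 1D transform size, 1D transform class (DCT, ADST, identity), and a fourth index that picks a progressively pruned kernel (low1 / low8 / low16 / full) according to how many coefficients are non-zero. A self-contained miniature of that eob-pruned dispatch pattern; the stub kernels and threshold values below are hypothetical and are not the project's helpers:

#include <stdio.h>

typedef void (*kernel_1d)(const int *in, int *out, int n);

/* Hypothetical stub kernels: the more pruned the kernel, the fewer input
 * coefficients it needs to read. */
static void k_low1(const int *in, int *out, int n) { for (int i = 0; i < n; i++) out[i] = in[0]; }
static void k_low8(const int *in, int *out, int n) { for (int i = 0; i < n; i++) out[i] = (i < 8) ? in[i] : 0; }
static void k_full(const int *in, int *out, int n) { for (int i = 0; i < n; i++) out[i] = in[i]; }

int main(void) {
    /* eob -> pruning index, mirroring the role of lowbd_txfm_all_1d_zeros_idx
     * below (the real thresholds are not reproduced here). */
    const kernel_1d table[3] = { k_low1, k_low8, k_full };
    const int eob = 5;
    const int idx = (eob <= 1) ? 0 : (eob <= 8) ? 1 : 2;

    int in[16] = { 7, 1, 2, 3, 4 }, out[16];
    table[idx](in, out, 16);
    printf("kernel %d, out[0] = %d\n", idx, out[0]);
    return 0;
}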
    6187             : 
    6188           0 : static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
    6189             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    6190             :     TxType tx_type,
    6191             :     TxSize tx_size, int32_t eob,
    6192             :     const int32_t bd) {
    6193             :     __m256i buf1[64 * 8];
    6194             :     int32_t eobx, eoby;
    6195           0 :     get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
    6196           0 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    6197           0 :     const int32_t txw_idx = get_txw_idx(tx_size);
    6198           0 :     const int32_t txh_idx = get_txh_idx(tx_size);
    6199           0 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    6200           0 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    6201           0 :     const int32_t buf_size_w_div8 = txfm_size_col >> 3;
    6202           0 :     const int32_t buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
    6203           0 :     const int32_t buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
    6204           0 :     const int32_t input_stride = AOMMIN(32, txfm_size_col);
    6205           0 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    6206             :     ASSERT(eobx < 32);
    6207             :     ASSERT(eoby < 32);
    6208           0 :     const int32_t fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
    6209           0 :     const int32_t fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
    6210           0 :     const transform_1d_avx2 row_txfm =
    6211           0 :         highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
    6212           0 :     const transform_1d_avx2 col_txfm =
    6213           0 :         highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
    6214             : 
    6215           0 :     assert(col_txfm != NULL);
    6216           0 :     assert(row_txfm != NULL);
    6217             :     int32_t ud_flip, lr_flip;
    6218           0 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    6219             : 
    6220             :     // 1st stage: row transform
    6221           0 :     for (int32_t i = 0; i < buf_size_nonzero_h_div8; i++) {
    6222             :         __m256i buf0[64];
    6223           0 :         const int32_t *input_row = input + i * input_stride * 8;
    6224           0 :         for (int32_t j = 0; j < buf_size_nonzero_w_div8; ++j) {
    6225           0 :             __m256i *buf0_cur = buf0 + j * 8;
    6226           0 :             load_buffer_32x32_new(input_row + j * 8, buf0_cur, input_stride, 8);
    6227             : 
    6228           0 :             transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
    6229             :         }
    6230           0 :         if (rect_type == 1 || rect_type == -1) {
    6231           0 :             av1_round_shift_rect_array_32_avx2(
    6232             :                 buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
    6233             :         }
    6234           0 :         row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
    6235             : 
    6236           0 :         __m256i *_buf1 = buf1 + i * 8;
    6237           0 :         if (lr_flip) {
    6238           0 :             for (int32_t j = 0; j < buf_size_w_div8; ++j) {
    6239           0 :                 transpose_8x8_flip_avx2(
    6240           0 :                     &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
    6241             :             }
    6242             :         }
    6243             :         else {
    6244           0 :             for (int32_t j = 0; j < buf_size_w_div8; ++j)
    6245           0 :                 transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
    6246             :         }
    6247             :     }
    6248             :     // 2nd stage: column transform
    6249           0 :     for (int32_t i = 0; i < buf_size_w_div8; i++) {
    6250           0 :         col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
    6251           0 :             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
    6252             : 
    6253           0 :         av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
    6254           0 :             buf1 + i * txfm_size_row, txfm_size_row,
    6255           0 :             -shift[1]);
    6256             :     }
    6257             : 
    6258             :     // write to buffer
    6259           0 :     if (txfm_size_col >= 16) {
    6260           0 :         for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
    6261           0 :             highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
    6262           0 :                 output_r + 16 * i, stride_r,
    6263           0 :                 output_w + 16 * i, stride_w,
    6264             :                 ud_flip, txfm_size_row, bd);
    6265             :         }
    6266             :     }
    6267           0 :     else if (txfm_size_col == 8) {
    6268           0 :         highbd_write_buffer_8xn_avx2(buf1,
    6269             :             output_r, stride_r, output_w, stride_w,
    6270             :             ud_flip, txfm_size_row, bd);
    6271             :     }
    6272           0 : }
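The function above is the generic two-pass path: a row transform over only the (eoby + 8) >> 3 groups of eight rows that can hold non-zero coefficients, a transpose into buf1 (optionally flipped), then a column transform and a final round shift before the result is written back. A minimal scalar sketch of the row-then-column structure on a 4x4 block; the 1D kernel here is a placeholder butterfly, not one of the AVX2 kernels used above:

#include <stdio.h>

#define N 4

/* Placeholder 1D transform: a simple 2-point add/subtract butterfly, used
 * only to show the two-pass structure. */
static void txfm_1d(const int *in, int *out) {
    for (int i = 0; i < N; i += 2) {
        out[i]     = in[i] + in[i + 1];
        out[i + 1] = in[i] - in[i + 1];
    }
}

int main(void) {
    int coeff[N][N] = { { 16, 4, 0, 0 }, { 8, 0, 0, 0 } }; /* sparse input */
    int tmp[N][N], res[N][N], col[N], out[N];

    /* Pass 1: transform each row. */
    for (int r = 0; r < N; r++) txfm_1d(coeff[r], tmp[r]);

    /* Pass 2: transform each column of the row-transformed data. */
    for (int c = 0; c < N; c++) {
        for (int r = 0; r < N; r++) col[r] = tmp[r][c];
        txfm_1d(col, out);
        for (int r = 0; r < N; r++) res[r][c] = out[r];
    }

    printf("res[0][0] = %d\n", res[0][0]); /* 16 + 4 + 8 = 28 */
    return 0;
}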
    6273             : 
    6274           0 : static void highbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
    6275             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    6276             :     TxType tx_type, TxSize tx_size,
    6277             :     int32_t eob, const int8_t bd) {
    6278             :     (void)eob;
    6279             :     __m256i buf1[64 * 2];
    6280             : 
    6281           0 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    6282           0 :     const int32_t txw_idx = get_txw_idx(tx_size);
    6283           0 :     const int32_t txh_idx = get_txh_idx(tx_size);
    6284           0 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    6285           0 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    6286             : 
    6287           0 :     const int32_t input_stride = AOMMIN(32, txfm_size_col);
    6288           0 :     const int32_t row_max = AOMMIN(32, txfm_size_row);
    6289           0 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    6290             : 
    6291           0 :     const transform_1d_avx2 row_txfm =
    6292           0 :         highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
    6293           0 :     const transform_1d_avx2 col_txfm =
    6294           0 :         highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
    6295             : 
    6296             :     int32_t ud_flip, lr_flip, j;
    6297           0 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    6298             : 
    6299             :     // 1st stage: row transform
    6300           0 :     for (int32_t i = 0; i < (row_max >> 3); ++i) {
    6301             :         __m256i buf0[32];
    6302           0 :         const int32_t *input_row = input + i * input_stride * 8;
    6303           0 :         for (int32_t j = 0; j < (input_stride >> 3); ++j) {
    6304           0 :             __m256i *buf0_cur = buf0 + j * 8;
    6305           0 :             load_buffer_32x32_new(input_row + j * 8, buf0_cur, input_stride, 8);
    6306             :         }
    6307           0 :         if (rect_type == 1 || rect_type == -1) {
    6308           0 :             av1_round_shift_rect_array_32_avx2(buf0, buf0, input_stride, 0,
    6309             :                 NewInvSqrt2);
    6310             :         }
    6311           0 :         row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
    6312             : 
    6313           0 :         __m256i *_buf1 = buf1 + i * 8;
    6314             : 
    6315           0 :         for (j = 0; j < (input_stride >> 3); ++j) {
    6316           0 :             _buf1[j * txfm_size_row + 0] = buf0[j * 8 + 0];
    6317           0 :             _buf1[j * txfm_size_row + 1] = buf0[j * 8 + 1];
    6318           0 :             _buf1[j * txfm_size_row + 2] = buf0[j * 8 + 2];
    6319           0 :             _buf1[j * txfm_size_row + 3] = buf0[j * 8 + 3];
    6320           0 :             _buf1[j * txfm_size_row + 4] = buf0[j * 8 + 4];
    6321           0 :             _buf1[j * txfm_size_row + 5] = buf0[j * 8 + 5];
    6322           0 :             _buf1[j * txfm_size_row + 6] = buf0[j * 8 + 6];
    6323           0 :             _buf1[j * txfm_size_row + 7] = buf0[j * 8 + 7];
    6324             :         }
    6325             :     }
    6326             :     // 2nd stage: column transform
    6327           0 :     for (int32_t i = 0; i < (input_stride >> 3); i++) {
    6328           0 :         col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
    6329           0 :             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
    6330             : 
    6331           0 :         av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
    6332           0 :             buf1 + i * txfm_size_row, txfm_size_row,
    6333           0 :             -shift[1]);
    6334             :     }
    6335             : 
    6336             :     // write to buffer
    6337           0 :     if (txfm_size_col >= 16) {
    6338           0 :         for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
    6339           0 :             highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
    6340           0 :                 output_r + 16 * i, stride_r,
    6341           0 :                 output_w + 16 * i, stride_w,
    6342             :                 ud_flip, txfm_size_row, bd);
    6343             :         }
    6344             :     }
    6345           0 :     else if (txfm_size_col == 8) {
    6346           0 :         highbd_write_buffer_8xn_avx2(buf1,
    6347             :             output_r, stride_r, output_w, stride_w,
    6348             :             ud_flip, txfm_size_row,
    6349             :             bd);
    6350             :     }
    6351           0 : }
    6352           0 : static void highbd_inv_txfm2d_add_v_identity_avx2(const int32_t *input,
    6353             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    6354             :     TxType tx_type, TxSize tx_size,
    6355             :     int32_t eob, const int8_t bd) {
    6356             :     __m256i buf1[64];
    6357             :     int32_t eobx, eoby;
    6358           0 :     get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
    6359           0 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    6360           0 :     const int32_t txw_idx = get_txw_idx(tx_size);
    6361           0 :     const int32_t txh_idx = get_txh_idx(tx_size);
    6362           0 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    6363           0 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    6364           0 :     const int32_t input_stride = AOMMIN(32, txfm_size_col);
    6365           0 :     const int32_t buf_size_w_div8 = input_stride >> 3;
    6366           0 :     const int32_t buf_size_h_div8 = (eoby + 8) >> 3;
    6367           0 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    6368           0 :     const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
    6369           0 :     const transform_1d_avx2 row_txfm =
    6370           0 :         highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
    6371           0 :     const transform_1d_avx2 col_txfm =
    6372           0 :         highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
    6373             :     int32_t ud_flip, lr_flip;
    6374           0 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    6375             : 
    6376           0 :     for (int32_t i = 0; i < (buf_size_h_div8 /*<< 1*/); ++i) {
    6377             :         __m256i buf0[16];
    6378           0 :         const int32_t *input_row = input + i * input_stride * 8;
    6379           0 :         for (int32_t j = 0; j < buf_size_w_div8; ++j) {
    6380           0 :             __m256i *buf0_cur = buf0 + j * 8;
    6381           0 :             load_buffer_32x32_new(input_row + j * 8, buf0_cur, input_stride, 8);
    6382             :         }
    6383           0 :         if (rect_type == 1 || rect_type == -1) {
    6384           0 :             av1_round_shift_rect_array_32_avx2(buf0, buf0, input_stride, 0,
    6385             :                 NewInvSqrt2);
    6386             :         }
    6387           0 :         row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
    6388             : 
    6389           0 :         __m256i *_buf1 = buf1 + i * 8;
    6390             : 
    6391           0 :         for (int32_t j = 0; j < buf_size_w_div8; ++j) {
    6392           0 :             _buf1[j * txfm_size_row + 0] = buf0[j * 8 + 0];
    6393           0 :             _buf1[j * txfm_size_row + 1] = buf0[j * 8 + 1];
    6394           0 :             _buf1[j * txfm_size_row + 2] = buf0[j * 8 + 2];
    6395           0 :             _buf1[j * txfm_size_row + 3] = buf0[j * 8 + 3];
    6396           0 :             _buf1[j * txfm_size_row + 4] = buf0[j * 8 + 4];
    6397           0 :             _buf1[j * txfm_size_row + 5] = buf0[j * 8 + 5];
    6398           0 :             _buf1[j * txfm_size_row + 6] = buf0[j * 8 + 6];
    6399           0 :             _buf1[j * txfm_size_row + 7] = buf0[j * 8 + 7];
    6400             :         }
    6401             :     }
    6402           0 :     for (int32_t i = 0; i < buf_size_w_div8; i++) {
    6403           0 :         col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
    6404           0 :             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
    6405             : 
    6406           0 :         av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
    6407           0 :             buf1 + i * txfm_size_row, txfm_size_row,
    6408           0 :             -shift[1]);
    6409             :     }
    6410             : 
    6411             :     // write to buffer
    6412           0 :     if (txfm_size_col >= 16) {
    6413           0 :         for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
    6414           0 :             highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
    6415           0 :                 output_r + 16 * i, stride_r,
    6416           0 :                 output_w + 16 * i, stride_w,
    6417             :                 ud_flip, txfm_size_row, bd);
    6418             :         }
    6419             :     }
    6420           0 :     else if (txfm_size_col == 8) {
    6421           0 :         highbd_write_buffer_8xn_avx2(buf1,
    6422             :             output_r, stride_r, output_w, stride_w,
    6423             :             ud_flip, txfm_size_row, bd);
    6424             :     }
    6425           0 : }
    6426           0 : static void highbd_inv_txfm2d_add_h_identity_avx2(const int32_t *input,
    6427             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    6428             :     TxType tx_type, TxSize tx_size, int32_t eob, const int32_t bd) {
    6429             :     __m256i buf1[32];
    6430             :     int32_t eobx, eoby;
    6431           0 :     get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
    6432           0 :     const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    6433           0 :     const int32_t txw_idx = get_txw_idx(tx_size);
    6434           0 :     const int32_t txh_idx = get_txh_idx(tx_size);
    6435           0 :     const int32_t txfm_size_col = tx_size_wide[tx_size];
    6436           0 :     const int32_t txfm_size_row = tx_size_high[tx_size];
    6437           0 :     const int32_t input_stride = AOMMIN(32, txfm_size_col);
    6438           0 :     const int32_t buf_size_w_div8 = input_stride >> 3;
    6439           0 :     const int32_t row_max = AOMMIN(32, txfm_size_row);
    6440           0 :     const int32_t buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
    6441           0 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    6442           0 :     const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
    6443           0 :     const transform_1d_avx2 row_txfm =
    6444           0 :         highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
    6445           0 :     const transform_1d_avx2 col_txfm =
    6446           0 :         highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
    6447             :     int32_t ud_flip, lr_flip;
    6448           0 :     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    6449             : 
    6450           0 :     for (int32_t i = 0; i < (row_max >> 3); ++i) {
    6451             :         __m256i buf0[32];
    6452           0 :         const int32_t *input_row = input + i * input_stride * 8;
    6453           0 :         for (int32_t j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
    6454           0 :             __m256i *buf0_cur = buf0 + j * 8;
    6455           0 :             load_buffer_32x32_new(input_row + j * 8, buf0_cur, input_stride, 8);
    6456             : 
    6457           0 :             transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
    6458             :         }
    6459           0 :         if (rect_type == 1 || rect_type == -1) {
    6460           0 :             av1_round_shift_rect_array_32_avx2(
    6461             :                 buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
    6462             :         }
    6463           0 :         row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
    6464             : 
    6465           0 :         __m256i *_buf1 = buf1 + i * 8;
    6466           0 :         if (lr_flip) {
    6467           0 :             for (int32_t j = 0; j < buf_size_w_div8; ++j) {
    6468           0 :                 transpose_8x8_flip_avx2(
    6469           0 :                     &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
    6470             :             }
    6471             :         }
    6472             :         else {
    6473           0 :             for (int32_t j = 0; j < buf_size_w_div8; ++j)
    6474           0 :                 transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
    6475             :         }
    6476             :     }
    6477           0 :     for (int32_t i = 0; i < buf_size_w_div8; i++) {
    6478           0 :         col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
    6479           0 :             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
    6480             : 
    6481           0 :         av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
    6482           0 :             buf1 + i * txfm_size_row, txfm_size_row,
    6483           0 :             -shift[1]);
    6484             :     }
    6485             : 
    6486             :     // write to buffer
    6487           0 :     if (txfm_size_col >= 16) {
    6488           0 :         for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
    6489           0 :             highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
    6490           0 :                 output_r + 16 * i, stride_r,
    6491           0 :                 output_w + 16 * i, stride_w,
    6492             :                 ud_flip, txfm_size_row, bd);
    6493             :         }
    6494             :     }
    6495           0 :     else if (txfm_size_col == 8) {
    6496           0 :         highbd_write_buffer_8xn_avx2(buf1,
    6497             :             output_r, stride_r, output_w, stride_w,
    6498             :             ud_flip, txfm_size_row, bd);
    6499             :     }
    6500           0 : }
    6501           0 : void eb_av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
    6502             :     uint16_t *output_r, int32_t stride_r,
    6503             :     uint16_t *output_w, int32_t stride_w,
    6504             :     TxType tx_type, TxSize tx_size,
    6505             :     int32_t eob, const int32_t bd) {
    6506           0 :     switch (tx_type)
    6507             :     {
    6508           0 :     case DCT_DCT:
    6509             :     case ADST_DCT:
    6510             :     case DCT_ADST:
    6511             :     case ADST_ADST:
    6512             :     case FLIPADST_DCT:
    6513             :     case DCT_FLIPADST:
    6514             :     case FLIPADST_FLIPADST:
    6515             :     case ADST_FLIPADST:
    6516             :     case FLIPADST_ADST:
    6517           0 :         highbd_inv_txfm2d_add_no_identity_avx2(input,
    6518             :             output_r, stride_r, output_w, stride_w,
    6519             :             tx_type, tx_size, eob, bd);
    6520           0 :         break;
    6521           0 :     case IDTX:
    6522           0 :         highbd_inv_txfm2d_add_idtx_avx2(input,
    6523             :             output_r, stride_r, output_w, stride_w,
    6524             :             tx_type, tx_size, eob, bd);
    6525           0 :         break;
    6526           0 :     case V_DCT:
    6527             :     case V_ADST:
    6528             :     case V_FLIPADST:
    6529           0 :         highbd_inv_txfm2d_add_v_identity_avx2(input,
    6530             :             output_r, stride_r, output_w, stride_w,
    6531             :             tx_type, tx_size, eob, bd);
    6532           0 :         break;
    6533           0 :     case H_DCT:
    6534             :     case H_ADST:
    6535             :     case H_FLIPADST:
    6536           0 :         highbd_inv_txfm2d_add_h_identity_avx2(input,
    6537             :             output_r, stride_r, output_w, stride_w,
    6538             :             tx_type, tx_size, eob, bd);
    6539           0 :         break;
    6540           0 :     default: break;
    6541             :     }
    6542           0 : }
    6543           0 : void eb_av1_highbd_inv_txfm_add_avx2(const int32_t *input,
    6544             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    6545             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd) {
    6546             :     //assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    6547             : 
    6548           0 :     eb_av1_highbd_inv_txfm2d_add_universe_avx2(
    6549             :         input, output_r, stride_r, output_w, stride_w, tx_type, tx_size,
    6550             :         eob, bd);
    6551           0 : }
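A hypothetical call site for the entry point above, assuming the surrounding project headers are already included; the buffers, stride, eob, and bit depth are illustrative values, not ones taken from the encoder:

/* Inverse-transform a 16x16 DCT_DCT block of 10-bit coefficients and add the
 * result to the reconstruction buffer in place (read and write surfaces alias
 * here; the API also allows separate read/write buffers and strides). */
int32_t  coeffs[16 * 16] = { 0 };  /* dequantized coefficients (assumed) */
uint16_t recon[16 * 16]  = { 0 };  /* 10-bit prediction / reconstruction */
const int32_t stride = 16;
const int32_t eob = 1;             /* only the DC coefficient is non-zero */

eb_av1_highbd_inv_txfm_add_avx2(coeffs, recon, stride, recon, stride,
                                DCT_DCT, TX_16X16, eob, /*bd=*/10);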

Generated by: LCOV version 1.14