LCOV - code coverage report
Current view: top level - Codec - EbTransforms.c (source / functions)
Test: coverage.info
Date: 2019-11-25 17:38:06
                     Hit     Total   Coverage
Lines:               674      5484     12.3 %
Functions:            14       138     10.1 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : /*
       7             : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       8             : *
       9             : * This source code is subject to the terms of the BSD 2 Clause License and
      10             : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      11             : * was not distributed with this source code in the LICENSE file, you can
      12             : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      13             : * Media Patent License 1.0 was not distributed with this source code in the
      14             : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      15             : */
      16             : 
      17             : #include <stdlib.h>
      18             : #include "EbUtility.h"
      19             : #include "EbPictureOperators.h"
      20             : #include "EbDefinitions.h"
      21             : #include "EbTransforms.h"
      22             : #include "aom_dsp_rtcd.h"
      23             : 
      24           0 : uint32_t CheckNZero4x4(
      25             :     int16_t  *coeff,
      26             :     uint32_t   coeff_stride){
      27           0 :     const uint32_t stride = coeff_stride / 4;
      28             : 
      29           0 :     uint64_t * coefPtr = (uint64_t *)coeff;
      30             : 
      31           0 :     if (coefPtr[0] > 0)
      32           0 :         return 1;
      33           0 :     else if (coefPtr[stride] > 0)
      34           0 :         return 1;
      35           0 :     else if (coefPtr[2 * stride] > 0)
      36           0 :         return 1;
      37           0 :     else if (coefPtr[3 * stride] > 0)
      38           0 :         return 1;
      39             : 
      40           0 :     return 0;
      41             : }
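
CheckNZero4x4 treats each row of four int16_t coefficients as a single uint64_t, so one compare per row detects any nonzero coefficient; because the words are unsigned, the "> 0" tests behave as "!= 0". A minimal standalone sketch of the same check (the helper name and the alignment note are illustrative, not part of the report):

    #include <stdint.h>

    /* Sketch: return 1 if any coefficient of a 4x4 block is nonzero.
     * Assumes coeff_stride is given in int16_t units, is a multiple of 4,
     * and the buffer is suitably aligned for 64-bit loads, as the code
     * above also assumes. */
    static uint32_t check_nzero_4x4_sketch(const int16_t *coeff, uint32_t coeff_stride) {
        const uint32_t  stride = coeff_stride / 4;        /* stride in uint64_t units   */
        const uint64_t *rows   = (const uint64_t *)coeff; /* one row = 4 packed int16_t */
        for (uint32_t r = 0; r < 4; ++r)
            if (rows[r * stride] != 0)                    /* any set bit in the row     */
                return 1;
        return 0;
    }
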
      42             : 
      43             : const int8_t *eb_inv_txfm_shift_ls[TX_SIZES_ALL] = {
      44             :     inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32,
      45             :     inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16,
      46             :     inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64,
      47             :     inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32,
      48             :     inv_shift_32x8, inv_shift_16x64, inv_shift_64x16,
      49             : };
      50             : 
      51             : static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {
      52             :     fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2,
      53             :     fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2,
      54             :     fadst8_range_mult2, fadst16_range_mult2, fadst32_range_mult2,
      55             :     fidtx4_range_mult2, fidtx8_range_mult2, fidtx16_range_mult2,
      56             :     fidtx32_range_mult2, fidtx64_range_mult2
      57             : };
      58             : 
      59             : static const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
      60             :     fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
      61             :     fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16,
      62             :     fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
      63             :     fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32,
      64             :     fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
      65             : };
      66             : 
      67           0 : void mat_mult_out(
      68             :     int16_t           *coeff,
      69             :     const uint32_t     coeff_stride,
      70             :     int16_t*          coeff_out,
      71             :     const uint32_t     coeff_out_stride,
      72             :     const uint16_t     *masking_matrix,
      73             :     const uint32_t     masking_matrix_stride,
      74             :     const uint32_t     compute_size,
      75             :     const int32_t     offset,
      76             :     const int32_t     shift_num,
      77             :     uint32_t             *nonzerocoeff) {
      78           0 :     uint32_t coeffLocation = 0, coeffOutLocation = 0;
      79             :     uint32_t row_index, colIndex;
      80             :     int32_t coeffTemp;
      81             : 
      82           0 :     *nonzerocoeff = 0;
      83             : 
      84           0 :     for (row_index = 0; row_index < compute_size; ++row_index) {
      85           0 :         for (colIndex = 0; colIndex < compute_size; ++colIndex) {
      86           0 :             coeffTemp = (ABS(coeff[coeffLocation]) * masking_matrix[colIndex + row_index * masking_matrix_stride] + offset) >> shift_num;
      87           0 :             coeffTemp = (coeff[coeffLocation] < 0) ? -coeffTemp : coeffTemp;
      88             : 
      89           0 :             coeff_out[coeffOutLocation] = (int16_t)CLIP3(MIN_NEG_16BIT_NUM, MAX_POS_16BIT_NUM, coeffTemp);
      90             : 
      91           0 :             (*nonzerocoeff) += (coeffTemp != 0);
      92           0 :             ++coeffLocation;
      93           0 :             ++coeffOutLocation;
      94             :         }
      95           0 :         coeffLocation += coeff_stride - compute_size;
      96           0 :         coeffOutLocation += coeff_out_stride - compute_size;
      97             :     }
      98           0 : }
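
For each coefficient, mat_mult_out scales the magnitude by the matching masking-matrix entry, rounds via offset, shifts right by shift_num, restores the sign, clips to the int16_t range, and counts the result if it is nonzero. Worked example with illustrative values (coefficient -7, masking value 128, offset 128, shift_num 8):

    |-7| * 128 = 896
    896 + 128  = 1024
    1024 >> 8  = 4
    sign restored -> -4   (within the int16_t range, so CLIP3 leaves it unchanged)
    *nonzerocoeff increments by 1
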
      99             : 
     100             : /*****************************
     101             :  * function header
     102             :  *****************************/
     103             : 
     104             : uint64_t GetPMCost(
     105             :     uint64_t                   lambda,
     106             :     uint64_t                   tuDistortion,
     107             :     uint64_t                   y_tu_coeff_bits
     108             : );
     109             : 
     110             : /*****************************
     111             :  * Defines
     112             :  *****************************/
     113             : 
     114             : #define BETA_P              1
     115             : #define BETA_N              3
     116             : 
     117             :  /********************************************
     118             :   * Constants
     119             :   ********************************************/
     120             : 
     121             : #define ALPHA_0000   0
     122             : #define ALPHA_0050   50
     123             : 
     124             : #define ALPHA_0100   100
     125             : #define ALPHA_0200   200
     126             : #define ALPHA_0300   300
     127             : #define ALPHA_0500   500
     128             : #define ALPHA_1000  1000
     129             : 
     130             : EB_EXTERN EB_ALIGN(16) const int16_t TransformAsmConst[] = {
     131             :     2, 0, 2, 0, 2, 0, 2, 0,
     132             :     4, 0, 4, 0, 4, 0, 4, 0,
     133             :     8, 0, 8, 0, 8, 0, 8, 0,
     134             :     9, 0, 9, 0, 9, 0, 9, 0,
     135             :     13, 0, 13, 0, 13, 0, 13, 0,
     136             :     16, 0, 16, 0, 16, 0, 16, 0,
     137             :     18, 0, 18, 0, 18, 0, 18, 0,
     138             :     22, 0, 22, 0, 22, 0, 22, 0,
     139             :     25, 0, 25, 0, 25, 0, 25, 0,
     140             :     31, 0, 31, 0, 31, 0, 31, 0,
     141             :     36, 0, 36, 0, 36, 0, 36, 0,
     142             :     38, 0, 38, 0, 38, 0, 38, 0,
     143             :     43, 0, 43, 0, 43, 0, 43, 0,
     144             :     46, 0, 46, 0, 46, 0, 46, 0,
     145             :     50, 0, 50, 0, 50, 0, 50, 0,
     146             :     54, 0, 54, 0, 54, 0, 54, 0,
     147             :     57, 0, 57, 0, 57, 0, 57, 0,
     148             :     61, 0, 61, 0, 61, 0, 61, 0,
     149             :     67, 0, 67, 0, 67, 0, 67, 0,
     150             :     70, 0, 70, 0, 70, 0, 70, 0,
     151             :     73, 0, 73, 0, 73, 0, 73, 0,
     152             :     75, 0, 75, 0, 75, 0, 75, 0,
     153             :     78, 0, 78, 0, 78, 0, 78, 0,
     154             :     80, 0, 80, 0, 80, 0, 80, 0,
     155             :     82, 0, 82, 0, 82, 0, 82, 0,
     156             :     83, 0, 83, 0, 83, 0, 83, 0,
     157             :     85, 0, 85, 0, 85, 0, 85, 0,
     158             :     87, 0, 87, 0, 87, 0, 87, 0,
     159             :     88, 0, 88, 0, 88, 0, 88, 0,
     160             :     89, 0, 89, 0, 89, 0, 89, 0,
     161             :     90, 0, 90, 0, 90, 0, 90, 0,
     162             :     256, 0, 256, 0, 256, 0, 256, 0,
     163             :     512, 0, 512, 0, 512, 0, 512, 0,
     164             :     1024, 0, 1024, 0, 1024, 0, 1024, 0,
     165             :     83, 36, 83, 36, 83, 36, 83, 36,
     166             :     36, -83, 36, -83, 36, -83, 36, -83,
     167             :     89, 75, 89, 75, 89, 75, 89, 75,
     168             :     50, 18, 50, 18, 50, 18, 50, 18,
     169             :     75, -18, 75, -18, 75, -18, 75, -18,
     170             :     -89, -50, -89, -50, -89, -50, -89, -50,
     171             :     50, -89, 50, -89, 50, -89, 50, -89,
     172             :     18, 75, 18, 75, 18, 75, 18, 75,
     173             :     18, -50, 18, -50, 18, -50, 18, -50,
     174             :     75, -89, 75, -89, 75, -89, 75, -89,
     175             :     90, 87, 90, 87, 90, 87, 90, 87, // 16x16
     176             :     80, 70, 80, 70, 80, 70, 80, 70,
     177             :     57, 43, 57, 43, 57, 43, 57, 43,
     178             :     25, 9, 25, 9, 25, 9, 25, 9,
     179             :     87, 57, 87, 57, 87, 57, 87, 57,
     180             :     9, -43, 9, -43, 9, -43, 9, -43,
     181             :     -80, -90, -80, -90, -80, -90, -80, -90,
     182             :     -70, -25, -70, -25, -70, -25, -70, -25,
     183             :     80, 9, 80, 9, 80, 9, 80, 9,
     184             :     -70, -87, -70, -87, -70, -87, -70, -87,
     185             :     -25, 57, -25, 57, -25, 57, -25, 57,
     186             :     90, 43, 90, 43, 90, 43, 90, 43,
     187             :     70, -43, 70, -43, 70, -43, 70, -43,
     188             :     -87, 9, -87, 9, -87, 9, -87, 9,
     189             :     90, 25, 90, 25, 90, 25, 90, 25,
     190             :     -80, -57, -80, -57, -80, -57, -80, -57,
     191             :     57, -80, 57, -80, 57, -80, 57, -80,
     192             :     -25, 90, -25, 90, -25, 90, -25, 90,
     193             :     -9, -87, -9, -87, -9, -87, -9, -87,
     194             :     43, 70, 43, 70, 43, 70, 43, 70,
     195             :     43, -90, 43, -90, 43, -90, 43, -90,
     196             :     57, 25, 57, 25, 57, 25, 57, 25,
     197             :     -87, 70, -87, 70, -87, 70, -87, 70,
     198             :     9, -80, 9, -80, 9, -80, 9, -80,
     199             :     25, -70, 25, -70, 25, -70, 25, -70,
     200             :     90, -80, 90, -80, 90, -80, 90, -80,
     201             :     43, 9, 43, 9, 43, 9, 43, 9,
     202             :     -57, 87, -57, 87, -57, 87, -57, 87,
     203             :     9, -25, 9, -25, 9, -25, 9, -25,
     204             :     43, -57, 43, -57, 43, -57, 43, -57,
     205             :     70, -80, 70, -80, 70, -80, 70, -80,
     206             :     87, -90, 87, -90, 87, -90, 87, -90,
     207             : };
     208             : 
     209             : EB_ALIGN(16) const int16_t transform_asm_const_sse4_1[] = {
     210             :     2, 0, 2, 0, 2, 0, 2, 0,
     211             :     4, 0, 4, 0, 4, 0, 4, 0,
     212             :     8, 0, 8, 0, 8, 0, 8, 0,
     213             :     9, 0, 9, 0, 9, 0, 9, 0,
     214             :     13, 0, 13, 0, 13, 0, 13, 0,
     215             :     16, 0, 16, 0, 16, 0, 16, 0,
     216             :     18, 0, 18, 0, 18, 0, 18, 0,
     217             :     22, 0, 22, 0, 22, 0, 22, 0,
     218             :     25, 0, 25, 0, 25, 0, 25, 0,
     219             :     31, 0, 31, 0, 31, 0, 31, 0,
     220             :     36, 0, 36, 0, 36, 0, 36, 0,
     221             :     38, 0, 38, 0, 38, 0, 38, 0,
     222             :     43, 0, 43, 0, 43, 0, 43, 0,
     223             :     46, 0, 46, 0, 46, 0, 46, 0,
     224             :     50, 0, 50, 0, 50, 0, 50, 0,
     225             :     54, 0, 54, 0, 54, 0, 54, 0,
     226             :     57, 0, 57, 0, 57, 0, 57, 0,
     227             :     61, 0, 61, 0, 61, 0, 61, 0,
     228             :     67, 0, 67, 0, 67, 0, 67, 0,
     229             :     70, 0, 70, 0, 70, 0, 70, 0,
     230             :     73, 0, 73, 0, 73, 0, 73, 0,
     231             :     75, 0, 75, 0, 75, 0, 75, 0,
     232             :     78, 0, 78, 0, 78, 0, 78, 0,
     233             :     80, 0, 80, 0, 80, 0, 80, 0,
     234             :     82, 0, 82, 0, 82, 0, 82, 0,
     235             :     83, 0, 83, 0, 83, 0, 83, 0,
     236             :     85, 0, 85, 0, 85, 0, 85, 0,
     237             :     87, 0, 87, 0, 87, 0, 87, 0,
     238             :     88, 0, 88, 0, 88, 0, 88, 0,
     239             :     89, 0, 89, 0, 89, 0, 89, 0,
     240             :     90, 0, 90, 0, 90, 0, 90, 0,
     241             :     256, 0, 256, 0, 256, 0, 256, 0,
     242             :     512, 0, 512, 0, 512, 0, 512, 0,
     243             :     1024, 0, 1024, 0, 1024, 0, 1024, 0,
     244             :     83, 36, 83, 36, 83, 36, 83, 36,
     245             :     36, -83, 36, -83, 36, -83, 36, -83,
     246             :     89, 75, 89, 75, 89, 75, 89, 75,
     247             :     50, 18, 50, 18, 50, 18, 50, 18,
     248             :     75, -18, 75, -18, 75, -18, 75, -18,
     249             :     -89, -50, -89, -50, -89, -50, -89, -50,
     250             :     50, -89, 50, -89, 50, -89, 50, -89,
     251             :     18, 75, 18, 75, 18, 75, 18, 75,
     252             :     18, -50, 18, -50, 18, -50, 18, -50,
     253             :     75, -89, 75, -89, 75, -89, 75, -89,
     254             :     90, 87, 90, 87, 90, 87, 90, 87, // 16x16
     255             :     80, 70, 80, 70, 80, 70, 80, 70,
     256             :     57, 43, 57, 43, 57, 43, 57, 43,
     257             :     25, 9, 25, 9, 25, 9, 25, 9,
     258             :     87, 57, 87, 57, 87, 57, 87, 57,
     259             :     9, -43, 9, -43, 9, -43, 9, -43,
     260             :     -80, -90, -80, -90, -80, -90, -80, -90,
     261             :     -70, -25, -70, -25, -70, -25, -70, -25,
     262             :     80, 9, 80, 9, 80, 9, 80, 9,
     263             :     -70, -87, -70, -87, -70, -87, -70, -87,
     264             :     -25, 57, -25, 57, -25, 57, -25, 57,
     265             :     90, 43, 90, 43, 90, 43, 90, 43,
     266             :     70, -43, 70, -43, 70, -43, 70, -43,
     267             :     -87, 9, -87, 9, -87, 9, -87, 9,
     268             :     90, 25, 90, 25, 90, 25, 90, 25,
     269             :     -80, -57, -80, -57, -80, -57, -80, -57,
     270             :     57, -80, 57, -80, 57, -80, 57, -80,
     271             :     -25, 90, -25, 90, -25, 90, -25, 90,
     272             :     -9, -87, -9, -87, -9, -87, -9, -87,
     273             :     43, 70, 43, 70, 43, 70, 43, 70,
     274             :     43, -90, 43, -90, 43, -90, 43, -90,
     275             :     57, 25, 57, 25, 57, 25, 57, 25,
     276             :     -87, 70, -87, 70, -87, 70, -87, 70,
     277             :     9, -80, 9, -80, 9, -80, 9, -80,
     278             :     25, -70, 25, -70, 25, -70, 25, -70,
     279             :     90, -80, 90, -80, 90, -80, 90, -80,
     280             :     43, 9, 43, 9, 43, 9, 43, 9,
     281             :     -57, 87, -57, 87, -57, 87, -57, 87,
     282             :     9, -25, 9, -25, 9, -25, 9, -25,
     283             :     43, -57, 43, -57, 43, -57, 43, -57,
     284             :     70, -80, 70, -80, 70, -80, 70, -80,
     285             :     87, -90, 87, -90, 87, -90, 87, -90,
     286             :     90, 90, 90, 90, 90, 90, 90, 90, // 32x32
     287             :     88, 85, 88, 85, 88, 85, 88, 85,
     288             :     82, 78, 82, 78, 82, 78, 82, 78,
     289             :     73, 67, 73, 67, 73, 67, 73, 67,
     290             :     61, 54, 61, 54, 61, 54, 61, 54,
     291             :     46, 38, 46, 38, 46, 38, 46, 38,
     292             :     31, 22, 31, 22, 31, 22, 31, 22,
     293             :     13, 4, 13, 4, 13, 4, 13, 4,
     294             :     90, 82, 90, 82, 90, 82, 90, 82,
     295             :     67, 46, 67, 46, 67, 46, 67, 46,
     296             :     22, -4, 22, -4, 22, -4, 22, -4,
     297             :     -31, -54, -31, -54, -31, -54, -31, -54,
     298             :     -73, -85, -73, -85, -73, -85, -73, -85,
     299             :     -90, -88, -90, -88, -90, -88, -90, -88,
     300             :     -78, -61, -78, -61, -78, -61, -78, -61,
     301             :     -38, -13, -38, -13, -38, -13, -38, -13,
     302             :     88, 67, 88, 67, 88, 67, 88, 67,
     303             :     31, -13, 31, -13, 31, -13, 31, -13,
     304             :     -54, -82, -54, -82, -54, -82, -54, -82,
     305             :     -90, -78, -90, -78, -90, -78, -90, -78,
     306             :     -46, -4, -46, -4, -46, -4, -46, -4,
     307             :     38, 73, 38, 73, 38, 73, 38, 73,
     308             :     90, 85, 90, 85, 90, 85, 90, 85,
     309             :     61, 22, 61, 22, 61, 22, 61, 22,
     310             :     85, 46, 85, 46, 85, 46, 85, 46,
     311             :     -13, -67, -13, -67, -13, -67, -13, -67,
     312             :     -90, -73, -90, -73, -90, -73, -90, -73,
     313             :     -22, 38, -22, 38, -22, 38, -22, 38,
     314             :     82, 88, 82, 88, 82, 88, 82, 88,
     315             :     54, -4, 54, -4, 54, -4, 54, -4,
     316             :     -61, -90, -61, -90, -61, -90, -61, -90,
     317             :     -78, -31, -78, -31, -78, -31, -78, -31,
     318             :     82, 22, 82, 22, 82, 22, 82, 22,
     319             :     -54, -90, -54, -90, -54, -90, -54, -90,
     320             :     -61, 13, -61, 13, -61, 13, -61, 13,
     321             :     78, 85, 78, 85, 78, 85, 78, 85,
     322             :     31, -46, 31, -46, 31, -46, 31, -46,
     323             :     -90, -67, -90, -67, -90, -67, -90, -67,
     324             :     4, 73, 4, 73, 4, 73, 4, 73,
     325             :     88, 38, 88, 38, 88, 38, 88, 38,
     326             :     78, -4, 78, -4, 78, -4, 78, -4,
     327             :     -82, -73, -82, -73, -82, -73, -82, -73,
     328             :     13, 85, 13, 85, 13, 85, 13, 85,
     329             :     67, -22, 67, -22, 67, -22, 67, -22,
     330             :     -88, -61, -88, -61, -88, -61, -88, -61,
     331             :     31, 90, 31, 90, 31, 90, 31, 90,
     332             :     54, -38, 54, -38, 54, -38, 54, -38,
     333             :     -90, -46, -90, -46, -90, -46, -90, -46,
     334             :     73, -31, 73, -31, 73, -31, 73, -31,
     335             :     -90, -22, -90, -22, -90, -22, -90, -22,
     336             :     78, 67, 78, 67, 78, 67, 78, 67,
     337             :     -38, -90, -38, -90, -38, -90, -38, -90,
     338             :     -13, 82, -13, 82, -13, 82, -13, 82,
     339             :     61, -46, 61, -46, 61, -46, 61, -46,
     340             :     -88, -4, -88, -4, -88, -4, -88, -4,
     341             :     85, 54, 85, 54, 85, 54, 85, 54,
     342             :     67, -54, 67, -54, 67, -54, 67, -54,
     343             :     -78, 38, -78, 38, -78, 38, -78, 38,
     344             :     85, -22, 85, -22, 85, -22, 85, -22,
     345             :     -90, 4, -90, 4, -90, 4, -90, 4,
     346             :     90, 13, 90, 13, 90, 13, 90, 13,
     347             :     -88, -31, -88, -31, -88, -31, -88, -31,
     348             :     82, 46, 82, 46, 82, 46, 82, 46,
     349             :     -73, -61, -73, -61, -73, -61, -73, -61,
     350             :     61, -73, 61, -73, 61, -73, 61, -73,
     351             :     -46, 82, -46, 82, -46, 82, -46, 82,
     352             :     31, -88, 31, -88, 31, -88, 31, -88,
     353             :     -13, 90, -13, 90, -13, 90, -13, 90,
     354             :     -4, -90, -4, -90, -4, -90, -4, -90,
     355             :     22, 85, 22, 85, 22, 85, 22, 85,
     356             :     -38, -78, -38, -78, -38, -78, -38, -78,
     357             :     54, 67, 54, 67, 54, 67, 54, 67,
     358             :     54, -85, 54, -85, 54, -85, 54, -85,
     359             :     -4, 88, -4, 88, -4, 88, -4, 88,
     360             :     -46, -61, -46, -61, -46, -61, -46, -61,
     361             :     82, 13, 82, 13, 82, 13, 82, 13,
     362             :     -90, 38, -90, 38, -90, 38, -90, 38,
     363             :     67, -78, 67, -78, 67, -78, 67, -78,
     364             :     -22, 90, -22, 90, -22, 90, -22, 90,
     365             :     -31, -73, -31, -73, -31, -73, -31, -73,
     366             :     46, -90, 46, -90, 46, -90, 46, -90,
     367             :     38, 54, 38, 54, 38, 54, 38, 54,
     368             :     -90, 31, -90, 31, -90, 31, -90, 31,
     369             :     61, -88, 61, -88, 61, -88, 61, -88,
     370             :     22, 67, 22, 67, 22, 67, 22, 67,
     371             :     -85, 13, -85, 13, -85, 13, -85, 13,
     372             :     73, -82, 73, -82, 73, -82, 73, -82,
     373             :     4, 78, 4, 78, 4, 78, 4, 78,
     374             :     38, -88, 38, -88, 38, -88, 38, -88,
     375             :     73, -4, 73, -4, 73, -4, 73, -4,
     376             :     -67, 90, -67, 90, -67, 90, -67, 90,
     377             :     -46, -31, -46, -31, -46, -31, -46, -31,
     378             :     85, -78, 85, -78, 85, -78, 85, -78,
     379             :     13, 61, 13, 61, 13, 61, 13, 61,
     380             :     -90, 54, -90, 54, -90, 54, -90, 54,
     381             :     22, -82, 22, -82, 22, -82, 22, -82,
     382             :     31, -78, 31, -78, 31, -78, 31, -78,
     383             :     90, -61, 90, -61, 90, -61, 90, -61,
     384             :     4, 54, 4, 54, 4, 54, 4, 54,
     385             :     -88, 82, -88, 82, -88, 82, -88, 82,
     386             :     -38, -22, -38, -22, -38, -22, -38, -22,
     387             :     73, -90, 73, -90, 73, -90, 73, -90,
     388             :     67, -13, 67, -13, 67, -13, 67, -13,
     389             :     -46, 85, -46, 85, -46, 85, -46, 85,
     390             :     22, -61, 22, -61, 22, -61, 22, -61,
     391             :     85, -90, 85, -90, 85, -90, 85, -90,
     392             :     73, -38, 73, -38, 73, -38, 73, -38,
     393             :     -4, 46, -4, 46, -4, 46, -4, 46,
     394             :     -78, 90, -78, 90, -78, 90, -78, 90,
     395             :     -82, 54, -82, 54, -82, 54, -82, 54,
     396             :     -13, -31, -13, -31, -13, -31, -13, -31,
     397             :     67, -88, 67, -88, 67, -88, 67, -88,
     398             :     13, -38, 13, -38, 13, -38, 13, -38,
     399             :     61, -78, 61, -78, 61, -78, 61, -78,
     400             :     88, -90, 88, -90, 88, -90, 88, -90,
     401             :     85, -73, 85, -73, 85, -73, 85, -73,
     402             :     54, -31, 54, -31, 54, -31, 54, -31,
     403             :     4, 22, 4, 22, 4, 22, 4, 22,
     404             :     -46, 67, -46, 67, -46, 67, -46, 67,
     405             :     -82, 90, -82, 90, -82, 90, -82, 90,
     406             :     4, -13, 4, -13, 4, -13, 4, -13,
     407             :     22, -31, 22, -31, 22, -31, 22, -31,
     408             :     38, -46, 38, -46, 38, -46, 38, -46,
     409             :     54, -61, 54, -61, 54, -61, 54, -61,
     410             :     67, -73, 67, -73, 67, -73, 67, -73,
     411             :     78, -82, 78, -82, 78, -82, 78, -82,
     412             :     85, -88, 85, -88, 85, -88, 85, -88,
     413             :     90, -90, 90, -90, 90, -90, 90, -90,
     414             : };
     415             : 
     416             : #define PMP_PRECISION     8
     417             : #define PMP_MAX          (1<<PMP_PRECISION)
     418             : 
     419             : #define M_100      100*PMP_MAX/100
     420             : #define M_90        90*PMP_MAX/100
     421             : #define M_80        80*PMP_MAX/100
     422             : #define M_70        70*PMP_MAX/100
     423             : #define M_60        60*PMP_MAX/100
     424             : #define M_50        50*PMP_MAX/100
     425             : #define M_40        40*PMP_MAX/100
     426             : #define M_30        30*PMP_MAX/100
     427             : #define M_25        25*PMP_MAX/100
     428             : #define M_20        20*PMP_MAX/100
     429             : #define M_10        10*PMP_MAX/100
     430             : #define M_0             0*PMP_MAX/100
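
The M_xx values express percentages in PMP_PRECISION-bit fixed point, e.g.:

    M_100 = 100 * PMP_MAX / 100 = 100 * 256 / 100 = 256
    M_60  =  60 * 256 / 100     = 153   (integer division truncates 153.6)
    M_50  =  50 * 256 / 100     = 128

An entry of M_100 therefore passes a coefficient through unchanged once the product is shifted back down by PMP_PRECISION, which is the natural shift_num to pair with mat_mult_out (an assumption about intended use; this file does not state it).
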
     431             : 
     432             : // Level0
     433             : // 4K
     434             : // 4x4
     435             : static const uint16_t MaskingMatrix4x4_Level0_4K[] = {
     436             :     M_100, M_100, M_100, M_100,
     437             :     M_100, M_100, M_100, M_100,
     438             :     M_100, M_100, M_100, M_100,
     439             :     M_100, M_100, M_100, M_100
     440             : };
     441             : // 8x8
     442             : static const uint16_t MaskingMatrix8x8_Level0_4K[] = {
     443             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     444             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     445             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     446             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     447             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     448             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     449             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     450             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     451             : };
     452             : // 16x16
     453             : static const uint16_t MaskingMatrix16x16_Level0_4K[] = {
     454             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     455             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     456             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     457             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     458             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     459             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     460             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     461             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     462             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     463             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     464             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     465             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     466             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     467             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     468             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     469             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     470             : };
     471             : // 32x32
     472             : static const uint16_t MaskingMatrix32x32_Level0_4K[] = {
     473             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     474             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     475             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     476             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     477             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     478             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     479             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     480             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     481             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     482             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     483             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     484             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     485             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     486             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     487             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     488             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     489             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     490             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     491             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     492             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     493             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     494             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     495             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     496             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     497             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     498             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     499             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     500             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     501             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     502             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     503             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     504             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     505             : };
     506             : 
     507             : // 1080
     508             : // 4x4
     509             : static const uint16_t MaskingMatrix4x4_Level0_1080p[] = {
     510             :     M_100, M_100, M_100, M_100,
     511             :     M_100, M_100, M_100, M_100,
     512             :     M_100, M_100, M_100, M_100,
     513             :     M_100, M_100, M_100, M_100
     514             : };
     515             : // 8x8
     516             : static const uint16_t MaskingMatrix8x8_Level0_1080p[] = {
     517             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     518             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     519             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     520             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     521             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     522             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     523             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     524             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     525             : };
     526             : // 16x16
     527             : static const uint16_t MaskingMatrix16x16_Level0_1080p[] = {
     528             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     529             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     530             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     531             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     532             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     533             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     534             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     535             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     536             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     537             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     538             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     539             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     540             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     541             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     542             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     543             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     544             : };
     545             : // 32x32
     546             : static const uint16_t MaskingMatrix32x32_Level0_1080p[] = {
     547             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     548             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     549             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     550             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     551             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     552             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     553             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     554             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     555             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     556             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     557             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     558             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     559             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     560             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     561             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     562             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     563             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     564             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     565             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     566             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     567             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     568             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     569             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     570             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     571             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     572             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     573             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     574             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     575             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     576             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     577             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     578             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     579             : };
     580             : 
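
Since the Level0 matrices are all M_100, masking with them is effectively a pass-through, whereas the Level1 matrices below attenuate the higher-frequency (bottom-right) coefficients. A hedged usage sketch for one 4x4 block, assuming offset and shift_num follow the PMP fixed point above (the helper name and buffers are illustrative):

    /* Sketch: apply the 4x4 Level0 (4K) masking matrix to one block.
     * Choosing offset = half step and shift_num = PMP_PRECISION is an
     * assumption that undoes the M_xx fixed-point scale. */
    static uint32_t mask_one_4x4_block(int16_t *in, int16_t *out) {
        uint32_t nonzero = 0;
        mat_mult_out(in,  4,                        /* coeff + stride (int16_t units) */
                     out, 4,                        /* coeff_out + stride             */
                     MaskingMatrix4x4_Level0_4K, 4, /* masking matrix + its stride    */
                     4,                             /* compute_size: full 4x4 region  */
                     1 << (PMP_PRECISION - 1),      /* rounding offset                */
                     PMP_PRECISION,                 /* shift_num                      */
                     &nonzero);
        return nonzero;                             /* surviving nonzero coefficients */
    }
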
     581             : // Level1
     582             : // 4K
     583             : 
     584             : // 4x4
     585             : static const uint16_t MaskingMatrix4x4_Level1_4K[] = {
     586             :     M_100, M_100, M_50, M_50,
     587             :     M_100, M_100, M_50, M_50,
     588             :     M_50, M_50, M_50, M_50,
     589             :     M_50, M_50, M_50, M_50,
     590             : };
     591             : // 8x8
     592             : static const uint16_t MaskingMatrix8x8_Level1_4K[] = {
     593             :     M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60,
     594             :     M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60,
     595             :     M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50,
     596             :     M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50,
     597             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50,
     598             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50,
     599             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50,
     600             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50,
     601             : };
     602             : 
     603             : static const uint16_t MaskingMatrix8x8_Level1_MOD1_4K[] = {
     604             :     M_100, M_100, M_100, M_100, M_80, M_80, M_80, M_80,
     605             :     M_100, M_100, M_100, M_100, M_80, M_80, M_80, M_80,
     606             :     M_100, M_100, M_100, M_100, M_70, M_70, M_70, M_70,
     607             :     M_100, M_100, M_100, M_100, M_70, M_70, M_70, M_70,
     608             :     M_80, M_80, M_70, M_70, M_70, M_70, M_70, M_70,
     609             :     M_80, M_80, M_70, M_70, M_70, M_70, M_70, M_70,
     610             :     M_80, M_80, M_70, M_70, M_70, M_70, M_70, M_70,
     611             :     M_80, M_80, M_70, M_70, M_70, M_70, M_70, M_70,
     612             : };
     613             : 
     614             : // 16x16
     615             : static const uint16_t MaskingMatrix16x16_Level1_4K[] = {
     616             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     617             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     618             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     619             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     620             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     621             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     622             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     623             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     624             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     625             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     626             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     627             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     628             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     629             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     630             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     631             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     632             : };
     633             : // 32x32
     634             : static const uint16_t MaskingMatrix32x32_Level1_4K[] = {
     635             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     636             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     637             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     638             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     639             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     640             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     641             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     642             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     643             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     644             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     645             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     646             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     647             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     648             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     649             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     650             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     651             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     652             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     653             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     654             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     655             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     656             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     657             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     658             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     659             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     660             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     661             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     662             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     663             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     664             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     665             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     666             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     667             : };
     668             : 
      669             : // 1080p
     670             : 
     671             : // 4x4
     672             : static const uint16_t MaskingMatrix4x4_Level1_1080p[] = {
     673             :     M_100, M_100, M_100, M_100,
     674             :     M_100, M_100, M_100, M_100,
     675             :     M_100, M_100, M_100, M_100,
     676             :     M_100, M_100, M_100, M_100
     677             : };
     678             : // 8x8
     679             : static const uint16_t MaskingMatrix8x8_Level1_1080p[] = {
     680             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     681             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     682             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     683             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     684             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     685             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     686             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     687             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     688             : };
     689             : // 16x16
     690             : static const uint16_t MaskingMatrix16x16_Level1_1080p[] = {
     691             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     692             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     693             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     694             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     695             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     696             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     697             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     698             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     699             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     700             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     701             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     702             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     703             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     704             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     705             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     706             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     707             : };
     708             : // 32x32
     709             : static const uint16_t MaskingMatrix32x32_Level1_1080p[] = {
     710             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     711             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     712             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     713             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     714             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     715             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     716             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     717             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     718             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     719             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     720             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     721             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     722             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     723             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     724             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     725             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     726             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     727             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     728             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     729             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     730             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     731             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     732             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     733             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     734             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     735             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     736             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     737             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     738             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     739             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     740             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     741             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     742             : };
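
// Editor's note: the masking tables above (and those that follow) are plain row-major
// arrays of percentage weights, one entry per transform-coefficient position; M_100
// keeps a coefficient untouched while M_0 discards it. The encoder routine that
// consumes these tables is outside this excerpt, so the sketch below is only a minimal
// illustration, assuming each M_xx macro expands to a plain percentage
// (M_100 == 100, M_0 == 0). apply_masking_matrix is a hypothetical helper, not an
// SVT-AV1 function.

/* Sketch: scale a size-by-size block of quantized coefficients by a percentage
 * masking matrix. coeff_stride is given in int16_t elements. */
static void apply_masking_matrix(int16_t *coeff, uint32_t coeff_stride,
                                 const uint16_t *mask, uint32_t size) {
    for (uint32_t row = 0; row < size; ++row) {
        for (uint32_t col = 0; col < size; ++col) {
            const int32_t weight = mask[row * size + col];      /* 0..100 */
            const int32_t value  = coeff[row * coeff_stride + col];
            /* Scale the coefficient; M_0 zeroes it out entirely. */
            coeff[row * coeff_stride + col] = (int16_t)((value * weight) / 100);
        }
    }
}
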
     743             : 
     744             : // Level2
     745             : // 4K
     746             : // 4x4
     747             : static const uint16_t MaskingMatrix4x4_Level2_4K[] = {
     748             :     M_100, M_100, M_0, M_0,
     749             :     M_100, M_100, M_0, M_0,
     750             :     M_0, M_0, M_0, M_0,
     751             :     M_0, M_0, M_0, M_0,
     752             : };
     753             : // 8x8
     754             : static const uint16_t MaskingMatrix8x8_Level2_4K[] = {
     755             :     M_100, M_100, M_100, M_100, M_0, M_0, M_0, M_0,
     756             :     M_100, M_100, M_100, M_100, M_0, M_0, M_0, M_0,
     757             :     M_100, M_100, M_100, M_100, M_0, M_0, M_0, M_0,
     758             :     M_100, M_100, M_100, M_100, M_0, M_0, M_0, M_0,
     759             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     760             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     761             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     762             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     763             : };
     764             : // 16x16
     765             : static const uint16_t MaskingMatrix16x16_Level2_4K[] = {
     766             :     M_100, M_100, M_100, M_100, M_90, M_90, M_90, M_90, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     767             :     M_100, M_100, M_100, M_100, M_90, M_90, M_90, M_90, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     768             :     M_100, M_100, M_100, M_100, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     769             :     M_100, M_100, M_100, M_100, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     770             :     M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     771             :     M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     772             :     M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     773             :     M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     774             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     775             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     776             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     777             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     778             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     779             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     780             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     781             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     782             : };
     783             : // 32x32
     784             : static const uint16_t MaskingMatrix32x32_Level2_4K[] = {
     785             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_90, M_90, M_90, M_90, M_90, M_90, M_90, M_90, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     786             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_90, M_90, M_90, M_90, M_90, M_90, M_90, M_90, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     787             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_90, M_90, M_90, M_90, M_90, M_90, M_90, M_90, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     788             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_90, M_90, M_90, M_90, M_90, M_90, M_90, M_90, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     789             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     790             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     791             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     792             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     793             :     M_90, M_90, M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     794             :     M_90, M_90, M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     795             :     M_90, M_90, M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     796             :     M_90, M_90, M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     797             :     M_90, M_90, M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     798             :     M_90, M_90, M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     799             :     M_90, M_90, M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     800             :     M_90, M_90, M_90, M_90, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_80, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     801             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     802             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     803             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     804             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     805             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     806             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     807             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     808             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     809             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     810             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     811             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     812             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     813             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     814             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     815             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     816             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
     817             : };
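
// Editor's note: in the 4K Level-2 tables only the low-frequency corner carries
// nonzero weights. In MaskingMatrix32x32_Level2_4K the top-left 16x16 region is the
// only part above zero, so 256 of the 1024 positions survive the mask. A small sketch,
// assuming M_0 expands to 0, that counts the surviving positions of any table:

/* Sketch: count the mask entries that keep a coefficient alive (weight > 0). */
static uint32_t count_active_mask_entries(const uint16_t *mask, uint32_t size) {
    uint32_t active = 0;
    for (uint32_t i = 0; i < size * size; ++i)
        if (mask[i] > 0)   /* assumes M_0 expands to 0 */
            ++active;
    return active;
}

// Under that assumption, count_active_mask_entries(MaskingMatrix32x32_Level2_4K, 32)
// returns 256.
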
     818             : 
      819             : // 1080p
     820             : // 4x4
     821             : static const uint16_t MaskingMatrix4x4_Level2_1080p[] = {
     822             :     M_100, M_100, M_100, M_100,
     823             :     M_100, M_100, M_100, M_100,
     824             :     M_100, M_100, M_100, M_100,
     825             :     M_100, M_100, M_100, M_100
     826             : };
     827             : // 8x8
     828             : static const uint16_t MaskingMatrix8x8_Level2_1080p[] = {
     829             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     830             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     831             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     832             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     833             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     834             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     835             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     836             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     837             : };
     838             : // 16x16
     839             : static const uint16_t MaskingMatrix16x16_Level2_1080p[] = {
     840             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     841             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     842             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     843             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     844             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     845             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     846             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     847             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     848             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     849             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     850             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     851             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     852             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     853             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     854             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     855             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     856             : };
     857             : // 32x32
     858             : static const uint16_t MaskingMatrix32x32_Level2_1080p[] = {
     859             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     860             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     861             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     862             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     863             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     864             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     865             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     866             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     867             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     868             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     869             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     870             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     871             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     872             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     873             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     874             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     875             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     876             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     877             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     878             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     879             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     880             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     881             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     882             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     883             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     884             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     885             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     886             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     887             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     888             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     889             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100,
     890             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100
     891             : };
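
// Editor's note: the Level-2 group provides one table per transform size for each of
// the two resolution classes. How the encoder chooses among them is not part of this
// excerpt; the sketch below is only one plausible lookup, with is_4k and
// pick_level2_masking_matrix as hypothetical names.

/* Sketch: select a Level-2 masking table by resolution class and transform size. */
static const uint16_t *pick_level2_masking_matrix(int is_4k, uint32_t tx_size) {
    if (is_4k) {
        switch (tx_size) {
        case 4:  return MaskingMatrix4x4_Level2_4K;
        case 8:  return MaskingMatrix8x8_Level2_4K;
        case 16: return MaskingMatrix16x16_Level2_4K;
        default: return MaskingMatrix32x32_Level2_4K;
        }
    }
    switch (tx_size) {
    case 4:  return MaskingMatrix4x4_Level2_1080p;
    case 8:  return MaskingMatrix8x8_Level2_1080p;
    case 16: return MaskingMatrix16x16_Level2_1080p;
    default: return MaskingMatrix32x32_Level2_1080p;
    }
}
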
     892             : // Level3
     893             : // 4K
     894             : // 4x4
     895             : static const uint16_t MaskingMatrix4x4_Level3_4K[] = {
     896             :     M_100, M_90, M_0, M_0,
     897             :     M_90, M_90, M_0, M_0,
     898             :     M_0, M_0, M_0, M_0,
     899             :     M_0, M_0, M_0, M_0,
     900             : };
     901             : 
     902             : // 4x4
     903             : static const uint16_t MaskingMatrix4x4_Level3_1080p[] = {
     904             :     M_100, M_100, M_50, M_50,
     905             :     M_100, M_100, M_50, M_50,
     906             :     M_50, M_50, M_50, M_50,
     907             :     M_50, M_50, M_50, M_50,
     908             : };
     909             : // 8x8
     910             : static const uint16_t MaskingMatrix8x8_Level3_1080p[] = {
     911             :     M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60,
     912             :     M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60,
     913             :     M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50,
     914             :     M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50,
     915             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50,
     916             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50,
     917             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50,
     918             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50,
     919             : };
     920             : // 16x16
     921             : static const uint16_t MaskingMatrix16x16_Level3_1080p[] = {
     922             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     923             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     924             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     925             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     926             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     927             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     928             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     929             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     930             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     931             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     932             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     933             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     934             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     935             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     936             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     937             :     M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     938             : };
     939             : // 32x32
     940             : static const uint16_t MaskingMatrix32x32_Level3_1080p[] = {
     941             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     942             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     943             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     944             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     945             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     946             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     947             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     948             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60,
     949             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     950             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     951             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     952             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     953             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     954             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     955             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     956             :     M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_100, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     957             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     958             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     959             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     960             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     961             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     962             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     963             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     964             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     965             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     966             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     967             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     968             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     969             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     970             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     971             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     972             :     M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50, M_50,
     973             : };
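
// Editor's note: MaskingMatrix32x32_Level3_1080p above follows a simple quadrant
// pattern: full weight in the low-frequency 16x16 corner, 60% along the first eight
// rows/columns of the high-frequency bands, and 50% everywhere else. The generator
// below is purely illustrative and only reproduces that layout; the encoder uses the
// literal table, not this function.

/* Sketch: regenerate the Level-3 1080p 32x32 quadrant layout into a caller buffer. */
static void build_level3_1080p_32x32(uint16_t mask[32 * 32]) {
    for (uint32_t r = 0; r < 32; ++r) {
        for (uint32_t c = 0; c < 32; ++c) {
            uint16_t m;
            if (r < 16 && c < 16) m = M_100;  /* low-frequency 16x16 corner */
            else if (r < 8)       m = M_60;   /* rows 0-7, cols 16-31 */
            else if (r < 16)      m = M_50;   /* rows 8-15, cols 16-31 */
            else if (c < 8)       m = M_60;   /* rows 16-31, cols 0-7 */
            else                  m = M_50;   /* rows 16-31, cols 8-31 */
            mask[r * 32 + c] = m;
        }
    }
}
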
     974             : 
     975             : // Set 1
     976             : // 4x4
     977             : static const uint16_t MaskingMatrix4x4_Level4_4K_Set1[] = {
     978             :     M_100, M_60, M_0, M_0,
     979             :     M_60, M_60, M_0, M_0,
     980             :     M_0, M_0, M_0, M_0,
     981             :     M_0, M_0, M_0, M_0,
     982             : };
     983             : 
     984             : // Set 1
     985             : // 4x4
     986             : static const uint16_t MaskingMatrix4x4_Level5_4K_Set1[] = {
     987             :     M_100, M_50, M_0, M_0,
     988             :     M_50, M_50, M_0, M_0,
     989             :     M_0, M_0, M_0, M_0,
     990             :     M_0, M_0, M_0, M_0,
     991             : };
     992             : // 8x8
     993             : 
     994             : // Set 1
     995             : // 4x4
     996             : static const uint16_t MaskingMatrix4x4_Level6_4K_Set1[] = {
     997             :     M_100, M_25, M_0, M_0,
     998             :     M_25, M_25, M_0, M_0,
     999             :     M_0, M_0, M_0, M_0,
    1000             :     M_0, M_0, M_0, M_0,
    1001             : };
    1002             : // 8x8
    1003             : static const uint16_t MaskingMatrix8x8_Level6_4K_Set1[] = {
    1004             :     M_100, M_25, M_25, M_25, M_0, M_0, M_0, M_0,
    1005             :     M_25, M_25, M_25, M_25, M_0, M_0, M_0, M_0,
    1006             :     M_25, M_25, M_25, M_25, M_0, M_0, M_0, M_0,
    1007             :     M_25, M_25, M_25, M_25, M_0, M_0, M_0, M_0,
    1008             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1009             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1010             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1011             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1012             : };
    1013             : 
    1014             : // Set 2
    1015             : // 16x16
    1016             : static const uint16_t MaskingMatrix16x16_Level6_4K_Set2[] = {
    1017             :     M_100, M_80, M_80, M_80, M_40, M_40, M_40, M_40, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1018             :     M_80, M_80, M_80, M_80, M_40, M_40, M_40, M_40, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1019             :     M_80, M_80, M_80, M_80, M_25, M_25, M_25, M_25, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1020             :     M_80, M_80, M_80, M_80, M_25, M_25, M_25, M_25, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1021             :     M_40, M_40, M_25, M_25, M_25, M_25, M_25, M_25, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1022             :     M_40, M_40, M_25, M_25, M_25, M_25, M_25, M_25, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1023             :     M_40, M_40, M_25, M_25, M_25, M_25, M_25, M_25, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1024             :     M_40, M_40, M_25, M_25, M_25, M_25, M_25, M_25, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1025             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1026             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1027             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1028             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1029             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1030             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1031             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1032             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1033             : };
    1034             : // 32x32
    1035             : static const uint16_t MaskingMatrix32x32_Level6_4K_Set2[] = {
    1036             :     M_100, M_90, M_90, M_90, M_60, M_60, M_60, M_60, M_20, M_20, M_20, M_20, M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1037             :     M_90, M_90, M_90, M_90, M_60, M_60, M_60, M_60, M_20, M_20, M_20, M_20, M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1038             :     M_90, M_90, M_90, M_90, M_50, M_50, M_50, M_50, M_20, M_20, M_20, M_20, M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1039             :     M_90, M_90, M_90, M_90, M_50, M_50, M_50, M_50, M_20, M_20, M_20, M_20, M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1040             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1041             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1042             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1043             :     M_60, M_60, M_50, M_50, M_50, M_50, M_50, M_50, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1044             :     M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1045             :     M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1046             :     M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1047             :     M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1048             :     M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1049             :     M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1050             :     M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1051             :     M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1052             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1053             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1054             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1055             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1056             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1057             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1058             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1059             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1060             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1061             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1062             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1063             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1064             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1065             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1066             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1067             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1068             : };
    1069             : 
    1070             : // Set 1
    1071             : // 4x4
    1072             : static const uint16_t MaskingMatrix4x4_Level7_4K_Set1[] = {
    1073             :     M_100, M_0, M_0, M_0,
    1074             :     M_0, M_0, M_0, M_0,
    1075             :     M_0, M_0, M_0, M_0,
    1076             :     M_0, M_0, M_0, M_0,
    1077             : };
    1078             : 
    1079             : // Set 2
    1080             : // 16x16
    1081             : // 32x32
    1082             : static const uint16_t MaskingMatrix32x32_Level7_4K_Set2[] = {
    1083             :     M_100, M_100, M_100, M_100, M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1084             :     M_100, M_100, M_100, M_100, M_20, M_20, M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1085             :     M_100, M_100, M_100, M_100, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1086             :     M_100, M_100, M_100, M_100, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1087             :     M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1088             :     M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1089             :     M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1090             :     M_20, M_20, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1091             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1092             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1093             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1094             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1095             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1096             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1097             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1098             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1099             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1100             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1101             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1102             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1103             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1104             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1105             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1106             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1107             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1108             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1109             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1110             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1111             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1112             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1113             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1114             :     M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0, M_0,
    1115             : };
    1116             : 
    1117             : static const uint16_t *masking_matrix[2][8][4] =
    1118             : {
    1119             :     /****************** 4K ************************/
    1120             :     {
    1121             :         { MaskingMatrix4x4_Level0_4K, MaskingMatrix8x8_Level0_4K, MaskingMatrix16x16_Level0_4K, MaskingMatrix32x32_Level0_4K }, // Level 0 OFF
    1122             :         { MaskingMatrix4x4_Level1_4K, MaskingMatrix8x8_Level1_4K, MaskingMatrix16x16_Level1_4K, MaskingMatrix32x32_Level1_4K }, // Level 1 I_SLICE
    1123             :         /*************************  L23_SETTING *************************/
    1124             :         { MaskingMatrix4x4_Level2_4K, MaskingMatrix8x8_Level2_4K, MaskingMatrix16x16_Level2_4K, MaskingMatrix32x32_Level2_4K }, // Level 2 Base Intra
    1125             :         { MaskingMatrix4x4_Level3_4K, MaskingMatrix8x8_Level1_MOD1_4K, MaskingMatrix16x16_Level1_4K, MaskingMatrix32x32_Level1_4K }, // Level 3 Base Inter
    1126             :         /*************************  L45_SETTING *************************/
    1127             :         { MaskingMatrix4x4_Level4_4K_Set1, MaskingMatrix8x8_Level2_4K, MaskingMatrix16x16_Level2_4K, MaskingMatrix32x32_Level2_4K }, // Level 4 Ref Intra
    1128             :         { MaskingMatrix4x4_Level5_4K_Set1, MaskingMatrix8x8_Level1_MOD1_4K, MaskingMatrix16x16_Level1_4K, MaskingMatrix32x32_Level1_4K }, // Level 5 Ref Inter
    1129             :         /*************************  L67_SETTING *************************/
    1130             :         { MaskingMatrix4x4_Level6_4K_Set1, MaskingMatrix8x8_Level6_4K_Set1, MaskingMatrix16x16_Level6_4K_Set2, MaskingMatrix32x32_Level6_4K_Set2 }, // Level 6 Non Ref Intra
    1131             :         { MaskingMatrix4x4_Level7_4K_Set1, MaskingMatrix8x8_Level1_MOD1_4K, MaskingMatrix16x16_Level1_4K, MaskingMatrix32x32_Level7_4K_Set2 }  // Level 7 Non Ref Inter
    1132             :     },
    1133             :     /****************** 1080P ************************/
    1134             :     {
    1135             :         { MaskingMatrix4x4_Level0_1080p, MaskingMatrix8x8_Level0_1080p, MaskingMatrix16x16_Level0_1080p, MaskingMatrix32x32_Level0_1080p }, // Level 0 OFF
    1136             :         { MaskingMatrix4x4_Level1_1080p, MaskingMatrix8x8_Level1_1080p, MaskingMatrix16x16_Level1_1080p, MaskingMatrix32x32_Level1_1080p }, // Level 1 I_SLICE
    1137             :         /*************************  L23_SETTING *************************/
    1138             :         { MaskingMatrix4x4_Level2_1080p, MaskingMatrix8x8_Level2_1080p, MaskingMatrix16x16_Level2_1080p, MaskingMatrix32x32_Level2_1080p }, // Level 2 Base Intra
    1139             :         { MaskingMatrix4x4_Level2_1080p, MaskingMatrix8x8_Level2_1080p, MaskingMatrix16x16_Level2_1080p, MaskingMatrix32x32_Level2_1080p }, // Level 3 Base Inter
    1140             :         /*************************  L45_SETTING *************************/
    1141             :         { MaskingMatrix4x4_Level2_1080p, MaskingMatrix8x8_Level2_1080p, MaskingMatrix16x16_Level2_1080p, MaskingMatrix32x32_Level2_1080p }, // Level 4 Ref Intra
    1142             :         { MaskingMatrix4x4_Level3_1080p, MaskingMatrix8x8_Level3_1080p, MaskingMatrix16x16_Level3_1080p, MaskingMatrix32x32_Level3_1080p }, // Level 5 Ref Inter
    1143             :         /*************************  L67_SETTING *************************/
    1144             :         { MaskingMatrix4x4_Level3_1080p, MaskingMatrix8x8_Level3_1080p, MaskingMatrix16x16_Level3_1080p, MaskingMatrix32x32_Level3_1080p }, // Level 6 Non Ref Intra
    1145             :         { MaskingMatrix4x4_Level3_1080p, MaskingMatrix8x8_Level3_1080p, MaskingMatrix16x16_Level3_1080p, MaskingMatrix32x32_Level3_1080p }, // Level 7 Non Ref Inter
    1146             :     },
    1147             : };
    1148             : 
    1149           0 : void mat_mult(
    1150             :     int16_t           *coeff,
    1151             :     const uint32_t     coeff_stride,
    1152             :     const uint16_t    *masking_matrix,
    1153             :     const uint32_t     masking_matrix_stride,
    1154             :     const uint32_t     compute_size,
    1155             :     const int32_t      offset,
    1156             :     const int32_t      shift_num,
    1157             :     uint32_t          *nonzerocoeff) {
    1158           0 :     uint32_t coeffLocation = 0;
    1159             :     uint32_t row_index, colIndex;
    1160             :     int32_t coeffTemp;
    1161             : 
    1162           0 :     *nonzerocoeff = 0;
    1163             : 
    1164           0 :     for (row_index = 0; row_index < compute_size; ++row_index) {
    1165           0 :         for (colIndex = 0; colIndex < compute_size; ++colIndex) {
    1166           0 :             coeffTemp = (ABS(coeff[coeffLocation]) * masking_matrix[colIndex + row_index * masking_matrix_stride] + offset) >> shift_num;
    1167           0 :             coeffTemp = (coeff[coeffLocation] < 0) ? -coeffTemp : coeffTemp;
    1168             : 
    1169           0 :             coeff[coeffLocation] = (int16_t)CLIP3(MIN_NEG_16BIT_NUM, MAX_POS_16BIT_NUM, coeffTemp);
    1170           0 :             (*nonzerocoeff) += (coeffTemp != 0);
    1171           0 :             ++coeffLocation;
    1172             :         }
    1173           0 :         coeffLocation += coeff_stride - compute_size;
    1174             :     }
    1175           0 : }
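
The masking_matrix table above is indexed as [resolution class][masking level][block-size index] (0 = 4K, 1 = 1080P; levels 0-7 as annotated in the row comments; sizes 4x4, 8x8, 16x16, 32x32 in order), and mat_mult applies the selected matrix entry by entry: each coefficient is scaled by its mask value, rounded via offset/shift_num, sign-restored, clipped to 16 bits, and counted if it stays nonzero. The sketch below is a hypothetical caller inside this translation unit, not code from the encoder; the index meanings follow the table's comments, while the stride, offset and shift_num values are placeholders.

    /* Hypothetical usage sketch (not part of this file): mask an 8x8 block of
     * coefficients with the 4K, level-3 (Base Inter) matrix.  The offset and
     * shift_num below are placeholders; the real values depend on how the M_*
     * scale factors are defined earlier in this file. */
    static void apply_masking_sketch(int16_t *coeff, uint32_t coeff_stride) {
        const uint32_t resolution = 0;  /* 0 = 4K table, 1 = 1080P table     */
        const uint32_t level      = 3;  /* "Level 3 Base Inter" in the table */
        const uint32_t size_idx   = 1;  /* 0:4x4  1:8x8  2:16x16  3:32x32    */
        uint32_t nonzero = 0;

        mat_mult(coeff,
                 coeff_stride,
                 masking_matrix[resolution][level][size_idx],
                 8,                     /* assumed matrix stride = block width */
                 8,                     /* compute_size: full 8x8 block        */
                 32,                    /* placeholder rounding offset         */
                 6,                     /* placeholder shift_num               */
                 &nonzero);
        (void)nonzero;                  /* count of coefficients left nonzero  */
    }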
    1176             : 
    1177     1916660 : void eb_av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
    1178             :     const Txfm2DFlipCfg *cfg, int32_t bd) {
    1179             :     // Take the shift from the larger dimension in the rectangular case.
    1180     1916660 :     const int8_t *shift = cfg->shift;
    1181             :     // Bounding i by MAX_TXFM_STAGE_NUM mutes the compiler's above-array-bounds warning
    1182    17249700 :     for (int32_t i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i)
    1183    15333100 :         stage_range_col[i] = (int8_t)(cfg->stage_range_col[i] + shift[0] + bd + 1);
    1184             :     // Bounding i by MAX_TXFM_STAGE_NUM mutes the compiler's above-array-bounds warning
    1185    21082900 :     for (int32_t i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i)
    1186    19166300 :         stage_range_row[i] = (int8_t)(cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1);
    1187     1916660 : }
    1188             : 
    1189             : typedef void(*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit,
    1190             :     const int8_t *stage_range);
    1191             : 
    1192             : #define range_check(stage, input, buf, size, bit) \
    1193             :   {                                               \
    1194             :     (void)stage;                                  \
    1195             :     (void)input;                                  \
    1196             :     (void)buf;                                    \
    1197             :     (void)size;                                   \
    1198             :     (void)bit;                                    \
    1199             :   }
    1200             : 
    1201             : // av1_cospi_arr[i][j] = (int32_t)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i)));
    1202             : const int32_t eb_av1_cospi_arr_data[7][64] = {
    1203             :     { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
    1204             :     972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837,
    1205             :     822, 807, 792, 775, 759, 742, 724, 706, 688, 669, 650, 630, 610,
    1206             :     590, 569, 548, 526, 505, 483, 460, 438, 415, 392, 369, 345, 321,
    1207             :     297, 273, 249, 224, 200, 175, 150, 125, 100, 75, 50, 25 },
    1208             :     { 2048, 2047, 2046, 2042, 2038, 2033, 2026, 2018, 2009, 1998, 1987,
    1209             :     1974, 1960, 1945, 1928, 1911, 1892, 1872, 1851, 1829, 1806, 1782,
    1210             :     1757, 1730, 1703, 1674, 1645, 1615, 1583, 1551, 1517, 1483, 1448,
    1211             :     1412, 1375, 1338, 1299, 1260, 1220, 1179, 1138, 1096, 1053, 1009,
    1212             :     965, 921, 876, 830, 784, 737, 690, 642, 595, 546, 498,
    1213             :     449, 400, 350, 301, 251, 201, 151, 100, 50 },
    1214             :     { 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
    1215             :     3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
    1216             :     3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
    1217             :     2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
    1218             :     1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
    1219             :     897, 799, 700, 601, 501, 401, 301, 201, 101 },
    1220             :     { 8192, 8190, 8182, 8170, 8153, 8130, 8103, 8071, 8035, 7993, 7946,
    1221             :     7895, 7839, 7779, 7713, 7643, 7568, 7489, 7405, 7317, 7225, 7128,
    1222             :     7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793,
    1223             :     5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038,
    1224             :     3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990,
    1225             :     1795, 1598, 1401, 1202, 1003, 803, 603, 402, 201 },
    1226             :     { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143, 16069, 15986, 15893,
    1227             :     15791, 15679, 15557, 15426, 15286, 15137, 14978, 14811, 14635, 14449, 14256,
    1228             :     14053, 13842, 13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866, 11585,
    1229             :     11297, 11003, 10702, 10394, 10080, 9760, 9434, 9102, 8765, 8423, 8076,
    1230             :     7723, 7366, 7005, 6639, 6270, 5897, 5520, 5139, 4756, 4370, 3981,
    1231             :     3590, 3196, 2801, 2404, 2006, 1606, 1205, 804, 402 },
    1232             :     { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286, 32138, 31972, 31786,
    1233             :     31581, 31357, 31114, 30853, 30572, 30274, 29957, 29622, 29269, 28899, 28511,
    1234             :     28106, 27684, 27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732, 23170,
    1235             :     22595, 22006, 21403, 20788, 20160, 19520, 18868, 18205, 17531, 16846, 16151,
    1236             :     15447, 14733, 14010, 13279, 12540, 11793, 11039, 10279, 9512, 8740, 7962,
    1237             :     7180, 6393, 5602, 4808, 4011, 3212, 2411, 1608, 804 },
    1238             :     { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571, 64277, 63944, 63572,
    1239             :     63162, 62714, 62228, 61705, 61145, 60547, 59914, 59244, 58538, 57798, 57022,
    1240             :     56212, 55368, 54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464, 46341,
    1241             :     45190, 44011, 42806, 41576, 40320, 39040, 37736, 36410, 35062, 33692, 32303,
    1242             :     30893, 29466, 28020, 26558, 25080, 23586, 22078, 20557, 19024, 17479, 15924,
    1243             :     14359, 12785, 11204, 9616, 8022, 6424, 4821, 3216, 1608 }
    1244             : };
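
The constants above follow the generating formula quoted in the comment; the row-0 peak of 1024 implies cos_bit_min == 10, so row i holds cos(pi*j/128) scaled by 2^(10+i). A minimal self-check sketch, assuming <math.h> is available and cos_bit_min is indeed 10:

    #include <math.h>   /* cos(), round(); M_PI may need _USE_MATH_DEFINES on MSVC */

    /* Recompute row i of eb_av1_cospi_arr_data from the formula in the comment
     * above and compare against the stored constants.  Returns 1 on a match. */
    static int check_cospi_row(int i) {
        const int cos_bit_min = 10;     /* assumed: row 0 peaks at 1 << 10 == 1024 */
        for (int j = 0; j < 64; j++) {
            const int32_t expected =
                (int32_t)round(cos(M_PI * j / 128) * (1 << (cos_bit_min + i)));
            if (expected != eb_av1_cospi_arr_data[i][j])
                return 0;               /* table and formula disagree */
        }
        return 1;                       /* row i matches the generating formula */
    }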
    1245  5360390000 : static INLINE int32_t round_shift(int64_t value, int32_t bit) {
    1246  5360390000 :     assert(bit >= 1);
    1247  5360390000 :     return (int32_t)((value + (1ll << (bit - 1))) >> bit);
    1248             : }
    1249  3506400000 : static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
    1250             :     int32_t bit) {
    1251  3506400000 :     int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1);
    1252             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    1253             :     assert(result_64 >= INT32_MIN && result_64 <= INT32_MAX);
    1254             : #endif
    1255  3506400000 :     return round_shift(result_64, bit);
    1256             : }
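
half_btf is the half-butterfly primitive used throughout the transforms below: it forms w0*in0 + w1*in1 in 64 bits and rounds the result back down by cos_bit, with the weights drawn from the cospi table so that a pair of calls realizes one fixed-point rotation stage (the optional CONFIG_COEFFICIENT_RANGE_CHECKING assert checks that the 64-bit sum still fits in 32 bits). Below is an illustrative sketch of the pi/4 butterfly that stage 2 of eb_av1_fdct4_new applies to its even pair, under the assumption cos_bit = 12.

    /* Illustrative sketch only: the pi/4 butterfly from stage 2 of
     * eb_av1_fdct4_new.  cospi[32] == round(cos(pi/4) * (1 << cos_bit)), so
     * out0 ~= (in0 + in1) * cos(pi/4) and out1 ~= (in0 - in1) * cos(pi/4),
     * each rounded by round_shift() inside half_btf(). */
    static void pi4_butterfly_sketch(int32_t in0, int32_t in1,
                                     int32_t *out0, int32_t *out1) {
        const int8_t   cos_bit = 12;            /* assumed working precision */
        const int32_t *cospi   = cospi_arr(cos_bit);
        *out0 = half_btf(cospi[32],  in0, cospi[32], in1, cos_bit);
        *out1 = half_btf(-cospi[32], in1, cospi[32], in0, cos_bit);
    }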
    1257             : 
    1258             : // eb_av1_sinpi_arr_data[i][j] = (int32_t)round((sqrt(2) * sin(j*Pi/9) * 2 / 3) * (1
    1259             : // << (cos_bit_min + i))) modified so that elements j=1,2 sum to element j=4.
    1260             : const int32_t eb_av1_sinpi_arr_data[7][5] = {
    1261             :     { 0, 330, 621, 836, 951 }, { 0, 660, 1241, 1672, 1901 },
    1262             :     { 0, 1321, 2482, 3344, 3803 }, { 0, 2642, 4964, 6689, 7606 },
    1263             :     { 0, 5283, 9929, 13377, 15212 }, { 0, 10566, 19858, 26755, 30424 },
    1264             :     { 0, 21133, 39716, 53510, 60849 }
    1265             : };
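
The "modified" in the comment preserves the exact ADST identity sin(pi/9) + sin(2*pi/9) = sin(4*pi/9) after integerization, so the j = 1 and j = 2 entries of every precision row sum exactly to the j = 4 entry (e.g. 330 + 621 = 951 in row 0). A quick sanity sketch:

    /* Verify the invariant stated in the comment: for each precision row,
     * sinpi[1] + sinpi[2] == sinpi[4].  Illustrative sketch only. */
    static int check_sinpi_invariant(void) {
        for (int i = 0; i < 7; i++) {
            const int32_t *row = eb_av1_sinpi_arr_data[i];
            if (row[1] + row[2] != row[4])
                return 0;
        }
        return 1;
    }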
    1266             : 
    1267           0 : void eb_av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    1268             :     const int8_t *stage_range) {
    1269           0 :     const int32_t size = 4;
    1270             :     const int32_t *cospi;
    1271             : 
    1272           0 :     int32_t stage = 0;
    1273             :     int32_t *bf0, *bf1;
    1274             :     int32_t step[4];
    1275             : 
    1276             :     // stage 0;
    1277           0 :     range_check(stage, input, input, size, stage_range[stage]);
    1278             : 
    1279             :     // stage 1;
    1280           0 :     stage++;
    1281           0 :     bf1 = output;
    1282           0 :     bf1[0] = input[0] + input[3];
    1283           0 :     bf1[1] = input[1] + input[2];
    1284           0 :     bf1[2] = -input[2] + input[1];
    1285           0 :     bf1[3] = -input[3] + input[0];
    1286           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1287             : 
    1288             :     // stage 2
    1289           0 :     stage++;
    1290           0 :     cospi = cospi_arr(cos_bit);
    1291           0 :     bf0 = output;
    1292           0 :     bf1 = step;
    1293           0 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    1294           0 :     bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    1295           0 :     bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    1296           0 :     bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    1297           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1298             : 
    1299             :     // stage 3
    1300           0 :     stage++;
    1301           0 :     bf0 = step;
    1302           0 :     bf1 = output;
    1303           0 :     bf1[0] = bf0[0];
    1304           0 :     bf1[1] = bf0[2];
    1305           0 :     bf1[2] = bf0[1];
    1306           0 :     bf1[3] = bf0[3];
    1307           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1308           0 : }
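
The three working stages above are the standard 4-point DCT-II factorization: stage 1 forms the sum/difference pairs, stage 2 applies the pi/4 butterfly (cospi[32]) to the even pair and the cospi[48]/cospi[16] rotation to the odd pair, and stage 3 reorders the results into natural frequency order. Below is a hedged cross-check against the direct cosine sum, assuming cos_bit = 12; the stage_range values only feed the no-op range_check macro here, and a unit or so of rounding difference between the integer and floating-point results is expected.

    #include <math.h>
    #include <stdio.h>

    /* Illustrative sketch: compare eb_av1_fdct4_new with the direct DCT-II sum
     * it factorizes.  The DC output additionally carries the cos(pi/4) factor
     * that stage 2 folds in; the other outputs match the plain sum directly. */
    static void fdct4_reference_sketch(void) {
        const int32_t input[4]       = { 10, -3, 7, 25 };
        const int8_t  stage_range[4] = { 20, 20, 20, 20 };  /* only read by the
                                                                no-op range_check */
        int32_t out[4];

        eb_av1_fdct4_new(input, out, 12, stage_range);

        for (int k = 0; k < 4; k++) {
            double ref = 0.0;
            for (int n = 0; n < 4; n++)
                ref += input[n] * cos(M_PI * (2 * n + 1) * k / 8.0);
            if (k == 0)
                ref *= cos(M_PI / 4);   /* DC picks up the stage-2 1/sqrt(2) */
            printf("k=%d  integer=%d  reference=%.2f\n", k, out[k], ref);
        }
    }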
    1309             : 
    1310           0 : void eb_av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    1311             :     const int8_t *stage_range) {
    1312           0 :     const int32_t size = 8;
    1313             :     const int32_t *cospi;
    1314             : 
    1315           0 :     int32_t stage = 0;
    1316             :     int32_t *bf0, *bf1;
    1317             :     int32_t step[8];
    1318             : 
    1319             :     // stage 0;
    1320           0 :     range_check(stage, input, input, size, stage_range[stage]);
    1321             : 
    1322             :     // stage 1;
    1323           0 :     stage++;
    1324           0 :     bf1 = output;
    1325           0 :     bf1[0] = input[0] + input[7];
    1326           0 :     bf1[1] = input[1] + input[6];
    1327           0 :     bf1[2] = input[2] + input[5];
    1328           0 :     bf1[3] = input[3] + input[4];
    1329           0 :     bf1[4] = -input[4] + input[3];
    1330           0 :     bf1[5] = -input[5] + input[2];
    1331           0 :     bf1[6] = -input[6] + input[1];
    1332           0 :     bf1[7] = -input[7] + input[0];
    1333           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1334             : 
    1335             :     // stage 2
    1336           0 :     stage++;
    1337           0 :     cospi = cospi_arr(cos_bit);
    1338           0 :     bf0 = output;
    1339           0 :     bf1 = step;
    1340           0 :     bf1[0] = bf0[0] + bf0[3];
    1341           0 :     bf1[1] = bf0[1] + bf0[2];
    1342           0 :     bf1[2] = -bf0[2] + bf0[1];
    1343           0 :     bf1[3] = -bf0[3] + bf0[0];
    1344           0 :     bf1[4] = bf0[4];
    1345           0 :     bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    1346           0 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    1347           0 :     bf1[7] = bf0[7];
    1348           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1349             : 
    1350             :     // stage 3
    1351           0 :     stage++;
    1352           0 :     cospi = cospi_arr(cos_bit);
    1353           0 :     bf0 = step;
    1354           0 :     bf1 = output;
    1355           0 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    1356           0 :     bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    1357           0 :     bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    1358           0 :     bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    1359           0 :     bf1[4] = bf0[4] + bf0[5];
    1360           0 :     bf1[5] = -bf0[5] + bf0[4];
    1361           0 :     bf1[6] = -bf0[6] + bf0[7];
    1362           0 :     bf1[7] = bf0[7] + bf0[6];
    1363           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1364             : 
    1365             :     // stage 4
    1366           0 :     stage++;
    1367           0 :     cospi = cospi_arr(cos_bit);
    1368           0 :     bf0 = output;
    1369           0 :     bf1 = step;
    1370           0 :     bf1[0] = bf0[0];
    1371           0 :     bf1[1] = bf0[1];
    1372           0 :     bf1[2] = bf0[2];
    1373           0 :     bf1[3] = bf0[3];
    1374           0 :     bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    1375           0 :     bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    1376           0 :     bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    1377           0 :     bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    1378           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1379             : 
    1380             :     // stage 5
    1381           0 :     stage++;
    1382           0 :     bf0 = step;
    1383           0 :     bf1 = output;
    1384           0 :     bf1[0] = bf0[0];
    1385           0 :     bf1[1] = bf0[4];
    1386           0 :     bf1[2] = bf0[2];
    1387           0 :     bf1[3] = bf0[6];
    1388           0 :     bf1[4] = bf0[1];
    1389           0 :     bf1[5] = bf0[5];
    1390           0 :     bf1[6] = bf0[3];
    1391           0 :     bf1[7] = bf0[7];
    1392           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1393           0 : }
    1394             : 
    1395    61226300 : void eb_av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    1396             :     const int8_t *stage_range) {
    1397    61226300 :     const int32_t size = 16;
    1398             :     const int32_t *cospi;
    1399             : 
    1400    61226300 :     int32_t stage = 0;
    1401             :     int32_t *bf0, *bf1;
    1402             :     int32_t step[16];
    1403             : 
    1404             :     // stage 0;
    1405    61226300 :     range_check(stage, input, input, size, stage_range[stage]);
    1406             : 
    1407             :     // stage 1;
    1408    61226300 :     stage++;
    1409    61226300 :     bf1 = output;
    1410    61226300 :     bf1[0] = input[0] + input[15];
    1411    61226300 :     bf1[1] = input[1] + input[14];
    1412    61226300 :     bf1[2] = input[2] + input[13];
    1413    61226300 :     bf1[3] = input[3] + input[12];
    1414    61226300 :     bf1[4] = input[4] + input[11];
    1415    61226300 :     bf1[5] = input[5] + input[10];
    1416    61226300 :     bf1[6] = input[6] + input[9];
    1417    61226300 :     bf1[7] = input[7] + input[8];
    1418    61226300 :     bf1[8] = -input[8] + input[7];
    1419    61226300 :     bf1[9] = -input[9] + input[6];
    1420    61226300 :     bf1[10] = -input[10] + input[5];
    1421    61226300 :     bf1[11] = -input[11] + input[4];
    1422    61226300 :     bf1[12] = -input[12] + input[3];
    1423    61226300 :     bf1[13] = -input[13] + input[2];
    1424    61226300 :     bf1[14] = -input[14] + input[1];
    1425    61226300 :     bf1[15] = -input[15] + input[0];
    1426    61226300 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1427             : 
    1428             :     // stage 2
    1429    61226300 :     stage++;
    1430    61226300 :     cospi = cospi_arr(cos_bit);
    1431    61252600 :     bf0 = output;
    1432    61252600 :     bf1 = step;
    1433    61252600 :     bf1[0] = bf0[0] + bf0[7];
    1434    61252600 :     bf1[1] = bf0[1] + bf0[6];
    1435    61252600 :     bf1[2] = bf0[2] + bf0[5];
    1436    61252600 :     bf1[3] = bf0[3] + bf0[4];
    1437    61252600 :     bf1[4] = -bf0[4] + bf0[3];
    1438    61252600 :     bf1[5] = -bf0[5] + bf0[2];
    1439    61252600 :     bf1[6] = -bf0[6] + bf0[1];
    1440    61252600 :     bf1[7] = -bf0[7] + bf0[0];
    1441    61252600 :     bf1[8] = bf0[8];
    1442    61252600 :     bf1[9] = bf0[9];
    1443    61252600 :     bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    1444    61244200 :     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    1445    61219000 :     bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    1446    61218500 :     bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    1447    61206300 :     bf1[14] = bf0[14];
    1448    61206300 :     bf1[15] = bf0[15];
    1449    61206300 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1450             : 
    1451             :     // stage 3
    1452    61206300 :     stage++;
    1453    61206300 :     cospi = cospi_arr(cos_bit);
    1454    61207100 :     bf0 = step;
    1455    61207100 :     bf1 = output;
    1456    61207100 :     bf1[0] = bf0[0] + bf0[3];
    1457    61207100 :     bf1[1] = bf0[1] + bf0[2];
    1458    61207100 :     bf1[2] = -bf0[2] + bf0[1];
    1459    61207100 :     bf1[3] = -bf0[3] + bf0[0];
    1460    61207100 :     bf1[4] = bf0[4];
    1461    61207100 :     bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    1462    61195400 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    1463    61188200 :     bf1[7] = bf0[7];
    1464    61188200 :     bf1[8] = bf0[8] + bf0[11];
    1465    61188200 :     bf1[9] = bf0[9] + bf0[10];
    1466    61188200 :     bf1[10] = -bf0[10] + bf0[9];
    1467    61188200 :     bf1[11] = -bf0[11] + bf0[8];
    1468    61188200 :     bf1[12] = -bf0[12] + bf0[15];
    1469    61188200 :     bf1[13] = -bf0[13] + bf0[14];
    1470    61188200 :     bf1[14] = bf0[14] + bf0[13];
    1471    61188200 :     bf1[15] = bf0[15] + bf0[12];
    1472    61188200 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1473             : 
    1474             :     // stage 4
    1475    61188200 :     stage++;
    1476    61188200 :     cospi = cospi_arr(cos_bit);
    1477    61199900 :     bf0 = output;
    1478    61199900 :     bf1 = step;
    1479    61199900 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    1480    61197900 :     bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    1481    61190000 :     bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    1482    61183000 :     bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    1483    61185100 :     bf1[4] = bf0[4] + bf0[5];
    1484    61185100 :     bf1[5] = -bf0[5] + bf0[4];
    1485    61185100 :     bf1[6] = -bf0[6] + bf0[7];
    1486    61185100 :     bf1[7] = bf0[7] + bf0[6];
    1487    61185100 :     bf1[8] = bf0[8];
    1488    61185100 :     bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    1489    61194800 :     bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    1490    61183800 :     bf1[11] = bf0[11];
    1491    61183800 :     bf1[12] = bf0[12];
    1492    61183800 :     bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    1493    61191600 :     bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    1494    61189300 :     bf1[15] = bf0[15];
    1495    61189300 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1496             : 
    1497             :     // stage 5
    1498    61189300 :     stage++;
    1499    61189300 :     cospi = cospi_arr(cos_bit);
    1500    61193400 :     bf0 = step;
    1501    61193400 :     bf1 = output;
    1502    61193400 :     bf1[0] = bf0[0];
    1503    61193400 :     bf1[1] = bf0[1];
    1504    61193400 :     bf1[2] = bf0[2];
    1505    61193400 :     bf1[3] = bf0[3];
    1506    61193400 :     bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    1507    61203900 :     bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    1508    61202500 :     bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    1509    61217100 :     bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    1510    61209800 :     bf1[8] = bf0[8] + bf0[9];
    1511    61209800 :     bf1[9] = -bf0[9] + bf0[8];
    1512    61209800 :     bf1[10] = -bf0[10] + bf0[11];
    1513    61209800 :     bf1[11] = bf0[11] + bf0[10];
    1514    61209800 :     bf1[12] = bf0[12] + bf0[13];
    1515    61209800 :     bf1[13] = -bf0[13] + bf0[12];
    1516    61209800 :     bf1[14] = -bf0[14] + bf0[15];
    1517    61209800 :     bf1[15] = bf0[15] + bf0[14];
    1518    61209800 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1519             : 
    1520             :     // stage 6
    1521    61209800 :     stage++;
    1522    61209800 :     cospi = cospi_arr(cos_bit);
    1523    61220600 :     bf0 = output;
    1524    61220600 :     bf1 = step;
    1525    61220600 :     bf1[0] = bf0[0];
    1526    61220600 :     bf1[1] = bf0[1];
    1527    61220600 :     bf1[2] = bf0[2];
    1528    61220600 :     bf1[3] = bf0[3];
    1529    61220600 :     bf1[4] = bf0[4];
    1530    61220600 :     bf1[5] = bf0[5];
    1531    61220600 :     bf1[6] = bf0[6];
    1532    61220600 :     bf1[7] = bf0[7];
    1533    61220600 :     bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    1534    61215800 :     bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
    1535    61205400 :     bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    1536    61208400 :     bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
    1537    61204400 :     bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    1538    61204400 :     bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
    1539    61203800 :     bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
    1540    61207200 :     bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
    1541    61209900 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1542             : 
    1543             :     // stage 7
    1544    61209900 :     stage++;
    1545    61209900 :     bf0 = step;
    1546    61209900 :     bf1 = output;
    1547    61209900 :     bf1[0] = bf0[0];
    1548    61209900 :     bf1[1] = bf0[8];
    1549    61209900 :     bf1[2] = bf0[4];
    1550    61209900 :     bf1[3] = bf0[12];
    1551    61209900 :     bf1[4] = bf0[2];
    1552    61209900 :     bf1[5] = bf0[10];
    1553    61209900 :     bf1[6] = bf0[6];
    1554    61209900 :     bf1[7] = bf0[14];
    1555    61209900 :     bf1[8] = bf0[1];
    1556    61209900 :     bf1[9] = bf0[9];
    1557    61209900 :     bf1[10] = bf0[5];
    1558    61209900 :     bf1[11] = bf0[13];
    1559    61209900 :     bf1[12] = bf0[3];
    1560    61209900 :     bf1[13] = bf0[11];
    1561    61209900 :     bf1[14] = bf0[7];
    1562    61209900 :     bf1[15] = bf0[15];
    1563    61209900 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1564    61209900 : }
    1565             : 
    1566    30621900 : void eb_av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    1567             :     const int8_t *stage_range) {
    1568    30621900 :     const int32_t size = 32;
    1569             :     const int32_t *cospi;
    1570             : 
    1571    30621900 :     int32_t stage = 0;
    1572             :     int32_t *bf0, *bf1;
    1573             :     int32_t step[32];
    1574             : 
    1575             :     // stage 0;
    1576    30621900 :     range_check(stage, input, input, size, stage_range[stage]);
    1577             : 
    1578             :     // stage 1;
    1579    30621900 :     stage++;
    1580    30621900 :     bf1 = output;
    1581    30621900 :     bf1[0] = input[0] + input[31];
    1582    30621900 :     bf1[1] = input[1] + input[30];
    1583    30621900 :     bf1[2] = input[2] + input[29];
    1584    30621900 :     bf1[3] = input[3] + input[28];
    1585    30621900 :     bf1[4] = input[4] + input[27];
    1586    30621900 :     bf1[5] = input[5] + input[26];
    1587    30621900 :     bf1[6] = input[6] + input[25];
    1588    30621900 :     bf1[7] = input[7] + input[24];
    1589    30621900 :     bf1[8] = input[8] + input[23];
    1590    30621900 :     bf1[9] = input[9] + input[22];
    1591    30621900 :     bf1[10] = input[10] + input[21];
    1592    30621900 :     bf1[11] = input[11] + input[20];
    1593    30621900 :     bf1[12] = input[12] + input[19];
    1594    30621900 :     bf1[13] = input[13] + input[18];
    1595    30621900 :     bf1[14] = input[14] + input[17];
    1596    30621900 :     bf1[15] = input[15] + input[16];
    1597    30621900 :     bf1[16] = -input[16] + input[15];
    1598    30621900 :     bf1[17] = -input[17] + input[14];
    1599    30621900 :     bf1[18] = -input[18] + input[13];
    1600    30621900 :     bf1[19] = -input[19] + input[12];
    1601    30621900 :     bf1[20] = -input[20] + input[11];
    1602    30621900 :     bf1[21] = -input[21] + input[10];
    1603    30621900 :     bf1[22] = -input[22] + input[9];
    1604    30621900 :     bf1[23] = -input[23] + input[8];
    1605    30621900 :     bf1[24] = -input[24] + input[7];
    1606    30621900 :     bf1[25] = -input[25] + input[6];
    1607    30621900 :     bf1[26] = -input[26] + input[5];
    1608    30621900 :     bf1[27] = -input[27] + input[4];
    1609    30621900 :     bf1[28] = -input[28] + input[3];
    1610    30621900 :     bf1[29] = -input[29] + input[2];
    1611    30621900 :     bf1[30] = -input[30] + input[1];
    1612    30621900 :     bf1[31] = -input[31] + input[0];
    1613    30621900 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1614             : 
    1615             :     // stage 2
    1616    30621900 :     stage++;
    1617    30621900 :     cospi = cospi_arr(cos_bit);
    1618    30646000 :     bf0 = output;
    1619    30646000 :     bf1 = step;
    1620    30646000 :     bf1[0] = bf0[0] + bf0[15];
    1621    30646000 :     bf1[1] = bf0[1] + bf0[14];
    1622    30646000 :     bf1[2] = bf0[2] + bf0[13];
    1623    30646000 :     bf1[3] = bf0[3] + bf0[12];
    1624    30646000 :     bf1[4] = bf0[4] + bf0[11];
    1625    30646000 :     bf1[5] = bf0[5] + bf0[10];
    1626    30646000 :     bf1[6] = bf0[6] + bf0[9];
    1627    30646000 :     bf1[7] = bf0[7] + bf0[8];
    1628    30646000 :     bf1[8] = -bf0[8] + bf0[7];
    1629    30646000 :     bf1[9] = -bf0[9] + bf0[6];
    1630    30646000 :     bf1[10] = -bf0[10] + bf0[5];
    1631    30646000 :     bf1[11] = -bf0[11] + bf0[4];
    1632    30646000 :     bf1[12] = -bf0[12] + bf0[3];
    1633    30646000 :     bf1[13] = -bf0[13] + bf0[2];
    1634    30646000 :     bf1[14] = -bf0[14] + bf0[1];
    1635    30646000 :     bf1[15] = -bf0[15] + bf0[0];
    1636    30646000 :     bf1[16] = bf0[16];
    1637    30646000 :     bf1[17] = bf0[17];
    1638    30646000 :     bf1[18] = bf0[18];
    1639    30646000 :     bf1[19] = bf0[19];
    1640    30646000 :     bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    1641    30642100 :     bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    1642    30631900 :     bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    1643    30619600 :     bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    1644    30612500 :     bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
    1645    30609800 :     bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
    1646    30608100 :     bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
    1647    30607600 :     bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
    1648    30607400 :     bf1[28] = bf0[28];
    1649    30607400 :     bf1[29] = bf0[29];
    1650    30607400 :     bf1[30] = bf0[30];
    1651    30607400 :     bf1[31] = bf0[31];
    1652    30607400 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1653             : 
    1654             :     // stage 3
    1655    30607400 :     stage++;
    1656    30607400 :     cospi = cospi_arr(cos_bit);
    1657    30612200 :     bf0 = step;
    1658    30612200 :     bf1 = output;
    1659    30612200 :     bf1[0] = bf0[0] + bf0[7];
    1660    30612200 :     bf1[1] = bf0[1] + bf0[6];
    1661    30612200 :     bf1[2] = bf0[2] + bf0[5];
    1662    30612200 :     bf1[3] = bf0[3] + bf0[4];
    1663    30612200 :     bf1[4] = -bf0[4] + bf0[3];
    1664    30612200 :     bf1[5] = -bf0[5] + bf0[2];
    1665    30612200 :     bf1[6] = -bf0[6] + bf0[1];
    1666    30612200 :     bf1[7] = -bf0[7] + bf0[0];
    1667    30612200 :     bf1[8] = bf0[8];
    1668    30612200 :     bf1[9] = bf0[9];
    1669    30612200 :     bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    1670    30615600 :     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    1671    30609200 :     bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    1672    30611800 :     bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    1673    30609400 :     bf1[14] = bf0[14];
    1674    30609400 :     bf1[15] = bf0[15];
    1675    30609400 :     bf1[16] = bf0[16] + bf0[23];
    1676    30609400 :     bf1[17] = bf0[17] + bf0[22];
    1677    30609400 :     bf1[18] = bf0[18] + bf0[21];
    1678    30609400 :     bf1[19] = bf0[19] + bf0[20];
    1679    30609400 :     bf1[20] = -bf0[20] + bf0[19];
    1680    30609400 :     bf1[21] = -bf0[21] + bf0[18];
    1681    30609400 :     bf1[22] = -bf0[22] + bf0[17];
    1682    30609400 :     bf1[23] = -bf0[23] + bf0[16];
    1683    30609400 :     bf1[24] = -bf0[24] + bf0[31];
    1684    30609400 :     bf1[25] = -bf0[25] + bf0[30];
    1685    30609400 :     bf1[26] = -bf0[26] + bf0[29];
    1686    30609400 :     bf1[27] = -bf0[27] + bf0[28];
    1687    30609400 :     bf1[28] = bf0[28] + bf0[27];
    1688    30609400 :     bf1[29] = bf0[29] + bf0[26];
    1689    30609400 :     bf1[30] = bf0[30] + bf0[25];
    1690    30609400 :     bf1[31] = bf0[31] + bf0[24];
    1691    30609400 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1692             : 
    1693             :     // stage 4
    1694    30609400 :     stage++;
    1695    30609400 :     cospi = cospi_arr(cos_bit);
    1696    30632300 :     bf0 = output;
    1697    30632300 :     bf1 = step;
    1698    30632300 :     bf1[0] = bf0[0] + bf0[3];
    1699    30632300 :     bf1[1] = bf0[1] + bf0[2];
    1700    30632300 :     bf1[2] = -bf0[2] + bf0[1];
    1701    30632300 :     bf1[3] = -bf0[3] + bf0[0];
    1702    30632300 :     bf1[4] = bf0[4];
    1703    30632300 :     bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    1704    30627300 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    1705    30622800 :     bf1[7] = bf0[7];
    1706    30622800 :     bf1[8] = bf0[8] + bf0[11];
    1707    30622800 :     bf1[9] = bf0[9] + bf0[10];
    1708    30622800 :     bf1[10] = -bf0[10] + bf0[9];
    1709    30622800 :     bf1[11] = -bf0[11] + bf0[8];
    1710    30622800 :     bf1[12] = -bf0[12] + bf0[15];
    1711    30622800 :     bf1[13] = -bf0[13] + bf0[14];
    1712    30622800 :     bf1[14] = bf0[14] + bf0[13];
    1713    30622800 :     bf1[15] = bf0[15] + bf0[12];
    1714    30622800 :     bf1[16] = bf0[16];
    1715    30622800 :     bf1[17] = bf0[17];
    1716    30622800 :     bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    1717    30619200 :     bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    1718    30610400 :     bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    1719    30609000 :     bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    1720    30602500 :     bf1[22] = bf0[22];
    1721    30602500 :     bf1[23] = bf0[23];
    1722    30602500 :     bf1[24] = bf0[24];
    1723    30602500 :     bf1[25] = bf0[25];
    1724    30602500 :     bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
    1725    30602000 :     bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
    1726    30599600 :     bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
    1727    30604400 :     bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
    1728    30604100 :     bf1[30] = bf0[30];
    1729    30604100 :     bf1[31] = bf0[31];
    1730    30604100 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1731             : 
    1732             :     // stage 5
    1733    30604100 :     stage++;
    1734    30604100 :     cospi = cospi_arr(cos_bit);
    1735    30609600 :     bf0 = step;
    1736    30609600 :     bf1 = output;
    1737    30609600 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    1738    30604800 :     bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    1739    30604000 :     bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    1740    30603200 :     bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    1741    30606000 :     bf1[4] = bf0[4] + bf0[5];
    1742    30606000 :     bf1[5] = -bf0[5] + bf0[4];
    1743    30606000 :     bf1[6] = -bf0[6] + bf0[7];
    1744    30606000 :     bf1[7] = bf0[7] + bf0[6];
    1745    30606000 :     bf1[8] = bf0[8];
    1746    30606000 :     bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    1747    30611500 :     bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    1748    30611400 :     bf1[11] = bf0[11];
    1749    30611400 :     bf1[12] = bf0[12];
    1750    30611400 :     bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    1751    30612900 :     bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    1752    30608100 :     bf1[15] = bf0[15];
    1753    30608100 :     bf1[16] = bf0[16] + bf0[19];
    1754    30608100 :     bf1[17] = bf0[17] + bf0[18];
    1755    30608100 :     bf1[18] = -bf0[18] + bf0[17];
    1756    30608100 :     bf1[19] = -bf0[19] + bf0[16];
    1757    30608100 :     bf1[20] = -bf0[20] + bf0[23];
    1758    30608100 :     bf1[21] = -bf0[21] + bf0[22];
    1759    30608100 :     bf1[22] = bf0[22] + bf0[21];
    1760    30608100 :     bf1[23] = bf0[23] + bf0[20];
    1761    30608100 :     bf1[24] = bf0[24] + bf0[27];
    1762    30608100 :     bf1[25] = bf0[25] + bf0[26];
    1763    30608100 :     bf1[26] = -bf0[26] + bf0[25];
    1764    30608100 :     bf1[27] = -bf0[27] + bf0[24];
    1765    30608100 :     bf1[28] = -bf0[28] + bf0[31];
    1766    30608100 :     bf1[29] = -bf0[29] + bf0[30];
    1767    30608100 :     bf1[30] = bf0[30] + bf0[29];
    1768    30608100 :     bf1[31] = bf0[31] + bf0[28];
    1769    30608100 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1770             : 
    1771             :     // stage 6
    1772    30608100 :     stage++;
    1773    30608100 :     cospi = cospi_arr(cos_bit);
    1774    30626900 :     bf0 = output;
    1775    30626900 :     bf1 = step;
    1776    30626900 :     bf1[0] = bf0[0];
    1777    30626900 :     bf1[1] = bf0[1];
    1778    30626900 :     bf1[2] = bf0[2];
    1779    30626900 :     bf1[3] = bf0[3];
    1780    30626900 :     bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    1781    30617100 :     bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    1782    30612000 :     bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    1783    30609900 :     bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    1784    30607700 :     bf1[8] = bf0[8] + bf0[9];
    1785    30607700 :     bf1[9] = -bf0[9] + bf0[8];
    1786    30607700 :     bf1[10] = -bf0[10] + bf0[11];
    1787    30607700 :     bf1[11] = bf0[11] + bf0[10];
    1788    30607700 :     bf1[12] = bf0[12] + bf0[13];
    1789    30607700 :     bf1[13] = -bf0[13] + bf0[12];
    1790    30607700 :     bf1[14] = -bf0[14] + bf0[15];
    1791    30607700 :     bf1[15] = bf0[15] + bf0[14];
    1792    30607700 :     bf1[16] = bf0[16];
    1793    30607700 :     bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    1794    30616600 :     bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    1795    30608800 :     bf1[19] = bf0[19];
    1796    30608800 :     bf1[20] = bf0[20];
    1797    30608800 :     bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    1798    30608200 :     bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    1799    30606500 :     bf1[23] = bf0[23];
    1800    30606500 :     bf1[24] = bf0[24];
    1801    30606500 :     bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
    1802    30606500 :     bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
    1803    30605800 :     bf1[27] = bf0[27];
    1804    30605800 :     bf1[28] = bf0[28];
    1805    30605800 :     bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
    1806    30606400 :     bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
    1807    30605400 :     bf1[31] = bf0[31];
    1808    30605400 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1809             : 
    1810             :     // stage 7
    1811    30605400 :     stage++;
    1812    30605400 :     cospi = cospi_arr(cos_bit);
    1813    30609200 :     bf0 = step;
    1814    30609200 :     bf1 = output;
    1815    30609200 :     bf1[0] = bf0[0];
    1816    30609200 :     bf1[1] = bf0[1];
    1817    30609200 :     bf1[2] = bf0[2];
    1818    30609200 :     bf1[3] = bf0[3];
    1819    30609200 :     bf1[4] = bf0[4];
    1820    30609200 :     bf1[5] = bf0[5];
    1821    30609200 :     bf1[6] = bf0[6];
    1822    30609200 :     bf1[7] = bf0[7];
    1823    30609200 :     bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    1824    30612100 :     bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
    1825    30600700 :     bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    1826    30604600 :     bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
    1827    30600800 :     bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    1828    30600700 :     bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
    1829    30600600 :     bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
    1830    30600700 :     bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
    1831    30599900 :     bf1[16] = bf0[16] + bf0[17];
    1832    30599900 :     bf1[17] = -bf0[17] + bf0[16];
    1833    30599900 :     bf1[18] = -bf0[18] + bf0[19];
    1834    30599900 :     bf1[19] = bf0[19] + bf0[18];
    1835    30599900 :     bf1[20] = bf0[20] + bf0[21];
    1836    30599900 :     bf1[21] = -bf0[21] + bf0[20];
    1837    30599900 :     bf1[22] = -bf0[22] + bf0[23];
    1838    30599900 :     bf1[23] = bf0[23] + bf0[22];
    1839    30599900 :     bf1[24] = bf0[24] + bf0[25];
    1840    30599900 :     bf1[25] = -bf0[25] + bf0[24];
    1841    30599900 :     bf1[26] = -bf0[26] + bf0[27];
    1842    30599900 :     bf1[27] = bf0[27] + bf0[26];
    1843    30599900 :     bf1[28] = bf0[28] + bf0[29];
    1844    30599900 :     bf1[29] = -bf0[29] + bf0[28];
    1845    30599900 :     bf1[30] = -bf0[30] + bf0[31];
    1846    30599900 :     bf1[31] = bf0[31] + bf0[30];
    1847    30599900 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1848             : 
    1849             :     // stage 8
    1850    30599900 :     stage++;
    1851    30599900 :     cospi = cospi_arr(cos_bit);
    1852    30625700 :     bf0 = output;
    1853    30625700 :     bf1 = step;
    1854    30625700 :     bf1[0] = bf0[0];
    1855    30625700 :     bf1[1] = bf0[1];
    1856    30625700 :     bf1[2] = bf0[2];
    1857    30625700 :     bf1[3] = bf0[3];
    1858    30625700 :     bf1[4] = bf0[4];
    1859    30625700 :     bf1[5] = bf0[5];
    1860    30625700 :     bf1[6] = bf0[6];
    1861    30625700 :     bf1[7] = bf0[7];
    1862    30625700 :     bf1[8] = bf0[8];
    1863    30625700 :     bf1[9] = bf0[9];
    1864    30625700 :     bf1[10] = bf0[10];
    1865    30625700 :     bf1[11] = bf0[11];
    1866    30625700 :     bf1[12] = bf0[12];
    1867    30625700 :     bf1[13] = bf0[13];
    1868    30625700 :     bf1[14] = bf0[14];
    1869    30625700 :     bf1[15] = bf0[15];
    1870    30625700 :     bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
    1871    30635000 :     bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
    1872    30620800 :     bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
    1873    30612800 :     bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
    1874    30607900 :     bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
    1875    30605800 :     bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
    1876    30604800 :     bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
    1877    30607100 :     bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
    1878    30604000 :     bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
    1879    30600600 :     bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
    1880    30596700 :     bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
    1881    30595700 :     bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
    1882    30595600 :     bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
    1883    30595600 :     bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
    1884    30595400 :     bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
    1885    30595500 :     bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
    1886    30595700 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1887             : 
    1888             :     // stage 9
    1889    30595700 :     stage++;
    1890    30595700 :     bf0 = step;
    1891    30595700 :     bf1 = output;
    1892    30595700 :     bf1[0] = bf0[0];
    1893    30595700 :     bf1[1] = bf0[16];
    1894    30595700 :     bf1[2] = bf0[8];
    1895    30595700 :     bf1[3] = bf0[24];
    1896    30595700 :     bf1[4] = bf0[4];
    1897    30595700 :     bf1[5] = bf0[20];
    1898    30595700 :     bf1[6] = bf0[12];
    1899    30595700 :     bf1[7] = bf0[28];
    1900    30595700 :     bf1[8] = bf0[2];
    1901    30595700 :     bf1[9] = bf0[18];
    1902    30595700 :     bf1[10] = bf0[10];
    1903    30595700 :     bf1[11] = bf0[26];
    1904    30595700 :     bf1[12] = bf0[6];
    1905    30595700 :     bf1[13] = bf0[22];
    1906    30595700 :     bf1[14] = bf0[14];
    1907    30595700 :     bf1[15] = bf0[30];
    1908    30595700 :     bf1[16] = bf0[1];
    1909    30595700 :     bf1[17] = bf0[17];
    1910    30595700 :     bf1[18] = bf0[9];
    1911    30595700 :     bf1[19] = bf0[25];
    1912    30595700 :     bf1[20] = bf0[5];
    1913    30595700 :     bf1[21] = bf0[21];
    1914    30595700 :     bf1[22] = bf0[13];
    1915    30595700 :     bf1[23] = bf0[29];
    1916    30595700 :     bf1[24] = bf0[3];
    1917    30595700 :     bf1[25] = bf0[19];
    1918    30595700 :     bf1[26] = bf0[11];
    1919    30595700 :     bf1[27] = bf0[27];
    1920    30595700 :     bf1[28] = bf0[7];
    1921    30595700 :     bf1[29] = bf0[23];
    1922    30595700 :     bf1[30] = bf0[15];
    1923    30595700 :     bf1[31] = bf0[31];
    1924    30595700 :     range_check(stage, input, bf1, size, stage_range[stage]);
    1925    30595700 : }
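A note on the two helpers every stage above and below relies on: half_btf() is the "half butterfly" that produces one output of a fixed-point two-point rotation, and cospi_arr(cos_bit) returns cosine weights scaled by 2^cos_bit (cospi[i] is approximately cos(i*pi/128) * 2^cos_bit, so cospi[64 - i] plays the role of the matching sine). A minimal sketch of what they compute, consistent with how they are called in this file (the canonical definitions live in the shared transform headers):

    static inline int32_t round_shift(int64_t value, int bit) {
        /* Round to nearest, then drop the 2^bit weight scaling. */
        return (int32_t)((value + (1LL << (bit - 1))) >> bit);
    }

    static inline int32_t half_btf(int32_t w0, int32_t in0, int32_t w1,
                                   int32_t in1, int bit) {
        /* One output of the weighted pair w0*in0 + w1*in1, accumulated in
         * 64 bits so the sum cannot overflow before the rounding shift. */
        const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
        return round_shift(sum, bit);
    }

Stage 9 just above does no arithmetic at all: it is the 5-bit bit-reversal reorder (output[k] = step[bitrev5(k)], e.g. 1 -> 16, 2 -> 8, 3 -> 24) that leaves the 32 forward-DCT coefficients in natural order.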
    1926           0 : void eb_av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    1927             :     const int8_t *stage_range) {
    1928           0 :     const int32_t size = 64;
    1929             :     const int32_t *cospi;
    1930             : 
    1931           0 :     int32_t stage = 0;
    1932             :     int32_t *bf0, *bf1;
    1933             :     int32_t step[64];
    1934             : 
    1935             :     // stage 0;
    1936           0 :     range_check(stage, input, input, size, stage_range[stage]);
    1937             : 
    1938             :     // stage 1;
    1939           0 :     stage++;
    1940           0 :     bf1 = output;
    1941           0 :     bf1[0] = input[0] + input[63];
    1942           0 :     bf1[1] = input[1] + input[62];
    1943           0 :     bf1[2] = input[2] + input[61];
    1944           0 :     bf1[3] = input[3] + input[60];
    1945           0 :     bf1[4] = input[4] + input[59];
    1946           0 :     bf1[5] = input[5] + input[58];
    1947           0 :     bf1[6] = input[6] + input[57];
    1948           0 :     bf1[7] = input[7] + input[56];
    1949           0 :     bf1[8] = input[8] + input[55];
    1950           0 :     bf1[9] = input[9] + input[54];
    1951           0 :     bf1[10] = input[10] + input[53];
    1952           0 :     bf1[11] = input[11] + input[52];
    1953           0 :     bf1[12] = input[12] + input[51];
    1954           0 :     bf1[13] = input[13] + input[50];
    1955           0 :     bf1[14] = input[14] + input[49];
    1956           0 :     bf1[15] = input[15] + input[48];
    1957           0 :     bf1[16] = input[16] + input[47];
    1958           0 :     bf1[17] = input[17] + input[46];
    1959           0 :     bf1[18] = input[18] + input[45];
    1960           0 :     bf1[19] = input[19] + input[44];
    1961           0 :     bf1[20] = input[20] + input[43];
    1962           0 :     bf1[21] = input[21] + input[42];
    1963           0 :     bf1[22] = input[22] + input[41];
    1964           0 :     bf1[23] = input[23] + input[40];
    1965           0 :     bf1[24] = input[24] + input[39];
    1966           0 :     bf1[25] = input[25] + input[38];
    1967           0 :     bf1[26] = input[26] + input[37];
    1968           0 :     bf1[27] = input[27] + input[36];
    1969           0 :     bf1[28] = input[28] + input[35];
    1970           0 :     bf1[29] = input[29] + input[34];
    1971           0 :     bf1[30] = input[30] + input[33];
    1972           0 :     bf1[31] = input[31] + input[32];
    1973           0 :     bf1[32] = -input[32] + input[31];
    1974           0 :     bf1[33] = -input[33] + input[30];
    1975           0 :     bf1[34] = -input[34] + input[29];
    1976           0 :     bf1[35] = -input[35] + input[28];
    1977           0 :     bf1[36] = -input[36] + input[27];
    1978           0 :     bf1[37] = -input[37] + input[26];
    1979           0 :     bf1[38] = -input[38] + input[25];
    1980           0 :     bf1[39] = -input[39] + input[24];
    1981           0 :     bf1[40] = -input[40] + input[23];
    1982           0 :     bf1[41] = -input[41] + input[22];
    1983           0 :     bf1[42] = -input[42] + input[21];
    1984           0 :     bf1[43] = -input[43] + input[20];
    1985           0 :     bf1[44] = -input[44] + input[19];
    1986           0 :     bf1[45] = -input[45] + input[18];
    1987           0 :     bf1[46] = -input[46] + input[17];
    1988           0 :     bf1[47] = -input[47] + input[16];
    1989           0 :     bf1[48] = -input[48] + input[15];
    1990           0 :     bf1[49] = -input[49] + input[14];
    1991           0 :     bf1[50] = -input[50] + input[13];
    1992           0 :     bf1[51] = -input[51] + input[12];
    1993           0 :     bf1[52] = -input[52] + input[11];
    1994           0 :     bf1[53] = -input[53] + input[10];
    1995           0 :     bf1[54] = -input[54] + input[9];
    1996           0 :     bf1[55] = -input[55] + input[8];
    1997           0 :     bf1[56] = -input[56] + input[7];
    1998           0 :     bf1[57] = -input[57] + input[6];
    1999           0 :     bf1[58] = -input[58] + input[5];
    2000           0 :     bf1[59] = -input[59] + input[4];
    2001           0 :     bf1[60] = -input[60] + input[3];
    2002           0 :     bf1[61] = -input[61] + input[2];
    2003           0 :     bf1[62] = -input[62] + input[1];
    2004           0 :     bf1[63] = -input[63] + input[0];
    2005           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2006             : 
    2007             :     // stage 2
    2008           0 :     stage++;
    2009           0 :     cospi = cospi_arr(cos_bit);
    2010           0 :     bf0 = output;
    2011           0 :     bf1 = step;
    2012           0 :     bf1[0] = bf0[0] + bf0[31];
    2013           0 :     bf1[1] = bf0[1] + bf0[30];
    2014           0 :     bf1[2] = bf0[2] + bf0[29];
    2015           0 :     bf1[3] = bf0[3] + bf0[28];
    2016           0 :     bf1[4] = bf0[4] + bf0[27];
    2017           0 :     bf1[5] = bf0[5] + bf0[26];
    2018           0 :     bf1[6] = bf0[6] + bf0[25];
    2019           0 :     bf1[7] = bf0[7] + bf0[24];
    2020           0 :     bf1[8] = bf0[8] + bf0[23];
    2021           0 :     bf1[9] = bf0[9] + bf0[22];
    2022           0 :     bf1[10] = bf0[10] + bf0[21];
    2023           0 :     bf1[11] = bf0[11] + bf0[20];
    2024           0 :     bf1[12] = bf0[12] + bf0[19];
    2025           0 :     bf1[13] = bf0[13] + bf0[18];
    2026           0 :     bf1[14] = bf0[14] + bf0[17];
    2027           0 :     bf1[15] = bf0[15] + bf0[16];
    2028           0 :     bf1[16] = -bf0[16] + bf0[15];
    2029           0 :     bf1[17] = -bf0[17] + bf0[14];
    2030           0 :     bf1[18] = -bf0[18] + bf0[13];
    2031           0 :     bf1[19] = -bf0[19] + bf0[12];
    2032           0 :     bf1[20] = -bf0[20] + bf0[11];
    2033           0 :     bf1[21] = -bf0[21] + bf0[10];
    2034           0 :     bf1[22] = -bf0[22] + bf0[9];
    2035           0 :     bf1[23] = -bf0[23] + bf0[8];
    2036           0 :     bf1[24] = -bf0[24] + bf0[7];
    2037           0 :     bf1[25] = -bf0[25] + bf0[6];
    2038           0 :     bf1[26] = -bf0[26] + bf0[5];
    2039           0 :     bf1[27] = -bf0[27] + bf0[4];
    2040           0 :     bf1[28] = -bf0[28] + bf0[3];
    2041           0 :     bf1[29] = -bf0[29] + bf0[2];
    2042           0 :     bf1[30] = -bf0[30] + bf0[1];
    2043           0 :     bf1[31] = -bf0[31] + bf0[0];
    2044           0 :     bf1[32] = bf0[32];
    2045           0 :     bf1[33] = bf0[33];
    2046           0 :     bf1[34] = bf0[34];
    2047           0 :     bf1[35] = bf0[35];
    2048           0 :     bf1[36] = bf0[36];
    2049           0 :     bf1[37] = bf0[37];
    2050           0 :     bf1[38] = bf0[38];
    2051           0 :     bf1[39] = bf0[39];
    2052           0 :     bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
    2053           0 :     bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
    2054           0 :     bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
    2055           0 :     bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
    2056           0 :     bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
    2057           0 :     bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
    2058           0 :     bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
    2059           0 :     bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
    2060           0 :     bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
    2061           0 :     bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
    2062           0 :     bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
    2063           0 :     bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
    2064           0 :     bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
    2065           0 :     bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
    2066           0 :     bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
    2067           0 :     bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
    2068           0 :     bf1[56] = bf0[56];
    2069           0 :     bf1[57] = bf0[57];
    2070           0 :     bf1[58] = bf0[58];
    2071           0 :     bf1[59] = bf0[59];
    2072           0 :     bf1[60] = bf0[60];
    2073           0 :     bf1[61] = bf0[61];
    2074           0 :     bf1[62] = bf0[62];
    2075           0 :     bf1[63] = bf0[63];
    2076           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2077             : 
    2078             :     // stage 3
    2079           0 :     stage++;
    2080           0 :     cospi = cospi_arr(cos_bit);
    2081           0 :     bf0 = step;
    2082           0 :     bf1 = output;
    2083           0 :     bf1[0] = bf0[0] + bf0[15];
    2084           0 :     bf1[1] = bf0[1] + bf0[14];
    2085           0 :     bf1[2] = bf0[2] + bf0[13];
    2086           0 :     bf1[3] = bf0[3] + bf0[12];
    2087           0 :     bf1[4] = bf0[4] + bf0[11];
    2088           0 :     bf1[5] = bf0[5] + bf0[10];
    2089           0 :     bf1[6] = bf0[6] + bf0[9];
    2090           0 :     bf1[7] = bf0[7] + bf0[8];
    2091           0 :     bf1[8] = -bf0[8] + bf0[7];
    2092           0 :     bf1[9] = -bf0[9] + bf0[6];
    2093           0 :     bf1[10] = -bf0[10] + bf0[5];
    2094           0 :     bf1[11] = -bf0[11] + bf0[4];
    2095           0 :     bf1[12] = -bf0[12] + bf0[3];
    2096           0 :     bf1[13] = -bf0[13] + bf0[2];
    2097           0 :     bf1[14] = -bf0[14] + bf0[1];
    2098           0 :     bf1[15] = -bf0[15] + bf0[0];
    2099           0 :     bf1[16] = bf0[16];
    2100           0 :     bf1[17] = bf0[17];
    2101           0 :     bf1[18] = bf0[18];
    2102           0 :     bf1[19] = bf0[19];
    2103           0 :     bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    2104           0 :     bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    2105           0 :     bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    2106           0 :     bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    2107           0 :     bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
    2108           0 :     bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
    2109           0 :     bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
    2110           0 :     bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
    2111           0 :     bf1[28] = bf0[28];
    2112           0 :     bf1[29] = bf0[29];
    2113           0 :     bf1[30] = bf0[30];
    2114           0 :     bf1[31] = bf0[31];
    2115           0 :     bf1[32] = bf0[32] + bf0[47];
    2116           0 :     bf1[33] = bf0[33] + bf0[46];
    2117           0 :     bf1[34] = bf0[34] + bf0[45];
    2118           0 :     bf1[35] = bf0[35] + bf0[44];
    2119           0 :     bf1[36] = bf0[36] + bf0[43];
    2120           0 :     bf1[37] = bf0[37] + bf0[42];
    2121           0 :     bf1[38] = bf0[38] + bf0[41];
    2122           0 :     bf1[39] = bf0[39] + bf0[40];
    2123           0 :     bf1[40] = -bf0[40] + bf0[39];
    2124           0 :     bf1[41] = -bf0[41] + bf0[38];
    2125           0 :     bf1[42] = -bf0[42] + bf0[37];
    2126           0 :     bf1[43] = -bf0[43] + bf0[36];
    2127           0 :     bf1[44] = -bf0[44] + bf0[35];
    2128           0 :     bf1[45] = -bf0[45] + bf0[34];
    2129           0 :     bf1[46] = -bf0[46] + bf0[33];
    2130           0 :     bf1[47] = -bf0[47] + bf0[32];
    2131           0 :     bf1[48] = -bf0[48] + bf0[63];
    2132           0 :     bf1[49] = -bf0[49] + bf0[62];
    2133           0 :     bf1[50] = -bf0[50] + bf0[61];
    2134           0 :     bf1[51] = -bf0[51] + bf0[60];
    2135           0 :     bf1[52] = -bf0[52] + bf0[59];
    2136           0 :     bf1[53] = -bf0[53] + bf0[58];
    2137           0 :     bf1[54] = -bf0[54] + bf0[57];
    2138           0 :     bf1[55] = -bf0[55] + bf0[56];
    2139           0 :     bf1[56] = bf0[56] + bf0[55];
    2140           0 :     bf1[57] = bf0[57] + bf0[54];
    2141           0 :     bf1[58] = bf0[58] + bf0[53];
    2142           0 :     bf1[59] = bf0[59] + bf0[52];
    2143           0 :     bf1[60] = bf0[60] + bf0[51];
    2144           0 :     bf1[61] = bf0[61] + bf0[50];
    2145           0 :     bf1[62] = bf0[62] + bf0[49];
    2146           0 :     bf1[63] = bf0[63] + bf0[48];
    2147           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2148             : 
    2149             :     // stage 4
    2150           0 :     stage++;
    2151           0 :     cospi = cospi_arr(cos_bit);
    2152           0 :     bf0 = output;
    2153           0 :     bf1 = step;
    2154           0 :     bf1[0] = bf0[0] + bf0[7];
    2155           0 :     bf1[1] = bf0[1] + bf0[6];
    2156           0 :     bf1[2] = bf0[2] + bf0[5];
    2157           0 :     bf1[3] = bf0[3] + bf0[4];
    2158           0 :     bf1[4] = -bf0[4] + bf0[3];
    2159           0 :     bf1[5] = -bf0[5] + bf0[2];
    2160           0 :     bf1[6] = -bf0[6] + bf0[1];
    2161           0 :     bf1[7] = -bf0[7] + bf0[0];
    2162           0 :     bf1[8] = bf0[8];
    2163           0 :     bf1[9] = bf0[9];
    2164           0 :     bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    2165           0 :     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    2166           0 :     bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    2167           0 :     bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    2168           0 :     bf1[14] = bf0[14];
    2169           0 :     bf1[15] = bf0[15];
    2170           0 :     bf1[16] = bf0[16] + bf0[23];
    2171           0 :     bf1[17] = bf0[17] + bf0[22];
    2172           0 :     bf1[18] = bf0[18] + bf0[21];
    2173           0 :     bf1[19] = bf0[19] + bf0[20];
    2174           0 :     bf1[20] = -bf0[20] + bf0[19];
    2175           0 :     bf1[21] = -bf0[21] + bf0[18];
    2176           0 :     bf1[22] = -bf0[22] + bf0[17];
    2177           0 :     bf1[23] = -bf0[23] + bf0[16];
    2178           0 :     bf1[24] = -bf0[24] + bf0[31];
    2179           0 :     bf1[25] = -bf0[25] + bf0[30];
    2180           0 :     bf1[26] = -bf0[26] + bf0[29];
    2181           0 :     bf1[27] = -bf0[27] + bf0[28];
    2182           0 :     bf1[28] = bf0[28] + bf0[27];
    2183           0 :     bf1[29] = bf0[29] + bf0[26];
    2184           0 :     bf1[30] = bf0[30] + bf0[25];
    2185           0 :     bf1[31] = bf0[31] + bf0[24];
    2186           0 :     bf1[32] = bf0[32];
    2187           0 :     bf1[33] = bf0[33];
    2188           0 :     bf1[34] = bf0[34];
    2189           0 :     bf1[35] = bf0[35];
    2190           0 :     bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
    2191           0 :     bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
    2192           0 :     bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
    2193           0 :     bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
    2194           0 :     bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
    2195           0 :     bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
    2196           0 :     bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
    2197           0 :     bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
    2198           0 :     bf1[44] = bf0[44];
    2199           0 :     bf1[45] = bf0[45];
    2200           0 :     bf1[46] = bf0[46];
    2201           0 :     bf1[47] = bf0[47];
    2202           0 :     bf1[48] = bf0[48];
    2203           0 :     bf1[49] = bf0[49];
    2204           0 :     bf1[50] = bf0[50];
    2205           0 :     bf1[51] = bf0[51];
    2206           0 :     bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
    2207           0 :     bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
    2208           0 :     bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
    2209           0 :     bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
    2210           0 :     bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
    2211           0 :     bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
    2212           0 :     bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
    2213           0 :     bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
    2214           0 :     bf1[60] = bf0[60];
    2215           0 :     bf1[61] = bf0[61];
    2216           0 :     bf1[62] = bf0[62];
    2217           0 :     bf1[63] = bf0[63];
    2218           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2219             : 
    2220             :     // stage 5
    2221           0 :     stage++;
    2222           0 :     cospi = cospi_arr(cos_bit);
    2223           0 :     bf0 = step;
    2224           0 :     bf1 = output;
    2225           0 :     bf1[0] = bf0[0] + bf0[3];
    2226           0 :     bf1[1] = bf0[1] + bf0[2];
    2227           0 :     bf1[2] = -bf0[2] + bf0[1];
    2228           0 :     bf1[3] = -bf0[3] + bf0[0];
    2229           0 :     bf1[4] = bf0[4];
    2230           0 :     bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    2231           0 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    2232           0 :     bf1[7] = bf0[7];
    2233           0 :     bf1[8] = bf0[8] + bf0[11];
    2234           0 :     bf1[9] = bf0[9] + bf0[10];
    2235           0 :     bf1[10] = -bf0[10] + bf0[9];
    2236           0 :     bf1[11] = -bf0[11] + bf0[8];
    2237           0 :     bf1[12] = -bf0[12] + bf0[15];
    2238           0 :     bf1[13] = -bf0[13] + bf0[14];
    2239           0 :     bf1[14] = bf0[14] + bf0[13];
    2240           0 :     bf1[15] = bf0[15] + bf0[12];
    2241           0 :     bf1[16] = bf0[16];
    2242           0 :     bf1[17] = bf0[17];
    2243           0 :     bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    2244           0 :     bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    2245           0 :     bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    2246           0 :     bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    2247           0 :     bf1[22] = bf0[22];
    2248           0 :     bf1[23] = bf0[23];
    2249           0 :     bf1[24] = bf0[24];
    2250           0 :     bf1[25] = bf0[25];
    2251           0 :     bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
    2252           0 :     bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
    2253           0 :     bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
    2254           0 :     bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
    2255           0 :     bf1[30] = bf0[30];
    2256           0 :     bf1[31] = bf0[31];
    2257           0 :     bf1[32] = bf0[32] + bf0[39];
    2258           0 :     bf1[33] = bf0[33] + bf0[38];
    2259           0 :     bf1[34] = bf0[34] + bf0[37];
    2260           0 :     bf1[35] = bf0[35] + bf0[36];
    2261           0 :     bf1[36] = -bf0[36] + bf0[35];
    2262           0 :     bf1[37] = -bf0[37] + bf0[34];
    2263           0 :     bf1[38] = -bf0[38] + bf0[33];
    2264           0 :     bf1[39] = -bf0[39] + bf0[32];
    2265           0 :     bf1[40] = -bf0[40] + bf0[47];
    2266           0 :     bf1[41] = -bf0[41] + bf0[46];
    2267           0 :     bf1[42] = -bf0[42] + bf0[45];
    2268           0 :     bf1[43] = -bf0[43] + bf0[44];
    2269           0 :     bf1[44] = bf0[44] + bf0[43];
    2270           0 :     bf1[45] = bf0[45] + bf0[42];
    2271           0 :     bf1[46] = bf0[46] + bf0[41];
    2272           0 :     bf1[47] = bf0[47] + bf0[40];
    2273           0 :     bf1[48] = bf0[48] + bf0[55];
    2274           0 :     bf1[49] = bf0[49] + bf0[54];
    2275           0 :     bf1[50] = bf0[50] + bf0[53];
    2276           0 :     bf1[51] = bf0[51] + bf0[52];
    2277           0 :     bf1[52] = -bf0[52] + bf0[51];
    2278           0 :     bf1[53] = -bf0[53] + bf0[50];
    2279           0 :     bf1[54] = -bf0[54] + bf0[49];
    2280           0 :     bf1[55] = -bf0[55] + bf0[48];
    2281           0 :     bf1[56] = -bf0[56] + bf0[63];
    2282           0 :     bf1[57] = -bf0[57] + bf0[62];
    2283           0 :     bf1[58] = -bf0[58] + bf0[61];
    2284           0 :     bf1[59] = -bf0[59] + bf0[60];
    2285           0 :     bf1[60] = bf0[60] + bf0[59];
    2286           0 :     bf1[61] = bf0[61] + bf0[58];
    2287           0 :     bf1[62] = bf0[62] + bf0[57];
    2288           0 :     bf1[63] = bf0[63] + bf0[56];
    2289           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2290             : 
    2291             :     // stage 6
    2292           0 :     stage++;
    2293           0 :     cospi = cospi_arr(cos_bit);
    2294           0 :     bf0 = output;
    2295           0 :     bf1 = step;
    2296           0 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    2297           0 :     bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    2298           0 :     bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    2299           0 :     bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    2300           0 :     bf1[4] = bf0[4] + bf0[5];
    2301           0 :     bf1[5] = -bf0[5] + bf0[4];
    2302           0 :     bf1[6] = -bf0[6] + bf0[7];
    2303           0 :     bf1[7] = bf0[7] + bf0[6];
    2304           0 :     bf1[8] = bf0[8];
    2305           0 :     bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    2306           0 :     bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    2307           0 :     bf1[11] = bf0[11];
    2308           0 :     bf1[12] = bf0[12];
    2309           0 :     bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    2310           0 :     bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    2311           0 :     bf1[15] = bf0[15];
    2312           0 :     bf1[16] = bf0[16] + bf0[19];
    2313           0 :     bf1[17] = bf0[17] + bf0[18];
    2314           0 :     bf1[18] = -bf0[18] + bf0[17];
    2315           0 :     bf1[19] = -bf0[19] + bf0[16];
    2316           0 :     bf1[20] = -bf0[20] + bf0[23];
    2317           0 :     bf1[21] = -bf0[21] + bf0[22];
    2318           0 :     bf1[22] = bf0[22] + bf0[21];
    2319           0 :     bf1[23] = bf0[23] + bf0[20];
    2320           0 :     bf1[24] = bf0[24] + bf0[27];
    2321           0 :     bf1[25] = bf0[25] + bf0[26];
    2322           0 :     bf1[26] = -bf0[26] + bf0[25];
    2323           0 :     bf1[27] = -bf0[27] + bf0[24];
    2324           0 :     bf1[28] = -bf0[28] + bf0[31];
    2325           0 :     bf1[29] = -bf0[29] + bf0[30];
    2326           0 :     bf1[30] = bf0[30] + bf0[29];
    2327           0 :     bf1[31] = bf0[31] + bf0[28];
    2328           0 :     bf1[32] = bf0[32];
    2329           0 :     bf1[33] = bf0[33];
    2330           0 :     bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
    2331           0 :     bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
    2332           0 :     bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
    2333           0 :     bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
    2334           0 :     bf1[38] = bf0[38];
    2335           0 :     bf1[39] = bf0[39];
    2336           0 :     bf1[40] = bf0[40];
    2337           0 :     bf1[41] = bf0[41];
    2338           0 :     bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
    2339           0 :     bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
    2340           0 :     bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
    2341           0 :     bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
    2342           0 :     bf1[46] = bf0[46];
    2343           0 :     bf1[47] = bf0[47];
    2344           0 :     bf1[48] = bf0[48];
    2345           0 :     bf1[49] = bf0[49];
    2346           0 :     bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
    2347           0 :     bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
    2348           0 :     bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
    2349           0 :     bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
    2350           0 :     bf1[54] = bf0[54];
    2351           0 :     bf1[55] = bf0[55];
    2352           0 :     bf1[56] = bf0[56];
    2353           0 :     bf1[57] = bf0[57];
    2354           0 :     bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
    2355           0 :     bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
    2356           0 :     bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
    2357           0 :     bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
    2358           0 :     bf1[62] = bf0[62];
    2359           0 :     bf1[63] = bf0[63];
    2360           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2361             : 
    2362             :     // stage 7
    2363           0 :     stage++;
    2364           0 :     cospi = cospi_arr(cos_bit);
    2365           0 :     bf0 = step;
    2366           0 :     bf1 = output;
    2367           0 :     bf1[0] = bf0[0];
    2368           0 :     bf1[1] = bf0[1];
    2369           0 :     bf1[2] = bf0[2];
    2370           0 :     bf1[3] = bf0[3];
    2371           0 :     bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    2372           0 :     bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    2373           0 :     bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    2374           0 :     bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    2375           0 :     bf1[8] = bf0[8] + bf0[9];
    2376           0 :     bf1[9] = -bf0[9] + bf0[8];
    2377           0 :     bf1[10] = -bf0[10] + bf0[11];
    2378           0 :     bf1[11] = bf0[11] + bf0[10];
    2379           0 :     bf1[12] = bf0[12] + bf0[13];
    2380           0 :     bf1[13] = -bf0[13] + bf0[12];
    2381           0 :     bf1[14] = -bf0[14] + bf0[15];
    2382           0 :     bf1[15] = bf0[15] + bf0[14];
    2383           0 :     bf1[16] = bf0[16];
    2384           0 :     bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    2385           0 :     bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    2386           0 :     bf1[19] = bf0[19];
    2387           0 :     bf1[20] = bf0[20];
    2388           0 :     bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    2389           0 :     bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    2390           0 :     bf1[23] = bf0[23];
    2391           0 :     bf1[24] = bf0[24];
    2392           0 :     bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
    2393           0 :     bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
    2394           0 :     bf1[27] = bf0[27];
    2395           0 :     bf1[28] = bf0[28];
    2396           0 :     bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
    2397           0 :     bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
    2398           0 :     bf1[31] = bf0[31];
    2399           0 :     bf1[32] = bf0[32] + bf0[35];
    2400           0 :     bf1[33] = bf0[33] + bf0[34];
    2401           0 :     bf1[34] = -bf0[34] + bf0[33];
    2402           0 :     bf1[35] = -bf0[35] + bf0[32];
    2403           0 :     bf1[36] = -bf0[36] + bf0[39];
    2404           0 :     bf1[37] = -bf0[37] + bf0[38];
    2405           0 :     bf1[38] = bf0[38] + bf0[37];
    2406           0 :     bf1[39] = bf0[39] + bf0[36];
    2407           0 :     bf1[40] = bf0[40] + bf0[43];
    2408           0 :     bf1[41] = bf0[41] + bf0[42];
    2409           0 :     bf1[42] = -bf0[42] + bf0[41];
    2410           0 :     bf1[43] = -bf0[43] + bf0[40];
    2411           0 :     bf1[44] = -bf0[44] + bf0[47];
    2412           0 :     bf1[45] = -bf0[45] + bf0[46];
    2413           0 :     bf1[46] = bf0[46] + bf0[45];
    2414           0 :     bf1[47] = bf0[47] + bf0[44];
    2415           0 :     bf1[48] = bf0[48] + bf0[51];
    2416           0 :     bf1[49] = bf0[49] + bf0[50];
    2417           0 :     bf1[50] = -bf0[50] + bf0[49];
    2418           0 :     bf1[51] = -bf0[51] + bf0[48];
    2419           0 :     bf1[52] = -bf0[52] + bf0[55];
    2420           0 :     bf1[53] = -bf0[53] + bf0[54];
    2421           0 :     bf1[54] = bf0[54] + bf0[53];
    2422           0 :     bf1[55] = bf0[55] + bf0[52];
    2423           0 :     bf1[56] = bf0[56] + bf0[59];
    2424           0 :     bf1[57] = bf0[57] + bf0[58];
    2425           0 :     bf1[58] = -bf0[58] + bf0[57];
    2426           0 :     bf1[59] = -bf0[59] + bf0[56];
    2427           0 :     bf1[60] = -bf0[60] + bf0[63];
    2428           0 :     bf1[61] = -bf0[61] + bf0[62];
    2429           0 :     bf1[62] = bf0[62] + bf0[61];
    2430           0 :     bf1[63] = bf0[63] + bf0[60];
    2431           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2432             : 
    2433             :     // stage 8
    2434           0 :     stage++;
    2435           0 :     cospi = cospi_arr(cos_bit);
    2436           0 :     bf0 = output;
    2437           0 :     bf1 = step;
    2438           0 :     bf1[0] = bf0[0];
    2439           0 :     bf1[1] = bf0[1];
    2440           0 :     bf1[2] = bf0[2];
    2441           0 :     bf1[3] = bf0[3];
    2442           0 :     bf1[4] = bf0[4];
    2443           0 :     bf1[5] = bf0[5];
    2444           0 :     bf1[6] = bf0[6];
    2445           0 :     bf1[7] = bf0[7];
    2446           0 :     bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    2447           0 :     bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
    2448           0 :     bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    2449           0 :     bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
    2450           0 :     bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    2451           0 :     bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
    2452           0 :     bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
    2453           0 :     bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
    2454           0 :     bf1[16] = bf0[16] + bf0[17];
    2455           0 :     bf1[17] = -bf0[17] + bf0[16];
    2456           0 :     bf1[18] = -bf0[18] + bf0[19];
    2457           0 :     bf1[19] = bf0[19] + bf0[18];
    2458           0 :     bf1[20] = bf0[20] + bf0[21];
    2459           0 :     bf1[21] = -bf0[21] + bf0[20];
    2460           0 :     bf1[22] = -bf0[22] + bf0[23];
    2461           0 :     bf1[23] = bf0[23] + bf0[22];
    2462           0 :     bf1[24] = bf0[24] + bf0[25];
    2463           0 :     bf1[25] = -bf0[25] + bf0[24];
    2464           0 :     bf1[26] = -bf0[26] + bf0[27];
    2465           0 :     bf1[27] = bf0[27] + bf0[26];
    2466           0 :     bf1[28] = bf0[28] + bf0[29];
    2467           0 :     bf1[29] = -bf0[29] + bf0[28];
    2468           0 :     bf1[30] = -bf0[30] + bf0[31];
    2469           0 :     bf1[31] = bf0[31] + bf0[30];
    2470           0 :     bf1[32] = bf0[32];
    2471           0 :     bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
    2472           0 :     bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
    2473           0 :     bf1[35] = bf0[35];
    2474           0 :     bf1[36] = bf0[36];
    2475           0 :     bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
    2476           0 :     bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
    2477           0 :     bf1[39] = bf0[39];
    2478           0 :     bf1[40] = bf0[40];
    2479           0 :     bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
    2480           0 :     bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
    2481           0 :     bf1[43] = bf0[43];
    2482           0 :     bf1[44] = bf0[44];
    2483           0 :     bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
    2484           0 :     bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
    2485           0 :     bf1[47] = bf0[47];
    2486           0 :     bf1[48] = bf0[48];
    2487           0 :     bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
    2488           0 :     bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
    2489           0 :     bf1[51] = bf0[51];
    2490           0 :     bf1[52] = bf0[52];
    2491           0 :     bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
    2492           0 :     bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
    2493           0 :     bf1[55] = bf0[55];
    2494           0 :     bf1[56] = bf0[56];
    2495           0 :     bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
    2496           0 :     bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
    2497           0 :     bf1[59] = bf0[59];
    2498           0 :     bf1[60] = bf0[60];
    2499           0 :     bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
    2500           0 :     bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
    2501           0 :     bf1[63] = bf0[63];
    2502           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2503             : 
    2504             :     // stage 9
    2505           0 :     stage++;
    2506           0 :     cospi = cospi_arr(cos_bit);
    2507           0 :     bf0 = step;
    2508           0 :     bf1 = output;
    2509           0 :     bf1[0] = bf0[0];
    2510           0 :     bf1[1] = bf0[1];
    2511           0 :     bf1[2] = bf0[2];
    2512           0 :     bf1[3] = bf0[3];
    2513           0 :     bf1[4] = bf0[4];
    2514           0 :     bf1[5] = bf0[5];
    2515           0 :     bf1[6] = bf0[6];
    2516           0 :     bf1[7] = bf0[7];
    2517           0 :     bf1[8] = bf0[8];
    2518           0 :     bf1[9] = bf0[9];
    2519           0 :     bf1[10] = bf0[10];
    2520           0 :     bf1[11] = bf0[11];
    2521           0 :     bf1[12] = bf0[12];
    2522           0 :     bf1[13] = bf0[13];
    2523           0 :     bf1[14] = bf0[14];
    2524           0 :     bf1[15] = bf0[15];
    2525           0 :     bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
    2526           0 :     bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
    2527           0 :     bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
    2528           0 :     bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
    2529           0 :     bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
    2530           0 :     bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
    2531           0 :     bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
    2532           0 :     bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
    2533           0 :     bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
    2534           0 :     bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
    2535           0 :     bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
    2536           0 :     bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
    2537           0 :     bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
    2538           0 :     bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
    2539           0 :     bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
    2540           0 :     bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
    2541           0 :     bf1[32] = bf0[32] + bf0[33];
    2542           0 :     bf1[33] = -bf0[33] + bf0[32];
    2543           0 :     bf1[34] = -bf0[34] + bf0[35];
    2544           0 :     bf1[35] = bf0[35] + bf0[34];
    2545           0 :     bf1[36] = bf0[36] + bf0[37];
    2546           0 :     bf1[37] = -bf0[37] + bf0[36];
    2547           0 :     bf1[38] = -bf0[38] + bf0[39];
    2548           0 :     bf1[39] = bf0[39] + bf0[38];
    2549           0 :     bf1[40] = bf0[40] + bf0[41];
    2550           0 :     bf1[41] = -bf0[41] + bf0[40];
    2551           0 :     bf1[42] = -bf0[42] + bf0[43];
    2552           0 :     bf1[43] = bf0[43] + bf0[42];
    2553           0 :     bf1[44] = bf0[44] + bf0[45];
    2554           0 :     bf1[45] = -bf0[45] + bf0[44];
    2555           0 :     bf1[46] = -bf0[46] + bf0[47];
    2556           0 :     bf1[47] = bf0[47] + bf0[46];
    2557           0 :     bf1[48] = bf0[48] + bf0[49];
    2558           0 :     bf1[49] = -bf0[49] + bf0[48];
    2559           0 :     bf1[50] = -bf0[50] + bf0[51];
    2560           0 :     bf1[51] = bf0[51] + bf0[50];
    2561           0 :     bf1[52] = bf0[52] + bf0[53];
    2562           0 :     bf1[53] = -bf0[53] + bf0[52];
    2563           0 :     bf1[54] = -bf0[54] + bf0[55];
    2564           0 :     bf1[55] = bf0[55] + bf0[54];
    2565           0 :     bf1[56] = bf0[56] + bf0[57];
    2566           0 :     bf1[57] = -bf0[57] + bf0[56];
    2567           0 :     bf1[58] = -bf0[58] + bf0[59];
    2568           0 :     bf1[59] = bf0[59] + bf0[58];
    2569           0 :     bf1[60] = bf0[60] + bf0[61];
    2570           0 :     bf1[61] = -bf0[61] + bf0[60];
    2571           0 :     bf1[62] = -bf0[62] + bf0[63];
    2572           0 :     bf1[63] = bf0[63] + bf0[62];
    2573           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2574             : 
    2575             :     // stage 10
    2576           0 :     stage++;
    2577           0 :     cospi = cospi_arr(cos_bit);
    2578           0 :     bf0 = output;
    2579           0 :     bf1 = step;
    2580           0 :     bf1[0] = bf0[0];
    2581           0 :     bf1[1] = bf0[1];
    2582           0 :     bf1[2] = bf0[2];
    2583           0 :     bf1[3] = bf0[3];
    2584           0 :     bf1[4] = bf0[4];
    2585           0 :     bf1[5] = bf0[5];
    2586           0 :     bf1[6] = bf0[6];
    2587           0 :     bf1[7] = bf0[7];
    2588           0 :     bf1[8] = bf0[8];
    2589           0 :     bf1[9] = bf0[9];
    2590           0 :     bf1[10] = bf0[10];
    2591           0 :     bf1[11] = bf0[11];
    2592           0 :     bf1[12] = bf0[12];
    2593           0 :     bf1[13] = bf0[13];
    2594           0 :     bf1[14] = bf0[14];
    2595           0 :     bf1[15] = bf0[15];
    2596           0 :     bf1[16] = bf0[16];
    2597           0 :     bf1[17] = bf0[17];
    2598           0 :     bf1[18] = bf0[18];
    2599           0 :     bf1[19] = bf0[19];
    2600           0 :     bf1[20] = bf0[20];
    2601           0 :     bf1[21] = bf0[21];
    2602           0 :     bf1[22] = bf0[22];
    2603           0 :     bf1[23] = bf0[23];
    2604           0 :     bf1[24] = bf0[24];
    2605           0 :     bf1[25] = bf0[25];
    2606           0 :     bf1[26] = bf0[26];
    2607           0 :     bf1[27] = bf0[27];
    2608           0 :     bf1[28] = bf0[28];
    2609           0 :     bf1[29] = bf0[29];
    2610           0 :     bf1[30] = bf0[30];
    2611           0 :     bf1[31] = bf0[31];
    2612           0 :     bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
    2613           0 :     bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
    2614           0 :     bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
    2615           0 :     bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
    2616           0 :     bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
    2617           0 :     bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
    2618           0 :     bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
    2619           0 :     bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
    2620           0 :     bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
    2621           0 :     bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
    2622           0 :     bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
    2623           0 :     bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
    2624           0 :     bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
    2625           0 :     bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
    2626           0 :     bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
    2627           0 :     bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
    2628           0 :     bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
    2629           0 :     bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
    2630           0 :     bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
    2631           0 :     bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
    2632           0 :     bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
    2633           0 :     bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
    2634           0 :     bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
    2635           0 :     bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
    2636           0 :     bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
    2637           0 :     bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
    2638           0 :     bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
    2639           0 :     bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
    2640           0 :     bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
    2641           0 :     bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
    2642           0 :     bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
    2643           0 :     bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
    2644           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2645             : 
    2646             :     // stage 11
    2647           0 :     stage++;
    2648           0 :     bf0 = step;
    2649           0 :     bf1 = output;
    2650           0 :     bf1[0] = bf0[0];
    2651           0 :     bf1[1] = bf0[32];
    2652           0 :     bf1[2] = bf0[16];
    2653           0 :     bf1[3] = bf0[48];
    2654           0 :     bf1[4] = bf0[8];
    2655           0 :     bf1[5] = bf0[40];
    2656           0 :     bf1[6] = bf0[24];
    2657           0 :     bf1[7] = bf0[56];
    2658           0 :     bf1[8] = bf0[4];
    2659           0 :     bf1[9] = bf0[36];
    2660           0 :     bf1[10] = bf0[20];
    2661           0 :     bf1[11] = bf0[52];
    2662           0 :     bf1[12] = bf0[12];
    2663           0 :     bf1[13] = bf0[44];
    2664           0 :     bf1[14] = bf0[28];
    2665           0 :     bf1[15] = bf0[60];
    2666           0 :     bf1[16] = bf0[2];
    2667           0 :     bf1[17] = bf0[34];
    2668           0 :     bf1[18] = bf0[18];
    2669           0 :     bf1[19] = bf0[50];
    2670           0 :     bf1[20] = bf0[10];
    2671           0 :     bf1[21] = bf0[42];
    2672           0 :     bf1[22] = bf0[26];
    2673           0 :     bf1[23] = bf0[58];
    2674           0 :     bf1[24] = bf0[6];
    2675           0 :     bf1[25] = bf0[38];
    2676           0 :     bf1[26] = bf0[22];
    2677           0 :     bf1[27] = bf0[54];
    2678           0 :     bf1[28] = bf0[14];
    2679           0 :     bf1[29] = bf0[46];
    2680           0 :     bf1[30] = bf0[30];
    2681           0 :     bf1[31] = bf0[62];
    2682           0 :     bf1[32] = bf0[1];
    2683           0 :     bf1[33] = bf0[33];
    2684           0 :     bf1[34] = bf0[17];
    2685           0 :     bf1[35] = bf0[49];
    2686           0 :     bf1[36] = bf0[9];
    2687           0 :     bf1[37] = bf0[41];
    2688           0 :     bf1[38] = bf0[25];
    2689           0 :     bf1[39] = bf0[57];
    2690           0 :     bf1[40] = bf0[5];
    2691           0 :     bf1[41] = bf0[37];
    2692           0 :     bf1[42] = bf0[21];
    2693           0 :     bf1[43] = bf0[53];
    2694           0 :     bf1[44] = bf0[13];
    2695           0 :     bf1[45] = bf0[45];
    2696           0 :     bf1[46] = bf0[29];
    2697           0 :     bf1[47] = bf0[61];
    2698           0 :     bf1[48] = bf0[3];
    2699           0 :     bf1[49] = bf0[35];
    2700           0 :     bf1[50] = bf0[19];
    2701           0 :     bf1[51] = bf0[51];
    2702           0 :     bf1[52] = bf0[11];
    2703           0 :     bf1[53] = bf0[43];
    2704           0 :     bf1[54] = bf0[27];
    2705           0 :     bf1[55] = bf0[59];
    2706           0 :     bf1[56] = bf0[7];
    2707           0 :     bf1[57] = bf0[39];
    2708           0 :     bf1[58] = bf0[23];
    2709           0 :     bf1[59] = bf0[55];
    2710           0 :     bf1[60] = bf0[15];
    2711           0 :     bf1[61] = bf0[47];
    2712           0 :     bf1[62] = bf0[31];
    2713           0 :     bf1[63] = bf0[63];
    2714           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2715           0 : }
    2716             : 
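eb_av1_fdct64_new above follows the same decimation-in-frequency layout as the smaller sizes: eleven stages that ping-pong between the caller's output buffer and the local step[64] scratch array, each stage being either an add/subtract butterfly pass or a set of half_btf() rotations, with range_check() validating every stage against the stage_range bit budget. The fully unrolled stage 11 is simply a 6-bit bit-reversal permutation; a compact equivalent, for illustration only (bitrev6 is a hypothetical helper, not part of this file):

    /* 6-bit bit reversal: 1 -> 32, 2 -> 16, 3 -> 48, ... exactly as spelled
     * out in stage 11 above. */
    static int bitrev6(int k) {
        int r = 0;
        for (int b = 0; b < 6; ++b)
            r |= ((k >> b) & 1) << (5 - b);
        return r;
    }

    /* Equivalent to the unrolled stage 11 copy. */
    for (int k = 0; k < 64; ++k)
        output[k] = step[bitrev6(k)];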
    2717           0 : void eb_av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    2718             :     const int8_t *stage_range) {
    2719           0 :     int32_t bit = cos_bit;
    2720           0 :     const int32_t *sinpi = sinpi_arr(bit);
    2721             :     int32_t x0, x1, x2, x3;
    2722             :     int32_t s0, s1, s2, s3, s4, s5, s6, s7;
    2723             : 
    2724             :     // stage 0
    2725             :     range_check(0, input, input, 4, stage_range[0]);
    2726           0 :     x0 = input[0];
    2727           0 :     x1 = input[1];
    2728           0 :     x2 = input[2];
    2729           0 :     x3 = input[3];
    2730             : 
    2731           0 :     if (!(x0 | x1 | x2 | x3)) {
    2732           0 :         output[0] = output[1] = output[2] = output[3] = 0;
    2733           0 :         return;
    2734             :     }
    2735             : 
    2736             :     //// stage 1
    2737             :     //s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]);
    2738             :     //s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]);
    2739             :     //s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]);
    2740             :     //s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]);
    2741             :     //s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]);
    2742             :     //s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]);
    2743             :     //s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]);
    2744             :     //s7 = range_check_value(x0 + x1, stage_range[1]);
    2745             : 
    2746             :     //// stage 2
    2747             :     //s7 = range_check_value(s7 - x3, stage_range[2]);
    2748             : 
    2749             :     //// stage 3
    2750             :     //x0 = range_check_value(s0 + s2, bit + stage_range[3]);
    2751             :     //x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]);
    2752             :     //x2 = range_check_value(s1 - s3, bit + stage_range[3]);
    2753             :     //x3 = range_check_value(s4, bit + stage_range[3]);
    2754             : 
    2755             :     //// stage 4
    2756             :     //x0 = range_check_value(x0 + s5, bit + stage_range[4]);
    2757             :     //x2 = range_check_value(x2 + s6, bit + stage_range[4]);
    2758             : 
    2759             :     //// stage 5
    2760             :     //s0 = range_check_value(x0 + x3, bit + stage_range[5]);
    2761             :     //s1 = range_check_value(x1, bit + stage_range[5]);
    2762             :     //s2 = range_check_value(x2 - x3, bit + stage_range[5]);
    2763             :     //s3 = range_check_value(x2 - x0, bit + stage_range[5]);
    2764             : 
    2765             :     //// stage 6
    2766             :     //s3 = range_check_value(s3 + x3, bit + stage_range[6]);
    2767             : 
    2768             :     // stage 1
    2769           0 :     s0 = sinpi[1] * x0;
    2770           0 :     s1 = sinpi[4] * x0;
    2771           0 :     s2 = sinpi[2] * x1;
    2772           0 :     s3 = sinpi[1] * x1;
    2773           0 :     s4 = sinpi[3] * x2;
    2774           0 :     s5 = sinpi[4] * x3;
    2775           0 :     s6 = sinpi[2] * x3;
    2776           0 :     s7 = x0 + x1;
    2777             : 
    2778             :     // stage 2
    2779           0 :     s7 = s7 - x3;
    2780             : 
    2781             :     // stage 3
    2782           0 :     x0 = s0 + s2;
    2783           0 :     x1 = sinpi[3] * s7;
    2784           0 :     x2 = s1 - s3;
    2785           0 :     x3 = s4;
    2786             : 
    2787             :     // stage 4
    2788           0 :     x0 = x0 + s5;
    2789           0 :     x2 = x2 + s6;
    2790             : 
    2791             :     // stage 5
    2792           0 :     s0 = x0 + x3;
    2793           0 :     s1 = x1;
    2794           0 :     s2 = x2 - x3;
    2795           0 :     s3 = x2 - x0;
    2796             : 
    2797             :     // stage 6
    2798           0 :     s3 = s3 + x3;
    2799             : 
    2800             :     // 1-D transform scaling factor is sqrt(2).
    2801           0 :     output[0] = round_shift(s0, bit);
    2802           0 :     output[1] = round_shift(s1, bit);
    2803           0 :     output[2] = round_shift(s2, bit);
    2804           0 :     output[3] = round_shift(s3, bit);
    2805           0 :     range_check(6, input, output, 4, stage_range[6]);
    2806             : }
    2807             : 
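Unlike the DCT paths, eb_av1_fadst4_new evaluates the 4-point forward ADST directly from the sinpi table (sinpi_arr(bit) holds fixed-point weights proportional to sin(k*pi/9), normalized for the sqrt(2) gain noted in the comment just before the round_shift calls above). Folding its six stages together gives the closed form sketched below; fadst4_reference is an illustrative, hypothetical helper (it reuses the round_shift sketched earlier), and the staged version above gets away with only eight multiplies because the table satisfies sinpi[1] + sinpi[2] == sinpi[4], mirroring sin(pi/9) + sin(2*pi/9) = sin(4*pi/9):

    static void fadst4_reference(const int32_t *in, int32_t *out,
                                 const int32_t *sinpi, int8_t bit) {
        const int64_t x0 = in[0], x1 = in[1], x2 = in[2], x3 = in[3];
        out[0] = round_shift(sinpi[1] * x0 + sinpi[2] * x1 +
                             sinpi[3] * x2 + sinpi[4] * x3, bit);
        out[1] = round_shift(sinpi[3] * (x0 + x1 - x3), bit);
        out[2] = round_shift(sinpi[4] * x0 - sinpi[1] * x1 -
                             sinpi[3] * x2 + sinpi[2] * x3, bit);
        out[3] = round_shift(sinpi[2] * x0 - sinpi[4] * x1 +
                             sinpi[3] * x2 - sinpi[1] * x3, bit);
    }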
    2808           0 : void eb_av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    2809             :     const int8_t *stage_range) {
    2810           0 :     const int32_t size = 8;
    2811             :     const int32_t *cospi;
    2812             : 
    2813           0 :     int32_t stage = 0;
    2814             :     int32_t *bf0, *bf1;
    2815             :     int32_t step[8];
    2816             : 
    2817             :     // stage 0;
    2818           0 :     range_check(stage, input, input, size, stage_range[stage]);
    2819             : 
    2820             :     // stage 1;
    2821           0 :     stage++;
    2822           0 :     assert(output != input);
    2823           0 :     bf1 = output;
    2824           0 :     bf1[0] = input[0];
    2825           0 :     bf1[1] = -input[7];
    2826           0 :     bf1[2] = -input[3];
    2827           0 :     bf1[3] = input[4];
    2828           0 :     bf1[4] = -input[1];
    2829           0 :     bf1[5] = input[6];
    2830           0 :     bf1[6] = input[2];
    2831           0 :     bf1[7] = -input[5];
    2832           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2833             : 
    2834             :     // stage 2
    2835           0 :     stage++;
    2836           0 :     cospi = cospi_arr(cos_bit);
    2837           0 :     bf0 = output;
    2838           0 :     bf1 = step;
    2839           0 :     bf1[0] = bf0[0];
    2840           0 :     bf1[1] = bf0[1];
    2841           0 :     bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    2842           0 :     bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    2843           0 :     bf1[4] = bf0[4];
    2844           0 :     bf1[5] = bf0[5];
    2845           0 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    2846           0 :     bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    2847           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2848             : 
    2849             :     // stage 3
    2850           0 :     stage++;
    2851           0 :     bf0 = step;
    2852           0 :     bf1 = output;
    2853           0 :     bf1[0] = bf0[0] + bf0[2];
    2854           0 :     bf1[1] = bf0[1] + bf0[3];
    2855           0 :     bf1[2] = bf0[0] - bf0[2];
    2856           0 :     bf1[3] = bf0[1] - bf0[3];
    2857           0 :     bf1[4] = bf0[4] + bf0[6];
    2858           0 :     bf1[5] = bf0[5] + bf0[7];
    2859           0 :     bf1[6] = bf0[4] - bf0[6];
    2860           0 :     bf1[7] = bf0[5] - bf0[7];
    2861           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2862             : 
    2863             :     // stage 4
    2864           0 :     stage++;
    2865           0 :     cospi = cospi_arr(cos_bit);
    2866           0 :     bf0 = output;
    2867           0 :     bf1 = step;
    2868           0 :     bf1[0] = bf0[0];
    2869           0 :     bf1[1] = bf0[1];
    2870           0 :     bf1[2] = bf0[2];
    2871           0 :     bf1[3] = bf0[3];
    2872           0 :     bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    2873           0 :     bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    2874           0 :     bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    2875           0 :     bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    2876           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2877             : 
    2878             :     // stage 5
    2879           0 :     stage++;
    2880           0 :     bf0 = step;
    2881           0 :     bf1 = output;
    2882           0 :     bf1[0] = bf0[0] + bf0[4];
    2883           0 :     bf1[1] = bf0[1] + bf0[5];
    2884           0 :     bf1[2] = bf0[2] + bf0[6];
    2885           0 :     bf1[3] = bf0[3] + bf0[7];
    2886           0 :     bf1[4] = bf0[0] - bf0[4];
    2887           0 :     bf1[5] = bf0[1] - bf0[5];
    2888           0 :     bf1[6] = bf0[2] - bf0[6];
    2889           0 :     bf1[7] = bf0[3] - bf0[7];
    2890           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2891             : 
    2892             :     // stage 6
    2893           0 :     stage++;
    2894           0 :     cospi = cospi_arr(cos_bit);
    2895           0 :     bf0 = output;
    2896           0 :     bf1 = step;
    2897           0 :     bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
    2898           0 :     bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
    2899           0 :     bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
    2900           0 :     bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
    2901           0 :     bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
    2902           0 :     bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
    2903           0 :     bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
    2904           0 :     bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
    2905           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2906             : 
    2907             :     // stage 7
    2908           0 :     stage++;
    2909           0 :     bf0 = step;
    2910           0 :     bf1 = output;
    2911           0 :     bf1[0] = bf0[1];
    2912           0 :     bf1[1] = bf0[6];
    2913           0 :     bf1[2] = bf0[3];
    2914           0 :     bf1[3] = bf0[4];
    2915           0 :     bf1[4] = bf0[5];
    2916           0 :     bf1[5] = bf0[2];
    2917           0 :     bf1[6] = bf0[7];
    2918           0 :     bf1[7] = bf0[0];
    2919           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2920           0 : }
    2921             : 
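The stages above alternate plain add/subtract butterflies with half_btf rotations. half_btf, round_shift and the cospi table are defined elsewhere in the codebase and are not part of this excerpt; below is a minimal sketch of their assumed behaviour (the usual libaom convention: cospi[i] holds cos(i*pi/128) scaled by 2^cos_bit, and half_btf is a rounded fixed-point two-term dot product). The _sk names are hypothetical and exist only for illustration.

    #include <stdint.h>

    /* Rounded arithmetic right shift: (value + 2^(bit-1)) >> bit. */
    static int32_t round_shift_sk(int64_t value, int32_t bit) {
        return (int32_t)((value + ((int64_t)1 << (bit - 1))) >> bit);
    }

    /* One output of a 2-point butterfly rotation in cos_bit fixed point. */
    static int32_t half_btf_sk(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
                               int32_t bit) {
        return round_shift_sk((int64_t)w0 * in0 + (int64_t)w1 * in1, bit);
    }
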
    2922           0 : void eb_av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    2923             :     const int8_t *stage_range) {
    2924           0 :     const int32_t size = 16;
    2925             :     const int32_t *cospi;
    2926             : 
    2927           0 :     int32_t stage = 0;
    2928             :     int32_t *bf0, *bf1;
    2929             :     int32_t step[16];
    2930             : 
    2931             :     // stage 0;
    2932           0 :     range_check(stage, input, input, size, stage_range[stage]);
    2933             : 
    2934             :     // stage 1;
    2935           0 :     stage++;
    2936           0 :     assert(output != input);
    2937           0 :     bf1 = output;
    2938           0 :     bf1[0] = input[0];
    2939           0 :     bf1[1] = -input[15];
    2940           0 :     bf1[2] = -input[7];
    2941           0 :     bf1[3] = input[8];
    2942           0 :     bf1[4] = -input[3];
    2943           0 :     bf1[5] = input[12];
    2944           0 :     bf1[6] = input[4];
    2945           0 :     bf1[7] = -input[11];
    2946           0 :     bf1[8] = -input[1];
    2947           0 :     bf1[9] = input[14];
    2948           0 :     bf1[10] = input[6];
    2949           0 :     bf1[11] = -input[9];
    2950           0 :     bf1[12] = input[2];
    2951           0 :     bf1[13] = -input[13];
    2952           0 :     bf1[14] = -input[5];
    2953           0 :     bf1[15] = input[10];
    2954           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2955             : 
    2956             :     // stage 2
    2957           0 :     stage++;
    2958           0 :     cospi = cospi_arr(cos_bit);
    2959           0 :     bf0 = output;
    2960           0 :     bf1 = step;
    2961           0 :     bf1[0] = bf0[0];
    2962           0 :     bf1[1] = bf0[1];
    2963           0 :     bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    2964           0 :     bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    2965           0 :     bf1[4] = bf0[4];
    2966           0 :     bf1[5] = bf0[5];
    2967           0 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    2968           0 :     bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    2969           0 :     bf1[8] = bf0[8];
    2970           0 :     bf1[9] = bf0[9];
    2971           0 :     bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
    2972           0 :     bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
    2973           0 :     bf1[12] = bf0[12];
    2974           0 :     bf1[13] = bf0[13];
    2975           0 :     bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
    2976           0 :     bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
    2977           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    2978             : 
    2979             :     // stage 3
    2980           0 :     stage++;
    2981           0 :     bf0 = step;
    2982           0 :     bf1 = output;
    2983           0 :     bf1[0] = bf0[0] + bf0[2];
    2984           0 :     bf1[1] = bf0[1] + bf0[3];
    2985           0 :     bf1[2] = bf0[0] - bf0[2];
    2986           0 :     bf1[3] = bf0[1] - bf0[3];
    2987           0 :     bf1[4] = bf0[4] + bf0[6];
    2988           0 :     bf1[5] = bf0[5] + bf0[7];
    2989           0 :     bf1[6] = bf0[4] - bf0[6];
    2990           0 :     bf1[7] = bf0[5] - bf0[7];
    2991           0 :     bf1[8] = bf0[8] + bf0[10];
    2992           0 :     bf1[9] = bf0[9] + bf0[11];
    2993           0 :     bf1[10] = bf0[8] - bf0[10];
    2994           0 :     bf1[11] = bf0[9] - bf0[11];
    2995           0 :     bf1[12] = bf0[12] + bf0[14];
    2996           0 :     bf1[13] = bf0[13] + bf0[15];
    2997           0 :     bf1[14] = bf0[12] - bf0[14];
    2998           0 :     bf1[15] = bf0[13] - bf0[15];
    2999           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3000             : 
    3001             :     // stage 4
    3002           0 :     stage++;
    3003           0 :     cospi = cospi_arr(cos_bit);
    3004           0 :     bf0 = output;
    3005           0 :     bf1 = step;
    3006           0 :     bf1[0] = bf0[0];
    3007           0 :     bf1[1] = bf0[1];
    3008           0 :     bf1[2] = bf0[2];
    3009           0 :     bf1[3] = bf0[3];
    3010           0 :     bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    3011           0 :     bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    3012           0 :     bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    3013           0 :     bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    3014           0 :     bf1[8] = bf0[8];
    3015           0 :     bf1[9] = bf0[9];
    3016           0 :     bf1[10] = bf0[10];
    3017           0 :     bf1[11] = bf0[11];
    3018           0 :     bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
    3019           0 :     bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
    3020           0 :     bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
    3021           0 :     bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
    3022           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3023             : 
    3024             :     // stage 5
    3025           0 :     stage++;
    3026           0 :     bf0 = step;
    3027           0 :     bf1 = output;
    3028           0 :     bf1[0] = bf0[0] + bf0[4];
    3029           0 :     bf1[1] = bf0[1] + bf0[5];
    3030           0 :     bf1[2] = bf0[2] + bf0[6];
    3031           0 :     bf1[3] = bf0[3] + bf0[7];
    3032           0 :     bf1[4] = bf0[0] - bf0[4];
    3033           0 :     bf1[5] = bf0[1] - bf0[5];
    3034           0 :     bf1[6] = bf0[2] - bf0[6];
    3035           0 :     bf1[7] = bf0[3] - bf0[7];
    3036           0 :     bf1[8] = bf0[8] + bf0[12];
    3037           0 :     bf1[9] = bf0[9] + bf0[13];
    3038           0 :     bf1[10] = bf0[10] + bf0[14];
    3039           0 :     bf1[11] = bf0[11] + bf0[15];
    3040           0 :     bf1[12] = bf0[8] - bf0[12];
    3041           0 :     bf1[13] = bf0[9] - bf0[13];
    3042           0 :     bf1[14] = bf0[10] - bf0[14];
    3043           0 :     bf1[15] = bf0[11] - bf0[15];
    3044           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3045             : 
    3046             :     // stage 6
    3047           0 :     stage++;
    3048           0 :     cospi = cospi_arr(cos_bit);
    3049           0 :     bf0 = output;
    3050           0 :     bf1 = step;
    3051           0 :     bf1[0] = bf0[0];
    3052           0 :     bf1[1] = bf0[1];
    3053           0 :     bf1[2] = bf0[2];
    3054           0 :     bf1[3] = bf0[3];
    3055           0 :     bf1[4] = bf0[4];
    3056           0 :     bf1[5] = bf0[5];
    3057           0 :     bf1[6] = bf0[6];
    3058           0 :     bf1[7] = bf0[7];
    3059           0 :     bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
    3060           0 :     bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
    3061           0 :     bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
    3062           0 :     bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
    3063           0 :     bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
    3064           0 :     bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
    3065           0 :     bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
    3066           0 :     bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
    3067           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3068             : 
    3069             :     // stage 7
    3070           0 :     stage++;
    3071           0 :     bf0 = step;
    3072           0 :     bf1 = output;
    3073           0 :     bf1[0] = bf0[0] + bf0[8];
    3074           0 :     bf1[1] = bf0[1] + bf0[9];
    3075           0 :     bf1[2] = bf0[2] + bf0[10];
    3076           0 :     bf1[3] = bf0[3] + bf0[11];
    3077           0 :     bf1[4] = bf0[4] + bf0[12];
    3078           0 :     bf1[5] = bf0[5] + bf0[13];
    3079           0 :     bf1[6] = bf0[6] + bf0[14];
    3080           0 :     bf1[7] = bf0[7] + bf0[15];
    3081           0 :     bf1[8] = bf0[0] - bf0[8];
    3082           0 :     bf1[9] = bf0[1] - bf0[9];
    3083           0 :     bf1[10] = bf0[2] - bf0[10];
    3084           0 :     bf1[11] = bf0[3] - bf0[11];
    3085           0 :     bf1[12] = bf0[4] - bf0[12];
    3086           0 :     bf1[13] = bf0[5] - bf0[13];
    3087           0 :     bf1[14] = bf0[6] - bf0[14];
    3088           0 :     bf1[15] = bf0[7] - bf0[15];
    3089           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3090             : 
    3091             :     // stage 8
    3092           0 :     stage++;
    3093           0 :     cospi = cospi_arr(cos_bit);
    3094           0 :     bf0 = output;
    3095           0 :     bf1 = step;
    3096           0 :     bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
    3097           0 :     bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
    3098           0 :     bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
    3099           0 :     bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
    3100           0 :     bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
    3101           0 :     bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
    3102           0 :     bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
    3103           0 :     bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
    3104           0 :     bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
    3105           0 :     bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
    3106           0 :     bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
    3107           0 :     bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
    3108           0 :     bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
    3109           0 :     bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
    3110           0 :     bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
    3111           0 :     bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
    3112           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3113             : 
    3114             :     // stage 9
    3115           0 :     stage++;
    3116           0 :     bf0 = step;
    3117           0 :     bf1 = output;
    3118           0 :     bf1[0] = bf0[1];
    3119           0 :     bf1[1] = bf0[14];
    3120           0 :     bf1[2] = bf0[3];
    3121           0 :     bf1[3] = bf0[12];
    3122           0 :     bf1[4] = bf0[5];
    3123           0 :     bf1[5] = bf0[10];
    3124           0 :     bf1[6] = bf0[7];
    3125           0 :     bf1[7] = bf0[8];
    3126           0 :     bf1[8] = bf0[9];
    3127           0 :     bf1[9] = bf0[6];
    3128           0 :     bf1[10] = bf0[11];
    3129           0 :     bf1[11] = bf0[4];
    3130           0 :     bf1[12] = bf0[13];
    3131           0 :     bf1[13] = bf0[2];
    3132           0 :     bf1[14] = bf0[15];
    3133           0 :     bf1[15] = bf0[0];
    3134           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3135           0 : }
    3136             : 
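For intuition on the (cospi[32], cospi[32]) pairs used in stage 2 of these ADST kernels: assuming cos_bit = 12 and the usual table value cospi[32] = 2896 (about 4096/sqrt(2); the cospi table itself is not shown in this excerpt), the pair computes an orthonormal sum/difference. A small worked example:

    /* Assuming cos_bit = 12 and cospi[32] = 2896 (~= 4096/sqrt(2)):
     * half_btf(cospi[32], a, cospi[32], b, 12) ~= (a + b)/sqrt(2).
     * For a = 100, b = 60:
     *   (2896*100 + 2896*60 + 2048) >> 12 = 465408 >> 12 = 113,
     * while (100 + 60)/sqrt(2) ~= 113.1. */
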
    3137           0 : void av1_fadst32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    3138             :     const int8_t *stage_range) {
    3139           0 :     const int32_t size = 32;
    3140             :     const int32_t *cospi;
    3141             : 
    3142           0 :     int32_t stage = 0;
    3143             :     int32_t *bf0, *bf1;
    3144             :     int32_t step[32];
    3145             : 
    3146             :     // stage 0;
    3147           0 :     range_check(stage, input, input, size, stage_range[stage]);
    3148             : 
    3149             :     // stage 1;
    3150           0 :     stage++;
    3151           0 :     bf1 = output;
    3152           0 :     bf1[0] = input[31];
    3153           0 :     bf1[1] = input[0];
    3154           0 :     bf1[2] = input[29];
    3155           0 :     bf1[3] = input[2];
    3156           0 :     bf1[4] = input[27];
    3157           0 :     bf1[5] = input[4];
    3158           0 :     bf1[6] = input[25];
    3159           0 :     bf1[7] = input[6];
    3160           0 :     bf1[8] = input[23];
    3161           0 :     bf1[9] = input[8];
    3162           0 :     bf1[10] = input[21];
    3163           0 :     bf1[11] = input[10];
    3164           0 :     bf1[12] = input[19];
    3165           0 :     bf1[13] = input[12];
    3166           0 :     bf1[14] = input[17];
    3167           0 :     bf1[15] = input[14];
    3168           0 :     bf1[16] = input[15];
    3169           0 :     bf1[17] = input[16];
    3170           0 :     bf1[18] = input[13];
    3171           0 :     bf1[19] = input[18];
    3172           0 :     bf1[20] = input[11];
    3173           0 :     bf1[21] = input[20];
    3174           0 :     bf1[22] = input[9];
    3175           0 :     bf1[23] = input[22];
    3176           0 :     bf1[24] = input[7];
    3177           0 :     bf1[25] = input[24];
    3178           0 :     bf1[26] = input[5];
    3179           0 :     bf1[27] = input[26];
    3180           0 :     bf1[28] = input[3];
    3181           0 :     bf1[29] = input[28];
    3182           0 :     bf1[30] = input[1];
    3183           0 :     bf1[31] = input[30];
    3184           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3185             : 
    3186             :     // stage 2
    3187           0 :     stage++;
    3188           0 :     cospi = cospi_arr(cos_bit);
    3189           0 :     bf0 = output;
    3190           0 :     bf1 = step;
    3191           0 :     bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit);
    3192           0 :     bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit);
    3193           0 :     bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit);
    3194           0 :     bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit);
    3195           0 :     bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit);
    3196           0 :     bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit);
    3197           0 :     bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit);
    3198           0 :     bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit);
    3199           0 :     bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit);
    3200           0 :     bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit);
    3201           0 :     bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit);
    3202           0 :     bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit);
    3203           0 :     bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit);
    3204           0 :     bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit);
    3205           0 :     bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit);
    3206           0 :     bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit);
    3207           0 :     bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit);
    3208           0 :     bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit);
    3209           0 :     bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit);
    3210           0 :     bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit);
    3211           0 :     bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit);
    3212           0 :     bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit);
    3213           0 :     bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit);
    3214           0 :     bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit);
    3215           0 :     bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit);
    3216           0 :     bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit);
    3217           0 :     bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit);
    3218           0 :     bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit);
    3219           0 :     bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit);
    3220           0 :     bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit);
    3221           0 :     bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit);
    3222           0 :     bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit);
    3223           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3224             : 
    3225             :     // stage 3
    3226           0 :     stage++;
    3227           0 :     bf0 = step;
    3228           0 :     bf1 = output;
    3229           0 :     bf1[0] = bf0[0] + bf0[16];
    3230           0 :     bf1[1] = bf0[1] + bf0[17];
    3231           0 :     bf1[2] = bf0[2] + bf0[18];
    3232           0 :     bf1[3] = bf0[3] + bf0[19];
    3233           0 :     bf1[4] = bf0[4] + bf0[20];
    3234           0 :     bf1[5] = bf0[5] + bf0[21];
    3235           0 :     bf1[6] = bf0[6] + bf0[22];
    3236           0 :     bf1[7] = bf0[7] + bf0[23];
    3237           0 :     bf1[8] = bf0[8] + bf0[24];
    3238           0 :     bf1[9] = bf0[9] + bf0[25];
    3239           0 :     bf1[10] = bf0[10] + bf0[26];
    3240           0 :     bf1[11] = bf0[11] + bf0[27];
    3241           0 :     bf1[12] = bf0[12] + bf0[28];
    3242           0 :     bf1[13] = bf0[13] + bf0[29];
    3243           0 :     bf1[14] = bf0[14] + bf0[30];
    3244           0 :     bf1[15] = bf0[15] + bf0[31];
    3245           0 :     bf1[16] = -bf0[16] + bf0[0];
    3246           0 :     bf1[17] = -bf0[17] + bf0[1];
    3247           0 :     bf1[18] = -bf0[18] + bf0[2];
    3248           0 :     bf1[19] = -bf0[19] + bf0[3];
    3249           0 :     bf1[20] = -bf0[20] + bf0[4];
    3250           0 :     bf1[21] = -bf0[21] + bf0[5];
    3251           0 :     bf1[22] = -bf0[22] + bf0[6];
    3252           0 :     bf1[23] = -bf0[23] + bf0[7];
    3253           0 :     bf1[24] = -bf0[24] + bf0[8];
    3254           0 :     bf1[25] = -bf0[25] + bf0[9];
    3255           0 :     bf1[26] = -bf0[26] + bf0[10];
    3256           0 :     bf1[27] = -bf0[27] + bf0[11];
    3257           0 :     bf1[28] = -bf0[28] + bf0[12];
    3258           0 :     bf1[29] = -bf0[29] + bf0[13];
    3259           0 :     bf1[30] = -bf0[30] + bf0[14];
    3260           0 :     bf1[31] = -bf0[31] + bf0[15];
    3261           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3262             : 
    3263             :     // stage 4
    3264           0 :     stage++;
    3265           0 :     cospi = cospi_arr(cos_bit);
    3266           0 :     bf0 = output;
    3267           0 :     bf1 = step;
    3268           0 :     bf1[0] = bf0[0];
    3269           0 :     bf1[1] = bf0[1];
    3270           0 :     bf1[2] = bf0[2];
    3271           0 :     bf1[3] = bf0[3];
    3272           0 :     bf1[4] = bf0[4];
    3273           0 :     bf1[5] = bf0[5];
    3274           0 :     bf1[6] = bf0[6];
    3275           0 :     bf1[7] = bf0[7];
    3276           0 :     bf1[8] = bf0[8];
    3277           0 :     bf1[9] = bf0[9];
    3278           0 :     bf1[10] = bf0[10];
    3279           0 :     bf1[11] = bf0[11];
    3280           0 :     bf1[12] = bf0[12];
    3281           0 :     bf1[13] = bf0[13];
    3282           0 :     bf1[14] = bf0[14];
    3283           0 :     bf1[15] = bf0[15];
    3284           0 :     bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit);
    3285           0 :     bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit);
    3286           0 :     bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit);
    3287           0 :     bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit);
    3288           0 :     bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit);
    3289           0 :     bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit);
    3290           0 :     bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit);
    3291           0 :     bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit);
    3292           0 :     bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit);
    3293           0 :     bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit);
    3294           0 :     bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit);
    3295           0 :     bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit);
    3296           0 :     bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit);
    3297           0 :     bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit);
    3298           0 :     bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit);
    3299           0 :     bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit);
    3300           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3301             : 
    3302             :     // stage 5
    3303           0 :     stage++;
    3304           0 :     bf0 = step;
    3305           0 :     bf1 = output;
    3306           0 :     bf1[0] = bf0[0] + bf0[8];
    3307           0 :     bf1[1] = bf0[1] + bf0[9];
    3308           0 :     bf1[2] = bf0[2] + bf0[10];
    3309           0 :     bf1[3] = bf0[3] + bf0[11];
    3310           0 :     bf1[4] = bf0[4] + bf0[12];
    3311           0 :     bf1[5] = bf0[5] + bf0[13];
    3312           0 :     bf1[6] = bf0[6] + bf0[14];
    3313           0 :     bf1[7] = bf0[7] + bf0[15];
    3314           0 :     bf1[8] = -bf0[8] + bf0[0];
    3315           0 :     bf1[9] = -bf0[9] + bf0[1];
    3316           0 :     bf1[10] = -bf0[10] + bf0[2];
    3317           0 :     bf1[11] = -bf0[11] + bf0[3];
    3318           0 :     bf1[12] = -bf0[12] + bf0[4];
    3319           0 :     bf1[13] = -bf0[13] + bf0[5];
    3320           0 :     bf1[14] = -bf0[14] + bf0[6];
    3321           0 :     bf1[15] = -bf0[15] + bf0[7];
    3322           0 :     bf1[16] = bf0[16] + bf0[24];
    3323           0 :     bf1[17] = bf0[17] + bf0[25];
    3324           0 :     bf1[18] = bf0[18] + bf0[26];
    3325           0 :     bf1[19] = bf0[19] + bf0[27];
    3326           0 :     bf1[20] = bf0[20] + bf0[28];
    3327           0 :     bf1[21] = bf0[21] + bf0[29];
    3328           0 :     bf1[22] = bf0[22] + bf0[30];
    3329           0 :     bf1[23] = bf0[23] + bf0[31];
    3330           0 :     bf1[24] = -bf0[24] + bf0[16];
    3331           0 :     bf1[25] = -bf0[25] + bf0[17];
    3332           0 :     bf1[26] = -bf0[26] + bf0[18];
    3333           0 :     bf1[27] = -bf0[27] + bf0[19];
    3334           0 :     bf1[28] = -bf0[28] + bf0[20];
    3335           0 :     bf1[29] = -bf0[29] + bf0[21];
    3336           0 :     bf1[30] = -bf0[30] + bf0[22];
    3337           0 :     bf1[31] = -bf0[31] + bf0[23];
    3338           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3339             : 
    3340             :     // stage 6
    3341           0 :     stage++;
    3342           0 :     cospi = cospi_arr(cos_bit);
    3343           0 :     bf0 = output;
    3344           0 :     bf1 = step;
    3345           0 :     bf1[0] = bf0[0];
    3346           0 :     bf1[1] = bf0[1];
    3347           0 :     bf1[2] = bf0[2];
    3348           0 :     bf1[3] = bf0[3];
    3349           0 :     bf1[4] = bf0[4];
    3350           0 :     bf1[5] = bf0[5];
    3351           0 :     bf1[6] = bf0[6];
    3352           0 :     bf1[7] = bf0[7];
    3353           0 :     bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
    3354           0 :     bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit);
    3355           0 :     bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
    3356           0 :     bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit);
    3357           0 :     bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
    3358           0 :     bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit);
    3359           0 :     bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
    3360           0 :     bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit);
    3361           0 :     bf1[16] = bf0[16];
    3362           0 :     bf1[17] = bf0[17];
    3363           0 :     bf1[18] = bf0[18];
    3364           0 :     bf1[19] = bf0[19];
    3365           0 :     bf1[20] = bf0[20];
    3366           0 :     bf1[21] = bf0[21];
    3367           0 :     bf1[22] = bf0[22];
    3368           0 :     bf1[23] = bf0[23];
    3369           0 :     bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit);
    3370           0 :     bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit);
    3371           0 :     bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit);
    3372           0 :     bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit);
    3373           0 :     bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit);
    3374           0 :     bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit);
    3375           0 :     bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit);
    3376           0 :     bf1[31] = half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit);
    3377           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3378             : 
    3379             :     // stage 7
    3380           0 :     stage++;
    3381           0 :     bf0 = step;
    3382           0 :     bf1 = output;
    3383           0 :     bf1[0] = bf0[0] + bf0[4];
    3384           0 :     bf1[1] = bf0[1] + bf0[5];
    3385           0 :     bf1[2] = bf0[2] + bf0[6];
    3386           0 :     bf1[3] = bf0[3] + bf0[7];
    3387           0 :     bf1[4] = -bf0[4] + bf0[0];
    3388           0 :     bf1[5] = -bf0[5] + bf0[1];
    3389           0 :     bf1[6] = -bf0[6] + bf0[2];
    3390           0 :     bf1[7] = -bf0[7] + bf0[3];
    3391           0 :     bf1[8] = bf0[8] + bf0[12];
    3392           0 :     bf1[9] = bf0[9] + bf0[13];
    3393           0 :     bf1[10] = bf0[10] + bf0[14];
    3394           0 :     bf1[11] = bf0[11] + bf0[15];
    3395           0 :     bf1[12] = -bf0[12] + bf0[8];
    3396           0 :     bf1[13] = -bf0[13] + bf0[9];
    3397           0 :     bf1[14] = -bf0[14] + bf0[10];
    3398           0 :     bf1[15] = -bf0[15] + bf0[11];
    3399           0 :     bf1[16] = bf0[16] + bf0[20];
    3400           0 :     bf1[17] = bf0[17] + bf0[21];
    3401           0 :     bf1[18] = bf0[18] + bf0[22];
    3402           0 :     bf1[19] = bf0[19] + bf0[23];
    3403           0 :     bf1[20] = -bf0[20] + bf0[16];
    3404           0 :     bf1[21] = -bf0[21] + bf0[17];
    3405           0 :     bf1[22] = -bf0[22] + bf0[18];
    3406           0 :     bf1[23] = -bf0[23] + bf0[19];
    3407           0 :     bf1[24] = bf0[24] + bf0[28];
    3408           0 :     bf1[25] = bf0[25] + bf0[29];
    3409           0 :     bf1[26] = bf0[26] + bf0[30];
    3410           0 :     bf1[27] = bf0[27] + bf0[31];
    3411           0 :     bf1[28] = -bf0[28] + bf0[24];
    3412           0 :     bf1[29] = -bf0[29] + bf0[25];
    3413           0 :     bf1[30] = -bf0[30] + bf0[26];
    3414           0 :     bf1[31] = -bf0[31] + bf0[27];
    3415           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3416             : 
    3417             :     // stage 8
    3418           0 :     stage++;
    3419           0 :     cospi = cospi_arr(cos_bit);
    3420           0 :     bf0 = output;
    3421           0 :     bf1 = step;
    3422           0 :     bf1[0] = bf0[0];
    3423           0 :     bf1[1] = bf0[1];
    3424           0 :     bf1[2] = bf0[2];
    3425           0 :     bf1[3] = bf0[3];
    3426           0 :     bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    3427           0 :     bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit);
    3428           0 :     bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    3429           0 :     bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit);
    3430           0 :     bf1[8] = bf0[8];
    3431           0 :     bf1[9] = bf0[9];
    3432           0 :     bf1[10] = bf0[10];
    3433           0 :     bf1[11] = bf0[11];
    3434           0 :     bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
    3435           0 :     bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit);
    3436           0 :     bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
    3437           0 :     bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit);
    3438           0 :     bf1[16] = bf0[16];
    3439           0 :     bf1[17] = bf0[17];
    3440           0 :     bf1[18] = bf0[18];
    3441           0 :     bf1[19] = bf0[19];
    3442           0 :     bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit);
    3443           0 :     bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit);
    3444           0 :     bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit);
    3445           0 :     bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit);
    3446           0 :     bf1[24] = bf0[24];
    3447           0 :     bf1[25] = bf0[25];
    3448           0 :     bf1[26] = bf0[26];
    3449           0 :     bf1[27] = bf0[27];
    3450           0 :     bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit);
    3451           0 :     bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit);
    3452           0 :     bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit);
    3453           0 :     bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit);
    3454           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3455             : 
    3456             :     // stage 9
    3457           0 :     stage++;
    3458           0 :     bf0 = step;
    3459           0 :     bf1 = output;
    3460           0 :     bf1[0] = bf0[0] + bf0[2];
    3461           0 :     bf1[1] = bf0[1] + bf0[3];
    3462           0 :     bf1[2] = -bf0[2] + bf0[0];
    3463           0 :     bf1[3] = -bf0[3] + bf0[1];
    3464           0 :     bf1[4] = bf0[4] + bf0[6];
    3465           0 :     bf1[5] = bf0[5] + bf0[7];
    3466           0 :     bf1[6] = -bf0[6] + bf0[4];
    3467           0 :     bf1[7] = -bf0[7] + bf0[5];
    3468           0 :     bf1[8] = bf0[8] + bf0[10];
    3469           0 :     bf1[9] = bf0[9] + bf0[11];
    3470           0 :     bf1[10] = -bf0[10] + bf0[8];
    3471           0 :     bf1[11] = -bf0[11] + bf0[9];
    3472           0 :     bf1[12] = bf0[12] + bf0[14];
    3473           0 :     bf1[13] = bf0[13] + bf0[15];
    3474           0 :     bf1[14] = -bf0[14] + bf0[12];
    3475           0 :     bf1[15] = -bf0[15] + bf0[13];
    3476           0 :     bf1[16] = bf0[16] + bf0[18];
    3477           0 :     bf1[17] = bf0[17] + bf0[19];
    3478           0 :     bf1[18] = -bf0[18] + bf0[16];
    3479           0 :     bf1[19] = -bf0[19] + bf0[17];
    3480           0 :     bf1[20] = bf0[20] + bf0[22];
    3481           0 :     bf1[21] = bf0[21] + bf0[23];
    3482           0 :     bf1[22] = -bf0[22] + bf0[20];
    3483           0 :     bf1[23] = -bf0[23] + bf0[21];
    3484           0 :     bf1[24] = bf0[24] + bf0[26];
    3485           0 :     bf1[25] = bf0[25] + bf0[27];
    3486           0 :     bf1[26] = -bf0[26] + bf0[24];
    3487           0 :     bf1[27] = -bf0[27] + bf0[25];
    3488           0 :     bf1[28] = bf0[28] + bf0[30];
    3489           0 :     bf1[29] = bf0[29] + bf0[31];
    3490           0 :     bf1[30] = -bf0[30] + bf0[28];
    3491           0 :     bf1[31] = -bf0[31] + bf0[29];
    3492           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3493             : 
    3494             :     // stage 10
    3495           0 :     stage++;
    3496           0 :     cospi = cospi_arr(cos_bit);
    3497           0 :     bf0 = output;
    3498           0 :     bf1 = step;
    3499           0 :     bf1[0] = bf0[0];
    3500           0 :     bf1[1] = bf0[1];
    3501           0 :     bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    3502           0 :     bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit);
    3503           0 :     bf1[4] = bf0[4];
    3504           0 :     bf1[5] = bf0[5];
    3505           0 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    3506           0 :     bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit);
    3507           0 :     bf1[8] = bf0[8];
    3508           0 :     bf1[9] = bf0[9];
    3509           0 :     bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
    3510           0 :     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit);
    3511           0 :     bf1[12] = bf0[12];
    3512           0 :     bf1[13] = bf0[13];
    3513           0 :     bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
    3514           0 :     bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit);
    3515           0 :     bf1[16] = bf0[16];
    3516           0 :     bf1[17] = bf0[17];
    3517           0 :     bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit);
    3518           0 :     bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit);
    3519           0 :     bf1[20] = bf0[20];
    3520           0 :     bf1[21] = bf0[21];
    3521           0 :     bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit);
    3522           0 :     bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit);
    3523           0 :     bf1[24] = bf0[24];
    3524           0 :     bf1[25] = bf0[25];
    3525           0 :     bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit);
    3526           0 :     bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit);
    3527           0 :     bf1[28] = bf0[28];
    3528           0 :     bf1[29] = bf0[29];
    3529           0 :     bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit);
    3530           0 :     bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit);
    3531           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3532             : 
    3533             :     // stage 11
    3534           0 :     stage++;
    3535           0 :     bf0 = step;
    3536           0 :     bf1 = output;
    3537           0 :     bf1[0] = bf0[0];
    3538           0 :     bf1[1] = -bf0[16];
    3539           0 :     bf1[2] = bf0[24];
    3540           0 :     bf1[3] = -bf0[8];
    3541           0 :     bf1[4] = bf0[12];
    3542           0 :     bf1[5] = -bf0[28];
    3543           0 :     bf1[6] = bf0[20];
    3544           0 :     bf1[7] = -bf0[4];
    3545           0 :     bf1[8] = bf0[6];
    3546           0 :     bf1[9] = -bf0[22];
    3547           0 :     bf1[10] = bf0[30];
    3548           0 :     bf1[11] = -bf0[14];
    3549           0 :     bf1[12] = bf0[10];
    3550           0 :     bf1[13] = -bf0[26];
    3551           0 :     bf1[14] = bf0[18];
    3552           0 :     bf1[15] = -bf0[2];
    3553           0 :     bf1[16] = bf0[3];
    3554           0 :     bf1[17] = -bf0[19];
    3555           0 :     bf1[18] = bf0[27];
    3556           0 :     bf1[19] = -bf0[11];
    3557           0 :     bf1[20] = bf0[15];
    3558           0 :     bf1[21] = -bf0[31];
    3559           0 :     bf1[22] = bf0[23];
    3560           0 :     bf1[23] = -bf0[7];
    3561           0 :     bf1[24] = bf0[5];
    3562           0 :     bf1[25] = -bf0[21];
    3563           0 :     bf1[26] = bf0[29];
    3564           0 :     bf1[27] = -bf0[13];
    3565           0 :     bf1[28] = bf0[9];
    3566           0 :     bf1[29] = -bf0[25];
    3567           0 :     bf1[30] = bf0[17];
    3568           0 :     bf1[31] = -bf0[1];
    3569           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3570           0 : }
    3571             : 
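Every stage above ends with a range_check call. Its definition is not part of this excerpt; the sketch below shows what such a check is assumed to do (verify that every intermediate coefficient fits in stage_range[stage] signed bits). In release builds the real helper may compile to a no-op. The name range_check_sk is hypothetical.

    #include <assert.h>
    #include <stdint.h>

    static void range_check_sk(int32_t stage, const int32_t *buf, int32_t size,
                               int8_t bit) {
        const int64_t limit = (int64_t)1 << (bit - 1); /* require |x| < 2^(bit-1) */
        (void)stage;
        for (int32_t i = 0; i < size; ++i)
            assert((int64_t)buf[i] < limit && (int64_t)buf[i] >= -limit);
    }
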
    3572           0 : void eb_av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
    3573             :     const int8_t *stage_range) {
    3574             :     (void)cos_bit;
    3575           0 :     for (int32_t i = 0; i < 4; ++i)
    3576           0 :         output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
    3577           0 :     assert(stage_range[0] + NewSqrt2Bits <= 32);
    3578             :     range_check(0, input, output, 4, stage_range[0]);
    3579           0 : }
    3580             : 
    3581           0 : void eb_av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
    3582             :     const int8_t *stage_range) {
    3583             :     (void)cos_bit;
    3584           0 :     for (int32_t i = 0; i < 8; ++i) output[i] = input[i] * 2;
    3585             :     range_check(0, input, output, 8, stage_range[0]);
    3586           0 : }
    3587             : 
    3588           0 : void eb_av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
    3589             :     const int8_t *stage_range) {
    3590             :     (void)cos_bit;
    3591           0 :     for (int32_t i = 0; i < 16; ++i)
    3592           0 :         output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
    3593           0 :     assert(stage_range[0] + NewSqrt2Bits <= 32);
    3594             :     range_check(0, input, output, 16, stage_range[0]);
    3595           0 : }
    3596             : 
    3597           0 : void eb_av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
    3598             :     const int8_t *stage_range) {
    3599             :     (void)cos_bit;
    3600           0 :     for (int32_t i = 0; i < 32; ++i) output[i] = input[i] * 4;
    3601             :     range_check(0, input, output, 32, stage_range[0]);
    3602           0 : }
    3603             : 
    3604           0 : void av1_fidentity64_c(const int32_t *input, int32_t *output, int8_t cos_bit,
    3605             :     const int8_t *stage_range) {
    3606             :     (void)cos_bit;
    3607           0 :     for (int32_t i = 0; i < 64; ++i)
    3608           0 :         output[i] = round_shift((int64_t)input[i] * 4 * NewSqrt2, NewSqrt2Bits);
    3609           0 :     assert(stage_range[0] + NewSqrt2Bits <= 32);
    3610             :     range_check(0, input, output, 64, stage_range[0]);
    3611           0 : }
    3612             : 
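The identity (IDTX) kernels above only rescale: fidentity8 and fidentity32 scale by exact powers of two, while fidentity4, fidentity16 and fidentity64 need an extra sqrt(2) factor, implemented in fixed point through NewSqrt2 and NewSqrt2Bits. Assuming the usual values NewSqrt2 = 5793 and NewSqrt2Bits = 12 (5793/4096 ~= 1.4142), a worked example for eb_av1_fidentity16_c:

    /* output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits)
     * For input[i] = 1000 (assuming NewSqrt2 = 5793, NewSqrt2Bits = 12):
     *   (1000 * 2 * 5793 + 2048) >> 12 = 11588048 >> 12 = 2829,
     * i.e. ~= 1000 * 2*sqrt(2) = 2828.4. */
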
    3613     3833360 : static INLINE TxfmFunc fwd_txfm_type_to_func(TxfmType TxfmType) {
    3614     3833360 :     switch (TxfmType) {
    3615           0 :     case TXFM_TYPE_DCT4: return eb_av1_fdct4_new;
    3616           0 :     case TXFM_TYPE_DCT8: return eb_av1_fdct8_new;
    3617     1916700 :     case TXFM_TYPE_DCT16: return eb_av1_fdct16_new;
    3618     1916710 :     case TXFM_TYPE_DCT32: return eb_av1_fdct32_new;
    3619           0 :     case TXFM_TYPE_DCT64: return eb_av1_fdct64_new;
    3620           0 :     case TXFM_TYPE_ADST4: return eb_av1_fadst4_new;
    3621           0 :     case TXFM_TYPE_ADST8: return eb_av1_fadst8_new;
    3622           0 :     case TXFM_TYPE_ADST16: return eb_av1_fadst16_new;
    3623           0 :     case TXFM_TYPE_ADST32: return av1_fadst32_new;
    3624           0 :     case TXFM_TYPE_IDENTITY4: return eb_av1_fidentity4_c;
    3625           0 :     case TXFM_TYPE_IDENTITY8: return eb_av1_fidentity8_c;
    3626           0 :     case TXFM_TYPE_IDENTITY16: return eb_av1_fidentity16_c;
    3627           0 :     case TXFM_TYPE_IDENTITY32: return eb_av1_fidentity32_c;
    3628           0 :     case TXFM_TYPE_IDENTITY64: return av1_fidentity64_c;
    3629           0 :     default: assert(0); return NULL;
    3630             :     }
    3631             : }
    3632             : 
    3633   152882000 : void eb_av1_round_shift_array_c(int32_t *arr, int32_t size, int32_t bit) {
    3634             :     int32_t i;
    3635   152882000 :     if (bit == 0)
    3636    30633500 :         return;
    3637             :     else {
    3638   122248000 :         if (bit > 0) {
    3639  1037510000 :             for (i = 0; i < size; i++)
    3640   976355000 :                 arr[i] = round_shift(arr[i], bit);
    3641             :         }
    3642             :         else {
    3643  1040080000 :             for (i = 0; i < size; i++)
    3644   979090000 :                 arr[i] = arr[i] * (1 << (-bit));
    3645             :         }
    3646             :     }
    3647             : }
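eb_av1_round_shift_array_c treats the sign of bit asymmetrically: a positive bit is a rounded right shift, a negative bit is a plain left shift (multiplication by 2^-bit), which needs no rounding. A short self-contained example; the helper and names below are illustrative only:

    #include <stdio.h>
    #include <stdint.h>

    /* Same rounding rule as round_shift(): add half an LSB before shifting. */
    static int32_t round_shift_demo(int32_t value, int32_t bit) {
        return (value + (1 << (bit - 1))) >> bit;
    }

    int main(void) {
        printf("%d\n", round_shift_demo(13, 2)); /* 13/4 = 3.25 -> prints 3    */
        printf("%d\n", 13 * (1 << 2));           /* bit = -2 path -> prints 52 */
        return 0;
    }
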
    3648             : //fwd_txfm2d_c
    3649     1916680 : static INLINE void Av1TranformTwoDCore_c(
    3650             :     int16_t                     *input,
    3651             :     uint32_t                      input_stride,
    3652             :     int32_t                      *output,
    3653             :     const Txfm2DFlipCfg      *cfg,
    3654             :     int32_t                      *buf,
    3655             :     uint8_t                        bit_depth)
    3656             : {
    3657             :     int32_t c, r;
    3658             :     // Note: txfm_size_col is the transform width and txfm_size_row is the
    3659             :     // transform height. The column pass below works on txfm_size_col
    3660             :     // columns of length txfm_size_row, and the row pass works on
    3661             :     // txfm_size_row rows of length txfm_size_col, so rectangular
    3662             :     // transforms are handled correctly. For square transforms the two
    3663             :     // sizes are equal and the distinction does not matter.
    3664     1916680 :     const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
    3665     1916680 :     const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
    3666             :     // Take the shift from the larger dimension in the rectangular case.
    3667     1916680 :     const int8_t *shift = cfg->shift;
    3668     1916680 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    3669             :     int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
    3670             :     int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
    3671     1916680 :     assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
    3672     1916680 :     assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
    3673     1916680 :     eb_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);
    3674             : 
    3675     1916700 :     const int8_t cos_bit_col = cfg->cos_bit_col;
    3676     1916700 :     const int8_t cos_bit_row = cfg->cos_bit_row;
    3677     1916700 :     const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
    3678     1916700 :     const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
    3679             :     ASSERT(txfm_func_col != NULL);
    3680             :     ASSERT(txfm_func_row != NULL);
    3681             :     // use output buffer as temp buffer
    3682     1977540 :     int32_t *temp_in = output;
    3683     1977540 :     int32_t *temp_out = output + txfm_size_row;
    3684             : 
    3685             :     // Columns
    3686    63146700 :     for (c = 0; c < txfm_size_col; ++c) {
    3687    61230000 :         if (cfg->ud_flip == 0)
    3688  1039890000 :             for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * input_stride + c];
    3689             :         else {
    3690           0 :             for (r = 0; r < txfm_size_row; ++r)
    3691             :                 // flip upside down
    3692           0 :                 temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c];
    3693             :         }
    3694    61230000 :         eb_av1_round_shift_array_c(temp_in, txfm_size_row, -shift[0]); // NM eb_av1_round_shift_array_c
    3695    61218600 :         txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
    3696    61246600 :         eb_av1_round_shift_array_c(temp_out, txfm_size_row, -shift[1]); // NM eb_av1_round_shift_array_c
    3697    61169100 :         if (cfg->lr_flip == 0) {
    3698  1039700000 :             for (r = 0; r < txfm_size_row; ++r)
    3699   978529000 :                 buf[r * txfm_size_col + c] = temp_out[r];
    3700             :         }
    3701             :         else {
    3702           0 :             for (r = 0; r < txfm_size_row; ++r)
    3703             :                 // flip from left to right
    3704           0 :                 buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
    3705             :         }
    3706             :     }
    3707             : 
    3708             :     // Rows
    3709    32526000 :     for (r = 0; r < txfm_size_row; ++r) {
    3710    30615200 :         txfm_func_row(buf + r * txfm_size_col,
    3711    30615200 :             output + r * txfm_size_col,
    3712             :             cos_bit_row,
    3713             :             stage_range_row);
    3714    30633600 :         eb_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col, -shift[2]);
    3715             : 
    3716    30702000 :         if (abs(rect_type) == 1) {
    3717             :             // Multiply everything by Sqrt2 if the transform is rectangular and the
    3718             :             // size difference is a factor of 2.
    3719  1004890000 :             for (c = 0; c < txfm_size_col; ++c) {
    3720   974261000 :                 output[r * txfm_size_col + c] = round_shift(
    3721   974354000 :                     (int64_t)output[r * txfm_size_col + c] * NewSqrt2, NewSqrt2Bits);
    3722             :             }
    3723             :         }
    3724             :     }
    3725     1910800 : }
    3726             : 
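A condensed restatement of Av1TranformTwoDCore_c's data flow, including the extra sqrt(2) step that fires when |rect_type| == 1 (a 2:1 or 1:2 width:height ratio, where, under the usual AV1 convention, the per-dimension normalisations leave a residual factor that bit shifts alone cannot remove; NewSqrt2/2^NewSqrt2Bits ~= sqrt(2) is assumed, as above):

    /* for each column c:                                   -- column pass
     *     temp_in  <- column c of input (optionally flipped vertically)
     *     temp_in  <- temp_in scaled by 2^shift[0]
     *     temp_out <- txfm_func_col(temp_in)
     *     temp_out <- temp_out scaled by 2^shift[1]
     *     buf      <- temp_out stored as column c (optionally flipped left/right)
     * for each row r:                                      -- row pass
     *     output row r <- txfm_func_row(row r of buf)
     *     output row r <- output row r scaled by 2^shift[2]
     *     if |rect_type| == 1: output row r multiplied by sqrt(2) in fixed point
     */
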
    3727           0 : void av1_round_shift_array_pf_c(int32_t *arr_in, int32_t *arr_out, int32_t size, int32_t bit) {
    3728             :     int32_t i;
    3729           0 :     if (bit == 0) {
    3730           0 :         for (i = 0; i < size; i++)
    3731           0 :             arr_out[i] = arr_in[i];
    3732             :     }
    3733             :     else {
    3734           0 :         if (bit > 0) {
    3735           0 :             for (i = 0; i < size; i++)
    3736           0 :                 arr_out[i] = round_shift(arr_in[i], bit);
    3737             :         }
    3738             :         else {
    3739           0 :             for (i = 0; i < size; i++)
    3740           0 :                 arr_out[i] = arr_in[i] * (1 << (-bit));
    3741             :         }
    3742             :     }
    3743           0 : }
    3744           0 : void av1_fdct32_pf_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    3745             :     const int8_t *stage_range) {
    3746           0 :     const int32_t size = 32;
    3747             :     const int32_t *cospi;
    3748             : 
    3749           0 :     int32_t stage = 0;
    3750             :     int32_t *bf0, *bf1;
    3751             :     int32_t step[32];
    3752             : 
    3753             :     // stage 0;
    3754           0 :     range_check(stage, input, input, size, stage_range[stage]);
    3755             : 
    3756             :     // stage 1;
    3757           0 :     stage++;
    3758           0 :     bf1 = output;
    3759           0 :     bf1[0] = input[0] + input[31];
    3760           0 :     bf1[1] = input[1] + input[30];
    3761           0 :     bf1[2] = input[2] + input[29];
    3762           0 :     bf1[3] = input[3] + input[28];
    3763           0 :     bf1[4] = input[4] + input[27];
    3764           0 :     bf1[5] = input[5] + input[26];
    3765           0 :     bf1[6] = input[6] + input[25];
    3766           0 :     bf1[7] = input[7] + input[24];
    3767           0 :     bf1[8] = input[8] + input[23];
    3768           0 :     bf1[9] = input[9] + input[22];
    3769           0 :     bf1[10] = input[10] + input[21];
    3770           0 :     bf1[11] = input[11] + input[20];
    3771           0 :     bf1[12] = input[12] + input[19];
    3772           0 :     bf1[13] = input[13] + input[18];
    3773           0 :     bf1[14] = input[14] + input[17];
    3774           0 :     bf1[15] = input[15] + input[16];
    3775           0 :     bf1[16] = -input[16] + input[15];
    3776           0 :     bf1[17] = -input[17] + input[14];
    3777           0 :     bf1[18] = -input[18] + input[13];
    3778           0 :     bf1[19] = -input[19] + input[12];
    3779           0 :     bf1[20] = -input[20] + input[11];
    3780           0 :     bf1[21] = -input[21] + input[10];
    3781           0 :     bf1[22] = -input[22] + input[9];
    3782           0 :     bf1[23] = -input[23] + input[8];
    3783           0 :     bf1[24] = -input[24] + input[7];
    3784           0 :     bf1[25] = -input[25] + input[6];
    3785           0 :     bf1[26] = -input[26] + input[5];
    3786           0 :     bf1[27] = -input[27] + input[4];
    3787           0 :     bf1[28] = -input[28] + input[3];
    3788           0 :     bf1[29] = -input[29] + input[2];
    3789           0 :     bf1[30] = -input[30] + input[1];
    3790           0 :     bf1[31] = -input[31] + input[0];
    3791           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3792             : 
    3793             :     // stage 2
    3794           0 :     stage++;
    3795           0 :     cospi = cospi_arr(cos_bit);
    3796           0 :     bf0 = output;
    3797           0 :     bf1 = step;
    3798           0 :     bf1[0] = bf0[0] + bf0[15];
    3799           0 :     bf1[1] = bf0[1] + bf0[14];
    3800           0 :     bf1[2] = bf0[2] + bf0[13];
    3801           0 :     bf1[3] = bf0[3] + bf0[12];
    3802           0 :     bf1[4] = bf0[4] + bf0[11];
    3803           0 :     bf1[5] = bf0[5] + bf0[10];
    3804           0 :     bf1[6] = bf0[6] + bf0[9];
    3805           0 :     bf1[7] = bf0[7] + bf0[8];
    3806           0 :     bf1[8] = -bf0[8] + bf0[7];
    3807           0 :     bf1[9] = -bf0[9] + bf0[6];
    3808           0 :     bf1[10] = -bf0[10] + bf0[5];
    3809           0 :     bf1[11] = -bf0[11] + bf0[4];
    3810           0 :     bf1[12] = -bf0[12] + bf0[3];
    3811           0 :     bf1[13] = -bf0[13] + bf0[2];
    3812           0 :     bf1[14] = -bf0[14] + bf0[1];
    3813           0 :     bf1[15] = -bf0[15] + bf0[0];
    3814           0 :     bf1[16] = bf0[16];
    3815           0 :     bf1[17] = bf0[17];
    3816           0 :     bf1[18] = bf0[18];
    3817           0 :     bf1[19] = bf0[19];
    3818           0 :     bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    3819           0 :     bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    3820           0 :     bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    3821           0 :     bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    3822           0 :     bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
    3823           0 :     bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
    3824           0 :     bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
    3825           0 :     bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
    3826           0 :     bf1[28] = bf0[28];
    3827           0 :     bf1[29] = bf0[29];
    3828           0 :     bf1[30] = bf0[30];
    3829           0 :     bf1[31] = bf0[31];
    3830           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3831             : 
    3832             :     // stage 3
    3833           0 :     stage++;
    3834           0 :     cospi = cospi_arr(cos_bit);
    3835           0 :     bf0 = step;
    3836           0 :     bf1 = output;
    3837           0 :     bf1[0] = bf0[0] + bf0[7];
    3838           0 :     bf1[1] = bf0[1] + bf0[6];
    3839           0 :     bf1[2] = bf0[2] + bf0[5];
    3840           0 :     bf1[3] = bf0[3] + bf0[4];
    3841           0 :     bf1[4] = -bf0[4] + bf0[3];
    3842           0 :     bf1[5] = -bf0[5] + bf0[2];
    3843           0 :     bf1[6] = -bf0[6] + bf0[1];
    3844           0 :     bf1[7] = -bf0[7] + bf0[0];
    3845           0 :     bf1[8] = bf0[8];
    3846           0 :     bf1[9] = bf0[9];
    3847           0 :     bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    3848           0 :     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    3849           0 :     bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    3850           0 :     bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    3851           0 :     bf1[14] = bf0[14];
    3852           0 :     bf1[15] = bf0[15];
    3853           0 :     bf1[16] = bf0[16] + bf0[23];
    3854           0 :     bf1[17] = bf0[17] + bf0[22];
    3855           0 :     bf1[18] = bf0[18] + bf0[21];
    3856           0 :     bf1[19] = bf0[19] + bf0[20];
    3857           0 :     bf1[20] = -bf0[20] + bf0[19];
    3858           0 :     bf1[21] = -bf0[21] + bf0[18];
    3859           0 :     bf1[22] = -bf0[22] + bf0[17];
    3860           0 :     bf1[23] = -bf0[23] + bf0[16];
    3861           0 :     bf1[24] = -bf0[24] + bf0[31];
    3862           0 :     bf1[25] = -bf0[25] + bf0[30];
    3863           0 :     bf1[26] = -bf0[26] + bf0[29];
    3864           0 :     bf1[27] = -bf0[27] + bf0[28];
    3865           0 :     bf1[28] = bf0[28] + bf0[27];
    3866           0 :     bf1[29] = bf0[29] + bf0[26];
    3867           0 :     bf1[30] = bf0[30] + bf0[25];
    3868           0 :     bf1[31] = bf0[31] + bf0[24];
    3869           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3870             : 
    3871             :     // stage 4
    3872           0 :     stage++;
    3873           0 :     cospi = cospi_arr(cos_bit);
    3874           0 :     bf0 = output;
    3875           0 :     bf1 = step;
    3876           0 :     bf1[0] = bf0[0] + bf0[3];
    3877           0 :     bf1[1] = bf0[1] + bf0[2];
    3878           0 :     bf1[2] = -bf0[2] + bf0[1];
    3879           0 :     bf1[3] = -bf0[3] + bf0[0];
    3880           0 :     bf1[4] = bf0[4];
    3881           0 :     bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    3882           0 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    3883           0 :     bf1[7] = bf0[7];
    3884           0 :     bf1[8] = bf0[8] + bf0[11];
    3885           0 :     bf1[9] = bf0[9] + bf0[10];
    3886           0 :     bf1[10] = -bf0[10] + bf0[9];
    3887           0 :     bf1[11] = -bf0[11] + bf0[8];
    3888           0 :     bf1[12] = -bf0[12] + bf0[15];
    3889           0 :     bf1[13] = -bf0[13] + bf0[14];
    3890           0 :     bf1[14] = bf0[14] + bf0[13];
    3891           0 :     bf1[15] = bf0[15] + bf0[12];
    3892           0 :     bf1[16] = bf0[16];
    3893           0 :     bf1[17] = bf0[17];
    3894           0 :     bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    3895           0 :     bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    3896           0 :     bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    3897           0 :     bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    3898           0 :     bf1[22] = bf0[22];
    3899           0 :     bf1[23] = bf0[23];
    3900           0 :     bf1[24] = bf0[24];
    3901           0 :     bf1[25] = bf0[25];
    3902           0 :     bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
    3903           0 :     bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
    3904           0 :     bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
    3905           0 :     bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
    3906           0 :     bf1[30] = bf0[30];
    3907           0 :     bf1[31] = bf0[31];
    3908           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3909             : 
    3910             :     // stage 5
    3911           0 :     stage++;
    3912           0 :     cospi = cospi_arr(cos_bit);
    3913           0 :     bf0 = step;
    3914           0 :     bf1 = output;
    3915           0 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    3916             :     //bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    3917           0 :     bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    3918             :     //bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    3919           0 :     bf1[4] = bf0[4] + bf0[5];
    3920           0 :     bf1[5] = -bf0[5] + bf0[4];
    3921           0 :     bf1[6] = -bf0[6] + bf0[7];
    3922           0 :     bf1[7] = bf0[7] + bf0[6];
    3923           0 :     bf1[8] = bf0[8];
    3924           0 :     bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    3925           0 :     bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    3926           0 :     bf1[11] = bf0[11];
    3927           0 :     bf1[12] = bf0[12];
    3928           0 :     bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    3929           0 :     bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    3930           0 :     bf1[15] = bf0[15];
    3931           0 :     bf1[16] = bf0[16] + bf0[19];
    3932           0 :     bf1[17] = bf0[17] + bf0[18];
    3933           0 :     bf1[18] = -bf0[18] + bf0[17];
    3934           0 :     bf1[19] = -bf0[19] + bf0[16];
    3935           0 :     bf1[20] = -bf0[20] + bf0[23];
    3936           0 :     bf1[21] = -bf0[21] + bf0[22];
    3937           0 :     bf1[22] = bf0[22] + bf0[21];
    3938           0 :     bf1[23] = bf0[23] + bf0[20];
    3939           0 :     bf1[24] = bf0[24] + bf0[27];
    3940           0 :     bf1[25] = bf0[25] + bf0[26];
    3941           0 :     bf1[26] = -bf0[26] + bf0[25];
    3942           0 :     bf1[27] = -bf0[27] + bf0[24];
    3943           0 :     bf1[28] = -bf0[28] + bf0[31];
    3944           0 :     bf1[29] = -bf0[29] + bf0[30];
    3945           0 :     bf1[30] = bf0[30] + bf0[29];
    3946           0 :     bf1[31] = bf0[31] + bf0[28];
    3947           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3948             : 
    3949             :     // stage 6
    3950           0 :     stage++;
    3951           0 :     cospi = cospi_arr(cos_bit);
    3952           0 :     bf0 = output;
    3953           0 :     bf1 = step;
    3954           0 :     bf1[0] = bf0[0];
    3955             :     //bf1[1] = bf0[1];
    3956           0 :     bf1[2] = bf0[2];
    3957             :     //bf1[3] = bf0[3];
    3958           0 :     bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    3959             :     //bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    3960           0 :     bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    3961             :     //bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    3962           0 :     bf1[8] = bf0[8] + bf0[9];
    3963           0 :     bf1[9] = -bf0[9] + bf0[8];
    3964           0 :     bf1[10] = -bf0[10] + bf0[11];
    3965           0 :     bf1[11] = bf0[11] + bf0[10];
    3966           0 :     bf1[12] = bf0[12] + bf0[13];
    3967           0 :     bf1[13] = -bf0[13] + bf0[12];
    3968           0 :     bf1[14] = -bf0[14] + bf0[15];
    3969           0 :     bf1[15] = bf0[15] + bf0[14];
    3970           0 :     bf1[16] = bf0[16];
    3971           0 :     bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    3972           0 :     bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    3973           0 :     bf1[19] = bf0[19];
    3974           0 :     bf1[20] = bf0[20];
    3975           0 :     bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    3976           0 :     bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    3977           0 :     bf1[23] = bf0[23];
    3978           0 :     bf1[24] = bf0[24];
    3979           0 :     bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
    3980           0 :     bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
    3981           0 :     bf1[27] = bf0[27];
    3982           0 :     bf1[28] = bf0[28];
    3983           0 :     bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
    3984           0 :     bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
    3985           0 :     bf1[31] = bf0[31];
    3986           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    3987             : 
    3988             :     // stage 7
    3989           0 :     stage++;
    3990           0 :     cospi = cospi_arr(cos_bit);
    3991           0 :     bf0 = step;
    3992           0 :     bf1 = output;
    3993           0 :     bf1[0] = bf0[0];
    3994             :     //bf1[1] = bf0[1];
    3995           0 :     bf1[2] = bf0[2];
    3996             :     //bf1[3] = bf0[3];
    3997           0 :     bf1[4] = bf0[4];
    3998             :     //bf1[5] = bf0[5];
    3999           0 :     bf1[6] = bf0[6];
    4000             :     //bf1[7] = bf0[7];
    4001           0 :     bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    4002             :     //bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
    4003           0 :     bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    4004             :     //bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
    4005           0 :     bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    4006             :     //bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
    4007           0 :     bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
    4008             :     //bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
    4009           0 :     bf1[16] = bf0[16] + bf0[17];
    4010           0 :     bf1[17] = -bf0[17] + bf0[16];
    4011           0 :     bf1[18] = -bf0[18] + bf0[19];
    4012           0 :     bf1[19] = bf0[19] + bf0[18];
    4013           0 :     bf1[20] = bf0[20] + bf0[21];
    4014           0 :     bf1[21] = -bf0[21] + bf0[20];
    4015           0 :     bf1[22] = -bf0[22] + bf0[23];
    4016           0 :     bf1[23] = bf0[23] + bf0[22];
    4017           0 :     bf1[24] = bf0[24] + bf0[25];
    4018           0 :     bf1[25] = -bf0[25] + bf0[24];
    4019           0 :     bf1[26] = -bf0[26] + bf0[27];
    4020           0 :     bf1[27] = bf0[27] + bf0[26];
    4021           0 :     bf1[28] = bf0[28] + bf0[29];
    4022           0 :     bf1[29] = -bf0[29] + bf0[28];
    4023           0 :     bf1[30] = -bf0[30] + bf0[31];
    4024           0 :     bf1[31] = bf0[31] + bf0[30];
    4025           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    4026             : 
    4027             :     // stage 8
    4028           0 :     stage++;
    4029           0 :     cospi = cospi_arr(cos_bit);
    4030           0 :     bf0 = output;
    4031           0 :     bf1 = step;
    4032           0 :     bf1[0] = bf0[0];
    4033             :     //bf1[1] = bf0[1];
    4034           0 :     bf1[2] = bf0[2];
    4035             :     //bf1[3] = bf0[3];
    4036           0 :     bf1[4] = bf0[4];
    4037             :     //bf1[5] = bf0[5];
    4038           0 :     bf1[6] = bf0[6];
    4039             :     //bf1[7] = bf0[7];
    4040           0 :     bf1[8] = bf0[8];
    4041             :     //bf1[9] = bf0[9];
    4042           0 :     bf1[10] = bf0[10];
    4043             :     //bf1[11] = bf0[11];
    4044           0 :     bf1[12] = bf0[12];
    4045             :     //bf1[13] = bf0[13];
    4046           0 :     bf1[14] = bf0[14];
    4047             :     //bf1[15] = bf0[15];
    4048           0 :     bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
    4049             :     //bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
    4050           0 :     bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
    4051             :     //bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
    4052           0 :     bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
    4053             :     //bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
    4054           0 :     bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
    4055             :     //bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
    4056           0 :     bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
    4057             :     //bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
    4058           0 :     bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
    4059             :     //bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
    4060           0 :     bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
    4061             :     //bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
    4062           0 :     bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
    4063             :     //bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
    4064           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    4065             : 
    4066             :     // stage 9
    4067           0 :     stage++;
    4068           0 :     bf0 = step;
    4069           0 :     bf1 = output;
    4070           0 :     bf1[0] = bf0[0];
    4071           0 :     bf1[1] = bf0[16];
    4072           0 :     bf1[2] = bf0[8];
    4073           0 :     bf1[3] = bf0[24];
    4074           0 :     bf1[4] = bf0[4];
    4075           0 :     bf1[5] = bf0[20];
    4076           0 :     bf1[6] = bf0[12];
    4077           0 :     bf1[7] = bf0[28];
    4078           0 :     bf1[8] = bf0[2];
    4079           0 :     bf1[9] = bf0[18];
    4080           0 :     bf1[10] = bf0[10];
    4081           0 :     bf1[11] = bf0[26];
    4082           0 :     bf1[12] = bf0[6];
    4083           0 :     bf1[13] = bf0[22];
    4084           0 :     bf1[14] = bf0[14];
    4085           0 :     bf1[15] = bf0[30];
    4086             :     /*   bf1[16] = bf0[1];
    4087             :        bf1[17] = bf0[17];
    4088             :        bf1[18] = bf0[9];
    4089             :        bf1[19] = bf0[25];
    4090             :        bf1[20] = bf0[5];
    4091             :        bf1[21] = bf0[21];
    4092             :        bf1[22] = bf0[13];
    4093             :        bf1[23] = bf0[29];
    4094             :        bf1[24] = bf0[3];
    4095             :        bf1[25] = bf0[19];
    4096             :        bf1[26] = bf0[11];
    4097             :        bf1[27] = bf0[27];
    4098             :        bf1[28] = bf0[7];
    4099             :        bf1[29] = bf0[23];
    4100             :        bf1[30] = bf0[15];
    4101             :        bf1[31] = bf0[31];*/
    4102           0 :     range_check(stage, input, bf1, size, stage_range[stage]);
    4103           0 : }
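                      :
                      : /*
                      :  * For reference: every stage above leans on the half_btf() butterfly, which
                      :  * combines two inputs with two cosine weights and rounds the weighted sum back
                      :  * to cos_bit precision. A minimal sketch, assuming the usual AV1 definition of
                      :  * round_shift() (add half, then shift down on a 64-bit intermediate); the name
                      :  * half_btf_sketch is illustrative only and is not part of this file:
                      :  */
                      : static INLINE int32_t half_btf_sketch(int32_t w0, int32_t in0,
                      :                                        int32_t w1, int32_t in1, int bit) {
                      :     /* 64-bit accumulator avoids overflow of w0*in0 + w1*in1. */
                      :     const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
                      :     /* Round to nearest, then drop the cos_bit fractional bits. */
                      :     return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
                      : }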
    4104           0 : static INLINE TxfmFunc fwd_txfm_pf_type_to_func(TxfmType TxfmType) {
    4105           0 :     switch (TxfmType) {
    4106           0 :     case TXFM_TYPE_DCT4: return eb_av1_fdct4_new;
    4107           0 :     case TXFM_TYPE_DCT8: return eb_av1_fdct8_new;
    4108           0 :     case TXFM_TYPE_DCT16: return eb_av1_fdct16_new;
    4109           0 :     case TXFM_TYPE_DCT32: return av1_fdct32_pf_new;
    4110           0 :     case TXFM_TYPE_DCT64: return eb_av1_fdct64_new;
    4111           0 :     case TXFM_TYPE_ADST4: return eb_av1_fadst4_new;
    4112           0 :     case TXFM_TYPE_ADST8: return eb_av1_fadst8_new;
    4113           0 :     case TXFM_TYPE_ADST16: return eb_av1_fadst16_new;
    4114           0 :     case TXFM_TYPE_ADST32: return av1_fadst32_new;
    4115           0 :     case TXFM_TYPE_IDENTITY4: return eb_av1_fidentity4_c;
    4116           0 :     case TXFM_TYPE_IDENTITY8: return eb_av1_fidentity8_c;
    4117           0 :     case TXFM_TYPE_IDENTITY16: return eb_av1_fidentity16_c;
    4118           0 :     case TXFM_TYPE_IDENTITY32: return eb_av1_fidentity32_c;
    4119           0 :     case TXFM_TYPE_IDENTITY64: return av1_fidentity64_c;
    4120           0 :     default: assert(0); return NULL;
    4121             :     }
    4122             : }
    4123           0 : static INLINE void Av1TranformTwoDCore_pf_c(
    4124             :     int16_t                     *input,
    4125             :     uint32_t                      inputStride,
    4126             :     int32_t                      *output,
    4127             :     const Txfm2DFlipCfg      *cfg,
    4128             :     int32_t                      *buf,
    4129             :     uint8_t                        bit_depth)
    4130             : {
    4131             :     int32_t c, r;
    4132             :     // Note when assigning txfm_size_col, we use the txfm_size from the
    4133             :     // row configuration and vice versa. This is intentionally done to
    4134             :     // accurately perform rectangular transforms. When the transform is
    4135             :     // rectangular, the number of columns will be the same as the
    4136             :     // txfm_size stored in the row cfg struct. It will make no difference
    4137             :     // for square transforms.
    4138           0 :     const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
    4139           0 :     const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
    4140             :     // Take the shift from the larger dimension in the rectangular case.
    4141           0 :     const int8_t *shift = cfg->shift;
    4142           0 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    4143             :     int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
    4144             :     int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
    4145           0 :     assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
    4146           0 :     assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
    4147           0 :     eb_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);
    4148             : 
    4149           0 :     const int8_t cos_bit_col = cfg->cos_bit_col;
    4150           0 :     const int8_t cos_bit_row = cfg->cos_bit_row;
    4151           0 :     const TxfmFunc txfm_func_col = fwd_txfm_pf_type_to_func(cfg->txfm_type_col);
    4152           0 :     const TxfmFunc txfm_func_row = fwd_txfm_pf_type_to_func(cfg->txfm_type_row);
    4153             :     ASSERT(txfm_func_col != NULL);
    4154             :     ASSERT(txfm_func_row != NULL);
     4155             :     // Local temp buffers are used here; the commented-out variant below reused the output buffer.
    4156             :    /* int32_t *temp_in = output;
    4157             :     int32_t *temp_out = output + txfm_size_row;*/
    4158             :     int32_t temp_in[32];
    4159             :     int32_t temp_out[32];
    4160             : 
    4161             :     // Columns
    4162           0 :     for (c = 0; c < txfm_size_col; ++c) {
    4163           0 :         if (cfg->ud_flip == 0)
    4164           0 :             for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * inputStride + c];
    4165             :         else {
    4166           0 :             for (r = 0; r < txfm_size_row; ++r)
    4167             :                 // flip upside down
    4168           0 :                 temp_in[r] = input[(txfm_size_row - r - 1) * inputStride + c];
    4169             :         }
    4170           0 :         eb_av1_round_shift_array_c(temp_in, txfm_size_row, -shift[0]); // NM eb_av1_round_shift_array_c
    4171           0 :         txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
    4172           0 :         eb_av1_round_shift_array_c(temp_out, 16/*txfm_size_row*/, -shift[1]); // NM eb_av1_round_shift_array_c
    4173           0 :         if (cfg->lr_flip == 0) {
    4174           0 :             for (r = 0; r < txfm_size_row; ++r)
    4175           0 :                 buf[r * txfm_size_col + c] = temp_out[r];
    4176             :         }
    4177             :         else {
    4178           0 :             for (r = 0; r < txfm_size_row; ++r)
    4179             :                 // flip from left to right
    4180           0 :                 buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
    4181             :         }
    4182             :     }
    4183             : 
    4184             :     // Rows
    4185           0 :     for (r = 0; r < 16/*txfm_size_row*/; ++r) {
    4186           0 :         txfm_func_row(buf + r * txfm_size_col,
    4187             :             temp_out, //output + r * txfm_size_col,//
    4188             :             cos_bit_row,
    4189             :             stage_range_row);
    4190           0 :         av1_round_shift_array_pf_c(temp_out, output + r * txfm_size_col, 16/*txfm_size_col*/, -shift[2]);
    4191             : 
    4192           0 :         if (abs(rect_type) == 1) {
    4193             :             // Multiply everything by Sqrt2 if the transform is rectangular and the
    4194             :             // size difference is a factor of 2.
    4195           0 :             for (c = 0; c < txfm_size_col; ++c) {
    4196           0 :                 output[r * txfm_size_col + c] = round_shift(
    4197           0 :                     (int64_t)output[r * txfm_size_col + c] * NewSqrt2, NewSqrt2Bits);
    4198             :             }
    4199             :         }
    4200             :     }
    4201           0 : }
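                      :
                      : /*
                      :  * The column and row passes above rely on eb_av1_round_shift_array_c(), where
                      :  * a positive bit argument rounds values down by that many bits and a negative
                      :  * argument scales them up. A minimal sketch of that convention (illustrative
                      :  * name, not the library function):
                      :  */
                      : static INLINE void round_shift_array_sketch(int32_t *arr, int size, int bit) {
                      :     if (bit == 0) return;
                      :     for (int i = 0; i < size; ++i) {
                      :         if (bit > 0) /* round to nearest, then shift down */
                      :             arr[i] = (int32_t)(((int64_t)arr[i] + ((int64_t)1 << (bit - 1))) >> bit);
                      :         else         /* negative bit: scale up */
                      :             arr[i] = arr[i] * (1 << (-bit));
                      :     }
                      : }
                      : /*
                      :  * The abs(rect_type) == 1 branch at the end of the row pass rescales by
                      :  * NewSqrt2 / 2^NewSqrt2Bits, roughly 5793 / 4096 ~= sqrt(2) assuming the usual
                      :  * AV1 constants, so 1:2 rectangular transforms keep the same overall gain as
                      :  * square ones.
                      :  */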
    4202             : 
    4203     3630020 : static INLINE void set_flip_cfg(TxType tx_type, Txfm2DFlipCfg *cfg) {
    4204     3630020 :     get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip);
    4205     3630020 : }
    4206     3630020 : static INLINE void set_fwd_txfm_non_scale_range(Txfm2DFlipCfg *cfg) {
    4207     3630020 :     const int32_t txh_idx = get_txh_idx(cfg->tx_size);
    4208     3630030 :     av1_zero(cfg->stage_range_col);
    4209     3630030 :     av1_zero(cfg->stage_range_row);
    4210     3630030 :     assert(cfg->txfm_type_col < TXFM_TYPES);
    4211     3630030 :     if (cfg->txfm_type_col != TXFM_TYPE_INVALID) {
    4212     3630080 :         int32_t stage_num_col = cfg->stage_num_col;
    4213     3630080 :         const int8_t *range_mult2_col =
    4214     3630080 :             fwd_txfm_range_mult2_list[cfg->txfm_type_col];
    4215    35277400 :         for (int32_t i = 0; i < stage_num_col; ++i)
    4216    31647300 :             cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1;
    4217             :     }
    4218             : 
    4219     3630030 :     if (cfg->txfm_type_row != TXFM_TYPE_INVALID) {
    4220     3630130 :         int32_t stage_num_row = cfg->stage_num_row;
    4221     3630130 :         assert(cfg->txfm_type_row < TXFM_TYPES);
    4222     3630130 :         const int8_t *range_mult2_row =
    4223     3630130 :             fwd_txfm_range_mult2_list[cfg->txfm_type_row];
    4224    39110500 :         for (int32_t i = 0; i < stage_num_row; ++i)
    4225    35480300 :             cfg->stage_range_row[i] =
    4226    35480300 :             (max_fwd_range_mult2_col[txh_idx] + range_mult2_row[i] + 1) >> 1;
    4227             :     }
    4228     3630030 : }
    4229             : 
    4230     3630040 : void Av1TransformConfig(
    4231             :     TxType tx_type,
    4232             :     TxSize tx_size,
    4233             :     Txfm2DFlipCfg *cfg)
    4234             : {
    4235     3630040 :     assert(cfg != NULL);
    4236     3630040 :     cfg->tx_size = tx_size;
    4237     3630040 :     set_flip_cfg(tx_type, cfg);
    4238     3630040 :     const TxType1D tx_type_1d_col = vtx_tab[tx_type];
    4239     3630040 :     const TxType1D tx_type_1d_row = htx_tab[tx_type];
    4240     3630040 :     const int32_t txw_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
    4241     3630040 :     const int32_t txh_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0];
    4242     3630040 :     cfg->shift = fwd_txfm_shift_ls[tx_size];
    4243     3630040 :     cfg->cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
    4244     3630040 :     cfg->cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
    4245     3630040 :     cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
    4246     3630040 :     cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
    4247     3630040 :     cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
    4248     3630040 :     cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
    4249     3630040 :     set_fwd_txfm_non_scale_range(cfg);
    4250     3630110 : }
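                      :
                      : /*
                      :  * Worked example of the configuration above (hedged: index values assume the
                      :  * standard AV1 lookup tables). For tx_size = TX_16X32 with tx_type = DCT_DCT:
                      :  *   txw_idx = tx_size_wide_log2[TX_16X32] - tx_size_wide_log2[0] = 4 - 2 = 2
                      :  *   txh_idx = tx_size_high_log2[TX_16X32] - tx_size_high_log2[0] = 5 - 2 = 3
                      :  *   txfm_type_col = av1_txfm_type_ls[3][DCT_1D] = TXFM_TYPE_DCT32
                      :  *   txfm_type_row = av1_txfm_type_ls[2][DCT_1D] = TXFM_TYPE_DCT16
                      :  * so the column pass runs a 32-point DCT down each of the 16 columns and the
                      :  * row pass runs a 16-point DCT across each of the 32 rows, matching the
                      :  * col/row swap note in the 2-D core above.
                      :  */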
    4251             : 
    4252           0 : static uint64_t EnergyComputation(
    4253             :     int32_t  *coeff,
    4254             :     uint32_t   coeff_stride,
    4255             :     uint32_t   area_width,
    4256             :     uint32_t   area_height)
    4257             : {
    4258             :     uint32_t  columnIndex;
    4259           0 :     uint32_t  row_index = 0;
    4260           0 :     uint64_t  predictionDistortion = 0;
    4261             : 
    4262           0 :     while (row_index < area_height) {
    4263           0 :         columnIndex = 0;
    4264           0 :         while (columnIndex < area_width) {
    4265           0 :             predictionDistortion += (int64_t)SQR((int64_t)(coeff[columnIndex]));
    4266           0 :             ++columnIndex;
    4267             :         }
    4268             : 
    4269           0 :         coeff += coeff_stride;
    4270           0 :         ++row_index;
    4271             :     }
    4272             : 
    4273           0 :     return predictionDistortion;
    4274             : }
    4275             : 
    4276           0 : uint64_t HandleTransform64x64_c(int32_t *output) {
    4277             :     uint64_t three_quad_energy;
    4278             : 
     4279             :     // Top-right 32x32 area.
    4280           0 :     three_quad_energy = EnergyComputation(output + 32, 64, 32, 32);
     4281             :     // Bottom 64x32 area.
    4282           0 :     three_quad_energy += EnergyComputation(output + 32 * 64, 64, 64, 32);
    4283             : 
    4284             :     // Zero out top-right 32x32 area.
    4285           0 :     for (int32_t row = 0; row < 32; ++row)
    4286           0 :         memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
    4287             : 
    4288             :     // Zero out the bottom 64x32 area.
    4289           0 :     memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
    4290             : 
    4291             :     // Re-pack non-zero coeffs in the first 32x32 indices.
    4292           0 :     for (int32_t row = 1; row < 32; ++row)
    4293           0 :         memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
    4294             : 
    4295           0 :     return three_quad_energy;
    4296             : }
    4297             : 
    4298           0 : void Av1TransformTwoD_64x64_c(
    4299             :     int16_t         *input,
    4300             :     int32_t         *output,
    4301             :     uint32_t         input_stride,
    4302             :     TxType        transform_type,
    4303             :     uint8_t          bit_depth)
    4304             : {
    4305             :     int32_t intermediateTransformBuffer[64 * 64];
    4306             :     Txfm2DFlipCfg cfg;
    4307             :     //av1_get_fwd_txfm_cfg
    4308           0 :     Av1TransformConfig(
    4309             :         transform_type,
    4310             :         TX_64X64,
    4311             :         &cfg);
    4312             :     //fwd_txfm2d_c
    4313           0 :     Av1TranformTwoDCore_c(
    4314             :         input,
    4315             :         input_stride,
    4316             :         output,
    4317             :         &cfg,
    4318             :         intermediateTransformBuffer,
    4319             :         bit_depth);
    4320           0 : }
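                      :
                      : /*
                      :  * Usage sketch (not part of the build): for a 64x64 block the forward
                      :  * transform is followed by HandleTransform64x64_c(), which returns the energy
                      :  * of the three discarded high-frequency quadrants and repacks the surviving
                      :  * low-frequency coefficients into a dense 32x32 block. The helper name below
                      :  * is illustrative only.
                      :  */
                      : static void estimate_64x64_sketch(int16_t *residual, uint32_t stride,
                      :                                    int32_t *coeff, TxType tx_type,
                      :                                    uint8_t bit_depth,
                      :                                    uint64_t *three_quad_energy) {
                      :     Av1TransformTwoD_64x64_c(residual, coeff, stride, tx_type, bit_depth);
                      :     /* Energy of the zeroed 3/4 of the grid feeds the distortion estimate. */
                      :     *three_quad_energy = HandleTransform64x64_c(coeff);
                      :     /* coeff[0..32*32-1] now holds the densely packed low-frequency block. */
                      : }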
    4321             : 
    4322           0 : void Av1TransformTwoD_32x32_c(
    4323             :     int16_t         *input,
    4324             :     int32_t         *output,
    4325             :     uint32_t         input_stride,
    4326             :     TxType        transform_type,
    4327             :     uint8_t          bit_depth)
    4328             : {
    4329             :     int32_t intermediateTransformBuffer[32 * 32];
    4330             :     Txfm2DFlipCfg cfg;
    4331             : 
    4332           0 :     Av1TransformConfig(
    4333             :         transform_type,
    4334             :         TX_32X32,
    4335             :         &cfg);
    4336             : 
    4337           0 :     Av1TranformTwoDCore_c(
    4338             :         input,
    4339             :         input_stride,
    4340             :         output,
    4341             :         &cfg,
    4342             :         intermediateTransformBuffer,
    4343             :         bit_depth);
    4344           0 : }
    4345           0 : void av1_fwd_txfm2d_pf_32x32_c(
    4346             :     int16_t         *input,
    4347             :     int32_t         *output,
    4348             :     uint32_t         inputStride,
    4349             :     TxType           transform_type,
    4350             :     uint8_t          bit_depth)
    4351             : {
    4352             :     int32_t intermediateTransformBuffer[32 * 32];
    4353             :     Txfm2DFlipCfg cfg;
    4354             : 
    4355           0 :     memset(output, 0, 1024 * sizeof(int32_t));
    4356             : 
    4357           0 :     Av1TransformConfig(
    4358             :         transform_type,
    4359             :         TX_32X32,
    4360             :         &cfg);
    4361             : 
    4362           0 :     Av1TranformTwoDCore_pf_c(
    4363             :         input,
    4364             :         inputStride,
    4365             :         output,
    4366             :         &cfg,
    4367             :         intermediateTransformBuffer,
    4368             :         bit_depth);
    4369           0 : }
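                      :
                      : /*
                      :  * Note on the "_pf" (partial-frequency) 32x32 path above: the pf DCT32 stages
                      :  * skip the high-frequency outputs, the row pass only covers 16 rows, and the
                      :  * output block is zeroed up front, so only the top-left 16x16 low-frequency
                      :  * coefficients are ever written. A sketch of a caller relying on that layout
                      :  * (name and usage are illustrative only):
                      :  */
                      : static uint64_t pf_32x32_low_freq_energy_sketch(int16_t *residual,
                      :                                                  uint32_t stride,
                      :                                                  int32_t *coeff,
                      :                                                  TxType tx_type,
                      :                                                  uint8_t bit_depth) {
                      :     av1_fwd_txfm2d_pf_32x32_c(residual, coeff, stride, tx_type, bit_depth);
                      :     uint64_t energy = 0;
                      :     for (int r = 0; r < 16; ++r)      /* only the populated quadrant */
                      :         for (int c = 0; c < 16; ++c)
                      :             energy += (uint64_t)((int64_t)coeff[r * 32 + c] * coeff[r * 32 + c]);
                      :     return energy;
                      : }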
    4370           0 : void Av1TransformTwoD_16x16_c(
    4371             :     int16_t         *input,
    4372             :     int32_t         *output,
    4373             :     uint32_t         input_stride,
    4374             :     TxType        transform_type,
    4375             :     uint8_t          bit_depth)
    4376             : {
    4377             :     int32_t intermediateTransformBuffer[16 * 16];
    4378             :     Txfm2DFlipCfg cfg;
    4379             : 
    4380           0 :     Av1TransformConfig(
    4381             :         transform_type,
    4382             :         TX_16X16,
    4383             :         &cfg);
    4384             : 
    4385           0 :     Av1TranformTwoDCore_c(
    4386             :         input,
    4387             :         input_stride,
    4388             :         output,
    4389             :         &cfg,
    4390             :         intermediateTransformBuffer,
    4391             :         bit_depth);
    4392           0 : }
    4393             : 
    4394           0 : void Av1TransformTwoD_8x8_c(
    4395             :     int16_t         *input,
    4396             :     int32_t         *output,
    4397             :     uint32_t         input_stride,
    4398             :     TxType        transform_type,
    4399             :     uint8_t          bit_depth)
    4400             : {
    4401             :     int32_t intermediateTransformBuffer[8 * 8];
    4402             :     Txfm2DFlipCfg cfg;
    4403             : 
    4404           0 :     Av1TransformConfig(
    4405             :         transform_type,
    4406             :         TX_8X8,
    4407             :         &cfg);
    4408             : 
    4409           0 :     Av1TranformTwoDCore_c(
    4410             :         input,
    4411             :         input_stride,
    4412             :         output,
    4413             :         &cfg,
    4414             :         intermediateTransformBuffer,
    4415             :         bit_depth);
    4416           0 : }
    4417             : 
    4418           0 : void Av1TransformTwoD_4x4_c(
    4419             :     int16_t         *input,
    4420             :     int32_t         *output,
    4421             :     uint32_t         input_stride,
    4422             :     TxType        transform_type,
    4423             :     uint8_t          bit_depth)
    4424             : {
    4425             :     int32_t intermediateTransformBuffer[4 * 4];
    4426             :     Txfm2DFlipCfg cfg;
    4427             : 
    4428           0 :     Av1TransformConfig(
    4429             :         transform_type,
    4430             :         TX_4X4,
    4431             :         &cfg);
    4432             : 
    4433           0 :     Av1TranformTwoDCore_c(
    4434             :         input,
    4435             :         input_stride,
    4436             :         output,
    4437             :         &cfg,
    4438             :         intermediateTransformBuffer,
    4439             :         bit_depth);
    4440           0 : }
    4441             : 
    4442             : /*********************************************************************
     4443             : * Rectangular forward transforms and coefficient repacking
    4444             : *********************************************************************/
    4445           0 : void eb_av1_fwd_txfm2d_64x32_c(
    4446             :     int16_t         *input,
    4447             :     int32_t         *output,
    4448             :     uint32_t         input_stride,
    4449             :     TxType        transform_type,
    4450             :     uint8_t          bit_depth) {
    4451             :     int32_t intermediateTransformBuffer[64 * 32];
    4452             :     Txfm2DFlipCfg cfg;
    4453             :     /*av1_get_fwd_txfm_cfg*/
    4454           0 :     Av1TransformConfig
    4455             :     (transform_type, TX_64X32, &cfg);
    4456           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4457             :         input,
    4458             :         input_stride,
    4459             :         output,
    4460             :         &cfg,
    4461             :         intermediateTransformBuffer,
    4462             :         bit_depth);
    4463           0 : }
    4464             : 
    4465           0 : uint64_t HandleTransform64x32_c(int32_t *output) {
     4466             :     // Top-right 32x32 area.
    4467             :     const uint64_t three_quad_energy =
    4468           0 :         EnergyComputation(output + 32, 64, 32, 32);
    4469             : 
    4470             :     // Zero out right 32x32 area.
    4471           0 :     for (int32_t row = 0; row < 32; ++row)
    4472           0 :         memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
    4473             : 
    4474             :     // Re-pack non-zero coeffs in the first 32x32 indices.
    4475           0 :     for (int32_t row = 1; row < 32; ++row)
    4476           0 :         memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
    4477             : 
    4478           0 :     return three_quad_energy;
    4479             : }
    4480             : 
    4481           0 : void eb_av1_fwd_txfm2d_32x64_c(
    4482             :     int16_t         *input,
    4483             :     int32_t         *output,
    4484             :     uint32_t         input_stride,
    4485             :     TxType        transform_type,
    4486             :     uint8_t          bit_depth) {
    4487             :     int32_t intermediateTransformBuffer[32 * 64];
    4488             : 
    4489             :     Txfm2DFlipCfg cfg;
    4490             :     /*av1_get_fwd_txfm_cfg*/
    4491           0 :     Av1TransformConfig(transform_type, TX_32X64, &cfg);
    4492             :     /*fwd_txfm2d_c*/
    4493           0 :     Av1TranformTwoDCore_c(
    4494             :         input,
    4495             :         input_stride,
    4496             :         output,
    4497             :         &cfg,
    4498             :         intermediateTransformBuffer,
    4499             :         bit_depth);
    4500           0 : }
    4501             : 
    4502           0 : uint64_t HandleTransform32x64_c(int32_t *output) {
     4503             :     // Bottom 32x32 area.
    4504             :     const uint64_t three_quad_energy =
    4505           0 :         EnergyComputation(output + 32 * 32, 32, 32, 32);
    4506             : 
    4507             :     // Zero out the bottom 32x32 area.
    4508           0 :     memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
    4509             : 
    4510           0 :     return three_quad_energy;
    4511             : }
    4512             : 
    4513           0 : void eb_av1_fwd_txfm2d_64x16_c(
    4514             :     int16_t         *input,
    4515             :     int32_t         *output,
    4516             :     uint32_t         input_stride,
    4517             :     TxType        transform_type,
    4518             :     uint8_t          bit_depth) {
    4519             :     int32_t intermediateTransformBuffer[64 * 16];
    4520             :     Txfm2DFlipCfg cfg;
    4521             :     /*av1_get_fwd_txfm_cfg*/
    4522           0 :     Av1TransformConfig
    4523             :     (transform_type, TX_64X16, &cfg);
    4524           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4525             :         input,
    4526             :         input_stride,
    4527             :         output,
    4528             :         &cfg,
    4529             :         intermediateTransformBuffer,
    4530             :         bit_depth);
    4531           0 : }
    4532             : 
    4533           0 : uint64_t HandleTransform64x16_c(int32_t *output) {
     4534             :     // Top-right 32x16 area.
    4535             :     const uint64_t three_quad_energy =
    4536           0 :         EnergyComputation(output + 32, 64, 32, 16);
    4537             : 
    4538             :     // Zero out right 32x16 area.
    4539           0 :     for (int32_t row = 0; row < 16; ++row)
    4540           0 :         memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
    4541             : 
    4542             :     // Re-pack non-zero coeffs in the first 32x16 indices.
    4543           0 :     for (int32_t row = 1; row < 16; ++row)
    4544           0 :         memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
    4545             : 
    4546           0 :     return three_quad_energy;
    4547             : }
    4548             : 
    4549           0 : void eb_av1_fwd_txfm2d_16x64_c(
    4550             :     int16_t         *input,
    4551             :     int32_t         *output,
    4552             :     uint32_t         input_stride,
    4553             :     TxType        transform_type,
    4554             :     uint8_t          bit_depth) {
    4555             :     int32_t intermediateTransformBuffer[16 * 64];
    4556             : 
    4557             :     Txfm2DFlipCfg cfg;
    4558             :     /*av1_get_fwd_txfm_cfg*/
    4559           0 :     Av1TransformConfig(transform_type, TX_16X64, &cfg);
    4560             :     /*fwd_txfm2d_c*/
    4561           0 :     Av1TranformTwoDCore_c(
    4562             :         input,
    4563             :         input_stride,
    4564             :         output,
    4565             :         &cfg,
    4566             :         intermediateTransformBuffer,
    4567             :         bit_depth);
    4568           0 : }
    4569             : 
    4570           0 : uint64_t HandleTransform16x64_c(int32_t *output) {
     4571             :     // Bottom 16x32 area.
    4572             :     const uint64_t three_quad_energy =
    4573           0 :         EnergyComputation(output + 16 * 32, 16, 16, 32);
    4574             : 
    4575             :     // Zero out the bottom 16x32 area.
    4576           0 :     memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
    4577             : 
    4578           0 :     return three_quad_energy;
    4579             : }
    4580             : 
    4581     1916670 : void eb_av1_fwd_txfm2d_32x16_c(
    4582             :     int16_t         *input,
    4583             :     int32_t         *output,
    4584             :     uint32_t         input_stride,
    4585             :     TxType        transform_type,
    4586             :     uint8_t          bit_depth) {
    4587             :     int32_t intermediateTransformBuffer[32 * 16];
    4588             :     Txfm2DFlipCfg cfg;
    4589     1916670 :     /*av1_get_fwd_txfm_cfg*/Av1TransformConfig(transform_type, TX_32X16, &cfg);
    4590     1916690 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4591             :         input,
    4592             :         input_stride,
    4593             :         output,
    4594             :         &cfg,
    4595             :         intermediateTransformBuffer,
    4596             :         bit_depth);
    4597     1916540 : }
    4598             : 
    4599           0 : void eb_av1_fwd_txfm2d_16x32_c(
    4600             :     int16_t         *input,
    4601             :     int32_t         *output,
    4602             :     uint32_t         input_stride,
    4603             :     TxType        transform_type,
    4604             :     uint8_t          bit_depth) {
    4605             :     int32_t intermediateTransformBuffer[16 * 32];
    4606             :     Txfm2DFlipCfg cfg;
    4607           0 :     /*av1_get_fwd_txfm_cfg*/Av1TransformConfig(transform_type, TX_16X32, &cfg);
    4608           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4609             :         input,
    4610             :         input_stride,
    4611             :         output,
    4612             :         &cfg,
    4613             :         intermediateTransformBuffer,
    4614             :         bit_depth);
    4615           0 : }
    4616             : 
    4617           0 : void eb_av1_fwd_txfm2d_16x8_c(
    4618             :     int16_t         *input,
    4619             :     int32_t         *output,
    4620             :     uint32_t         input_stride,
    4621             :     TxType        transform_type,
    4622             :     uint8_t          bit_depth) {
    4623             :     int32_t intermediateTransformBuffer[16 * 8];
    4624             :     Txfm2DFlipCfg cfg;
    4625           0 :     /*av1_get_fwd_txfm_cfg*/Av1TransformConfig(transform_type, TX_16X8, &cfg);
    4626           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4627             :         input,
    4628             :         input_stride,
    4629             :         output,
    4630             :         &cfg,
    4631             :         intermediateTransformBuffer,
    4632             :         bit_depth);
    4633           0 : }
    4634             : 
    4635           0 : void eb_av1_fwd_txfm2d_8x16_c(
    4636             :     int16_t         *input,
    4637             :     int32_t         *output,
    4638             :     uint32_t         input_stride,
    4639             :     TxType        transform_type,
    4640             :     uint8_t          bit_depth) {
    4641             :     int32_t intermediateTransformBuffer[8 * 16];
    4642             :     Txfm2DFlipCfg cfg;
    4643           0 :     /*av1_get_fwd_txfm_cfg*/Av1TransformConfig(transform_type, TX_8X16, &cfg);
    4644           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4645             :         input,
    4646             :         input_stride,
    4647             :         output,
    4648             :         &cfg,
    4649             :         intermediateTransformBuffer,
    4650             :         bit_depth);
    4651           0 : }
    4652             : 
    4653           0 : void eb_av1_fwd_txfm2d_32x8_c(
    4654             :     int16_t         *input,
    4655             :     int32_t         *output,
    4656             :     uint32_t         input_stride,
    4657             :     TxType        transform_type,
    4658             :     uint8_t          bit_depth) {
    4659             :     int32_t intermediateTransformBuffer[32 * 8];
    4660             :     Txfm2DFlipCfg cfg;
    4661           0 :     /*av1_get_fwd_txfm_cfg*/Av1TransformConfig(transform_type, TX_32X8, &cfg);
    4662           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4663             :         input,
    4664             :         input_stride,
    4665             :         output,
    4666             :         &cfg,
    4667             :         intermediateTransformBuffer,
    4668             :         bit_depth);
    4669           0 : }
    4670             : 
    4671           0 : void eb_av1_fwd_txfm2d_8x32_c(
    4672             :     int16_t         *input,
    4673             :     int32_t         *output,
    4674             :     uint32_t         input_stride,
    4675             :     TxType        transform_type,
    4676             :     uint8_t          bit_depth) {
    4677             :     int32_t intermediateTransformBuffer[8 * 32];
    4678             :     Txfm2DFlipCfg cfg;
    4679           0 :     /*av1_get_fwd_txfm_cfg*/Av1TransformConfig(transform_type, TX_8X32, &cfg);
    4680           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4681             :         input,
    4682             :         input_stride,
    4683             :         output,
    4684             :         &cfg,
    4685             :         intermediateTransformBuffer,
    4686             :         bit_depth);
    4687           0 : }
    4688             : 
    4689           0 : void eb_av1_fwd_txfm2d_16x4_c(
    4690             :     int16_t         *input,
    4691             :     int32_t         *output,
    4692             :     uint32_t         input_stride,
    4693             :     TxType        transform_type,
    4694             :     uint8_t          bit_depth) {
    4695             :     int32_t intermediateTransformBuffer[16 * 4];
    4696             :     Txfm2DFlipCfg cfg;
    4697           0 :     /*av1_get_fwd_txfm_cfg*/Av1TransformConfig(transform_type, TX_16X4, &cfg);
    4698           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4699             :         input,
    4700             :         input_stride,
    4701             :         output,
    4702             :         &cfg,
    4703             :         intermediateTransformBuffer,
    4704             :         bit_depth);
    4705           0 : }
    4706             : 
    4707           0 : void eb_av1_fwd_txfm2d_4x16_c(
    4708             :     int16_t         *input,
    4709             :     int32_t         *output,
    4710             :     uint32_t         input_stride,
    4711             :     TxType        transform_type,
    4712             :     uint8_t          bit_depth) {
    4713             :     int32_t intermediateTransformBuffer[4 * 16];
    4714             :     Txfm2DFlipCfg cfg;
    4715           0 :     /*av1_get_fwd_txfm_cfg*/Av1TransformConfig(transform_type, TX_4X16, &cfg);
    4716           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4717             :         input,
    4718             :         input_stride,
    4719             :         output,
    4720             :         &cfg,
    4721             :         intermediateTransformBuffer,
    4722             :         bit_depth);
    4723           0 : }
    4724             : 
    4725           0 : void eb_av1_fwd_txfm2d_8x4_c(
    4726             :     int16_t         *input,
    4727             :     int32_t         *output,
    4728             :     uint32_t         input_stride,
    4729             :     TxType        transform_type,
    4730             :     uint8_t          bit_depth) {
    4731             :     int32_t intermediateTransformBuffer[8 * 4];
    4732             :     Txfm2DFlipCfg cfg;
    4733           0 :     /*av1_get_fwd_txfm_cfg*/Av1TransformConfig(transform_type, TX_8X4, &cfg);
    4734           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4735             :         input,
    4736             :         input_stride,
    4737             :         output,
    4738             :         &cfg,
    4739             :         intermediateTransformBuffer,
    4740             :         bit_depth);
    4741           0 : }
    4742             : 
    4743           0 : void eb_av1_fwd_txfm2d_4x8_c(
    4744             :     int16_t         *input,
    4745             :     int32_t         *output,
    4746             :     uint32_t         input_stride,
    4747             :     TxType        transform_type,
    4748             :     uint8_t          bit_depth) {
    4749             :     int32_t intermediateTransformBuffer[4 * 8];
    4750             :     Txfm2DFlipCfg cfg;
    4751           0 :     /*av1_get_fwd_txfm_cfg*/Av1TransformConfig(transform_type, TX_4X8, &cfg);
    4752           0 :     /*fwd_txfm2d_c*/Av1TranformTwoDCore_c(
    4753             :         input,
    4754             :         input_stride,
    4755             :         output,
    4756             :         &cfg,
    4757             :         intermediateTransformBuffer,
    4758             :         bit_depth);
    4759           0 : }
    4760             : 
    4761             : /*********************************************************************
    4762             : * Transform
    4763             : *   Note there is an implicit assumption that TU Size <= PU Size,
     4764             : *   which is different from the HEVC requirements.
    4765             : *********************************************************************/
    4766   105601000 : EbErrorType av1_estimate_transform(
    4767             :     int16_t              *residual_buffer,
    4768             :     uint32_t              residual_stride,
    4769             :     int32_t              *coeff_buffer,
    4770             :     uint32_t              coeff_stride,
    4771             :     TxSize                transform_size,
    4772             :     uint64_t             *three_quad_energy,
    4773             :     int16_t              *transform_inner_array_ptr,
    4774             :     uint32_t              bit_increment,
    4775             :     TxType                transform_type,
    4776             :     PlaneType            component_type,
    4777             :     EB_TRANS_COEFF_SHAPE  trans_coeff_shape)
    4778             : 
    4779             : {
    4780             :     (void)trans_coeff_shape;
    4781   105601000 :     EbErrorType return_error = EB_ErrorNone;
    4782             : 
    4783             :     (void)transform_inner_array_ptr;
    4784             :     (void)coeff_stride;
    4785             :     (void)component_type;
     4786   105601000 :     uint8_t      bit_depth = bit_increment ? 10 : 8; // NM: 8-bit unless bit_increment selects 10-bit
    4787             : 
    4788   105601000 :     switch (transform_size) {
    4789      513582 :     case TX_64X32:
    4790      513582 :         if (transform_type == DCT_DCT)
    4791      513582 :             eb_av1_fwd_txfm2d_64x32(
    4792             :                 residual_buffer,
    4793             :                 coeff_buffer,
    4794             :                 residual_stride,
    4795             :                 transform_type,
    4796             :                 bit_depth);
    4797             :         else
    4798           0 :             eb_av1_fwd_txfm2d_64x32_c(
    4799             :                 residual_buffer,
    4800             :                 coeff_buffer,
    4801             :                 residual_stride,
    4802             :                 transform_type,
    4803             :                 bit_depth);
    4804             : 
    4805      513589 :         *three_quad_energy = HandleTransform64x32(coeff_buffer);
    4806             : 
    4807      513583 :         break;
    4808             : 
    4809      557016 :     case TX_32X64:
    4810      557016 :         if (transform_type == DCT_DCT)
    4811      557019 :             eb_av1_fwd_txfm2d_32x64(
    4812             :                 residual_buffer,
    4813             :                 coeff_buffer,
    4814             :                 residual_stride,
    4815             :                 transform_type,
    4816             :                 bit_depth);
    4817             :         else
    4818           0 :             eb_av1_fwd_txfm2d_32x64_c(
    4819             :                 residual_buffer,
    4820             :                 coeff_buffer,
    4821             :                 residual_stride,
    4822             :                 transform_type,
    4823             :                 bit_depth);
    4824             : 
    4825      557014 :         *three_quad_energy = HandleTransform32x64(coeff_buffer);
    4826             : 
    4827      557012 :         break;
    4828             : 
    4829      790603 :     case TX_64X16:
    4830      790603 :         if (transform_type == DCT_DCT)
    4831      790603 :             eb_av1_fwd_txfm2d_64x16(
    4832             :                 residual_buffer,
    4833             :                 coeff_buffer,
    4834             :                 residual_stride,
    4835             :                 transform_type,
    4836             :                 bit_depth);
    4837             :         else
    4838           0 :             eb_av1_fwd_txfm2d_64x16_c(
    4839             :                 residual_buffer,
    4840             :                 coeff_buffer,
    4841             :                 residual_stride,
    4842             :                 transform_type,
    4843             :                 bit_depth);
    4844             : 
    4845      790609 :         *three_quad_energy = HandleTransform64x16(coeff_buffer);
    4846             : 
    4847      790601 :         break;
    4848             : 
    4849      855372 :     case TX_16X64:
    4850      855372 :         if (transform_type == DCT_DCT)
    4851      855372 :             eb_av1_fwd_txfm2d_16x64(
    4852             :                 residual_buffer,
    4853             :                 coeff_buffer,
    4854             :                 residual_stride,
    4855             :                 transform_type,
    4856             :                 bit_depth);
    4857             :         else
    4858           0 :             eb_av1_fwd_txfm2d_16x64_c(
    4859             :                 residual_buffer,
    4860             :                 coeff_buffer,
    4861             :                 residual_stride,
    4862             :                 transform_type,
    4863             :                 bit_depth);
    4864             : 
    4865      855372 :         *three_quad_energy = HandleTransform16x64(coeff_buffer);
    4866             : 
    4867      855376 :         break;
    4868             : 
    4869     2003820 :     case TX_32X16:
    4870             :         // TTK
    4871     2003820 :         if (transform_type == IDTX)
    4872       87137 :             eb_av1_fwd_txfm2d_32x16(
    4873             :                 residual_buffer,
    4874             :                 coeff_buffer,
    4875             :                 residual_stride,
    4876             :                 transform_type,
    4877             :                 bit_depth);
    4878             :         else
    4879     1916680 :             eb_av1_fwd_txfm2d_32x16_c(
    4880             :                 residual_buffer,
    4881             :                 coeff_buffer,
    4882             :                 residual_stride,
    4883             :                 transform_type,
    4884             :                 bit_depth);
    4885     2003680 :         break;
    4886             : 
    4887     2006860 :     case TX_16X32:
    4888     2006860 :         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
    4889     2006860 :             eb_av1_fwd_txfm2d_16x32(
    4890             :                 residual_buffer,
    4891             :                 coeff_buffer,
    4892             :                 residual_stride,
    4893             :                 transform_type,
    4894             :                 bit_depth);
    4895             :         else
    4896           0 :             eb_av1_fwd_txfm2d_16x32_c(
    4897             :                 residual_buffer,
    4898             :                 coeff_buffer,
    4899             :                 residual_stride,
    4900             :                 transform_type,
    4901             :                 bit_depth);
    4902     2006880 :         break;
    4903             : 
    4904     7281400 :     case TX_16X8:
    4905     7281400 :         eb_av1_fwd_txfm2d_16x8(
    4906             :             residual_buffer,
    4907             :             coeff_buffer,
    4908             :             residual_stride,
    4909             :             transform_type,
    4910             :             bit_depth);
    4911     7282350 :         break;
    4912             : 
    4913     8828210 :     case TX_8X16:
    4914     8828210 :         eb_av1_fwd_txfm2d_8x16(
    4915             :             residual_buffer,
    4916             :             coeff_buffer,
    4917             :             residual_stride,
    4918             :             transform_type,
    4919             :             bit_depth);
    4920     8829530 :         break;
    4921             : 
    4922     2533920 :     case TX_32X8:
    4923     2533920 :         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
    4924     2533920 :             eb_av1_fwd_txfm2d_32x8(
    4925             :                 residual_buffer,
    4926             :                 coeff_buffer,
    4927             :                 residual_stride,
    4928             :                 transform_type,
    4929             :                 bit_depth);
    4930             :         else
    4931           0 :             eb_av1_fwd_txfm2d_32x8_c(
    4932             :                 residual_buffer,
    4933             :                 coeff_buffer,
    4934             :                 residual_stride,
    4935             :                 transform_type,
    4936             :                 bit_depth);
    4937     2534090 :         break;
    4938             : 
    4939     2537710 :     case TX_8X32:
    4940     2537710 :         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
    4941     2537710 :             eb_av1_fwd_txfm2d_8x32(
    4942             :                 residual_buffer,
    4943             :                 coeff_buffer,
    4944             :                 residual_stride,
    4945             :                 transform_type,
    4946             :                 bit_depth);
    4947             :         else
    4948           0 :             eb_av1_fwd_txfm2d_8x32_c(
    4949             :                 residual_buffer,
    4950             :                 coeff_buffer,
    4951             :                 residual_stride,
    4952             :                 transform_type,
    4953             :                 bit_depth);
    4954     2537860 :         break;
    4955     5777420 :     case TX_16X4:
    4956     5777420 :         eb_av1_fwd_txfm2d_16x4(
    4957             :             residual_buffer,
    4958             :             coeff_buffer,
    4959             :             residual_stride,
    4960             :             transform_type,
    4961             :             bit_depth);
    4962     5777920 :         break;
    4963     5564350 :     case TX_4X16:
    4964     5564350 :         eb_av1_fwd_txfm2d_4x16(
    4965             :             residual_buffer,
    4966             :             coeff_buffer,
    4967             :             residual_stride,
    4968             :             transform_type,
    4969             :             bit_depth);
    4970     5564820 :         break;
    4971     7357360 :     case TX_8X4:
    4972             : 
    4973     7357360 :         eb_av1_fwd_txfm2d_8x4(
    4974             :             residual_buffer,
    4975             :             coeff_buffer,
    4976             :             residual_stride,
    4977             :             transform_type,
    4978             :             bit_depth);
    4979             : 
    4980     7358330 :         break;
    4981     7683930 :     case TX_4X8:
    4982             : 
    4983     7683930 :         eb_av1_fwd_txfm2d_4x8(
    4984             :             residual_buffer,
    4985             :             coeff_buffer,
    4986             :             residual_stride,
    4987             :             transform_type,
    4988             :             bit_depth);
    4989             : 
    4990     7684920 :         break;
    4991             : 
    4992      314333 :     case TX_64X64:
    4993             : 
    4994      314333 :         eb_av1_fwd_txfm2d_64x64(
    4995             :             residual_buffer,
    4996             :             coeff_buffer,
    4997             :             residual_stride,
    4998             :             transform_type,
    4999             :             bit_depth);
    5000             : 
    5001      314334 :         *three_quad_energy = HandleTransform64x64(coeff_buffer);
    5002             : 
    5003       70574 :         break;
    5004             : 
    5005     1713680 :     case TX_32X32:
    5006     1713680 :         if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST || transform_type == H_ADST || transform_type == V_FLIPADST || transform_type == H_FLIPADST)
    5007             :             // Note: these one-dimensional transform types are believed never to be selected for 32x32 blocks, so this path should be unreachable
    5008           0 :             Av1TransformTwoD_32x32_c(
    5009             :                 residual_buffer,
    5010             :                 coeff_buffer,
    5011             :                 residual_stride,
    5012             :                 transform_type,
    5013             :                 bit_depth);
    5014             : 
    5015             :         else {
    5016     1713680 :             eb_av1_fwd_txfm2d_32x32(
    5017             :                 residual_buffer,
    5018             :                 coeff_buffer,
    5019             :                 residual_stride,
    5020             :                 transform_type,
    5021             :                 bit_depth);
    5022             :         }
    5023             : 
    5024     1713690 :         break;
    5025             : 
    5026     7143980 :     case TX_16X16:
    5027             : 
    5028     7143980 :         eb_av1_fwd_txfm2d_16x16(
    5029             :             residual_buffer,
    5030             :             coeff_buffer,
    5031             :             residual_stride,
    5032             :             transform_type,
    5033             :             bit_depth);
    5034             : 
    5035     7144360 :         break;
    5036    19667700 :     case TX_8X8:
    5037             : 
    5038    19667700 :         eb_av1_fwd_txfm2d_8x8(
    5039             :             residual_buffer,
    5040             :             coeff_buffer,
    5041             :             residual_stride,
    5042             :             transform_type,
    5043             :             bit_depth);
    5044             : 
    5045    19672900 :         break;
    5046    22761000 :     case TX_4X4:
    5047             : 
    5048    22761000 :         eb_av1_fwd_txfm2d_4x4(
    5049             :             residual_buffer,
    5050             :             coeff_buffer,
    5051             :             residual_stride,
    5052             :             transform_type,
    5053             :             bit_depth);
    5054             : 
    5055    22767800 :         break;
    5056           0 :     default: assert(0); break;
    5057             :     }
    5058             : 
    5059   105666000 :     return return_error;
    5060             : }
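/* Illustrative sketch, not part of EbTransforms.c: for transform sizes with a
 * 64-point dimension, AV1 only codes the top-left 32x32 block of coefficients.
 * The HandleTransform64x64/32x64/... calls above are assumed to zero the
 * discarded region and return its energy through *three_quad_energy for later
 * distortion estimation. A minimal stand-alone version of that idea for the
 * 64x64 case (names and details here are assumptions, not this file's API): */
static uint64_t example_handle_transform64x64(int32_t *coeff /* 64x64, row-major, stride 64 */) {
    uint64_t three_quad_energy = 0;
    for (int32_t row = 0; row < 64; ++row)
        for (int32_t col = 0; col < 64; ++col) {
            if (row < 32 && col < 32)
                continue;                                /* kept top-left quadrant */
            const int64_t c = coeff[row * 64 + col];
            three_quad_energy += (uint64_t)(c * c);      /* energy of the discarded coefficients */
            coeff[row * 64 + col] = 0;                   /* only the 32x32 corner is coded */
        }
    return three_quad_energy;
}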
    5061             : 
    5062           0 : void Av1InverseTransformConfig(
    5063             :     TxType tx_type,
    5064             :     TxSize tx_size,
    5065             :     Txfm2DFlipCfg *cfg)
    5066             : {
    5067           0 :     assert(cfg != NULL);
    5068           0 :     cfg->tx_size = tx_size;
    5069           0 :     set_flip_cfg(tx_type, cfg);
    5070           0 :     av1_zero(cfg->stage_range_col);
    5071           0 :     av1_zero(cfg->stage_range_row);
    5072           0 :     set_flip_cfg(tx_type, cfg);
    5073           0 :     const TxType1D tx_type_1d_col = vtx_tab[tx_type];
    5074           0 :     const TxType1D tx_type_1d_row = htx_tab[tx_type];
    5075           0 :     cfg->shift = eb_inv_txfm_shift_ls[tx_size];
    5076           0 :     const int32_t txw_idx = get_txw_idx(tx_size);
    5077           0 :     const int32_t txh_idx = get_txh_idx(tx_size);
    5078           0 :     cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    5079           0 :     cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    5080           0 :     cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
    5081           0 :     if (cfg->txfm_type_col == TXFM_TYPE_ADST4)
    5082           0 :         memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range));
    5083           0 :     cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
    5084           0 :     if (cfg->txfm_type_row == TXFM_TYPE_ADST4)
    5085           0 :         memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range));
    5086           0 :     cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
    5087           0 :     cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
    5088           0 : }
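/* Minimal usage sketch (not part of this file) for the config routine above,
 * assuming the caller provides a stack Txfm2DFlipCfg and stage-range arrays: */
static void example_build_inverse_cfg(void) {
    Txfm2DFlipCfg cfg;
    int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
    int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
    Av1InverseTransformConfig(DCT_DCT, TX_8X8, &cfg);
    eb_av1_gen_inv_stage_range(stage_range_col, stage_range_row, &cfg, TX_8X8, 8 /* bd */);
    /* cfg.cos_bit_row / cfg.cos_bit_col and the stage ranges can then be handed
     * to the 1-D kernels below (eb_av1_idct8_new, ...). */
}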
    5089             : 
    5090           0 : void eb_av1_gen_inv_stage_range(
    5091             :     int8_t *stage_range_col,
    5092             :     int8_t *stage_range_row,
    5093             :     const Txfm2DFlipCfg *cfg,
    5094             :     TxSize tx_size,
    5095             :     int32_t bd)
    5096             : {
    5097           0 :     const int32_t fwd_shift = inv_start_range[tx_size];
    5098           0 :     const int8_t *shift = cfg->shift;
    5099             :     int8_t opt_range_row, opt_range_col;
    5100           0 :     if (bd == 8) {
    5101           0 :         opt_range_row = 16;
    5102           0 :         opt_range_col = 16;
    5103             :     }
    5104           0 :     else if (bd == 10) {
    5105           0 :         opt_range_row = 18;
    5106           0 :         opt_range_col = 16;
    5107             :     }
    5108             :     else {
    5109           0 :         assert(bd == 12);
    5110           0 :         opt_range_row = 20;
    5111           0 :         opt_range_col = 18;
    5112             :     }
    5113             :     // the extra i < MAX_TXFM_STAGE_NUM bound silences the compiler's array-bounds warning
    5114           0 :     for (int32_t i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
    5115           0 :         int32_t real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1;
    5116             :         (void)real_range_row;
    5117           0 :         if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) {
    5118             :             // the adst4 may use 1 extra bit on top of opt_range_row at stage 1
    5119             :             // so opt_range_row >= real_range_row will not hold
    5120           0 :             stage_range_row[i] = opt_range_row;
    5121             :         }
    5122             :         else {
    5123           0 :             assert(opt_range_row >= real_range_row);
    5124           0 :             stage_range_row[i] = opt_range_row;
    5125             :         }
    5126             :     }
    5127             :     // the extra i < MAX_TXFM_STAGE_NUM bound silences the compiler's array-bounds warning
    5128           0 :     for (int32_t i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
    5129           0 :         int32_t real_range_col =
    5130           0 :             cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1;
    5131             :         (void)real_range_col;
    5132           0 :         if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) {
    5133             :             // the adst4 may use 1 extra bit on top of opt_range_col at stage 1
    5134             :             // so opt_range_col >= real_range_col will not hold
    5135           0 :             stage_range_col[i] = opt_range_col;
    5136             :         }
    5137             :         else {
    5138           0 :             assert(opt_range_col >= real_range_col);
    5139           0 :             stage_range_col[i] = opt_range_col;
    5140             :         }
    5141             :     }
    5142           0 : }
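/* Worked example for the checks above (illustrative): with bd == 8 the row
 * pass is allotted opt_range_row = 16 bits, so the assert holds as long as
 *     stage_range_row[i] + fwd_shift + 8 + 1 <= 16,
 * i.e. stage_range_row[i] + fwd_shift must be at most 7; for bd == 10 and
 * bd == 12 the row budget grows to 18 and 20 bits respectively. */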
    5143             : 
    5144           0 : static INLINE int32_t clamp_value(int32_t value, int8_t bit) {
    5145           0 :     if (bit <= 0) return value;  // Do nothing for invalid clamp bit.
    5146           0 :     const int64_t max_value = (1LL << (bit - 1)) - 1;
    5147           0 :     const int64_t min_value = -(1LL << (bit - 1));
    5148           0 :     return (int32_t)clamp64(value, min_value, max_value);
    5149             : }
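/* Illustrative note, not part of this file: clamp_value() saturates to a
 * signed bit-width. For example, with bit = 16 the legal range is
 * [-32768, 32767], so:
 *   clamp_value( 70000, 16) ==  32767
 *   clamp_value(-70000, 16) == -32768
 *   clamp_value(  1234, 16) ==   1234   (already in range)
 *   clamp_value(  1234,  0) ==   1234   (bit <= 0: no clamping) */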
    5150             : 
    5151           0 : void eb_av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    5152             :     const int8_t *stage_range) {
    5153           0 :     assert(output != input);
    5154           0 :     const int32_t *cospi = cospi_arr(cos_bit);
    5155             : 
    5156           0 :     int32_t stage = 0;
    5157             :     int32_t *bf0, *bf1;
    5158             :     int32_t step[4];
    5159             : 
    5160             :     // stage 0;
    5161             : 
    5162             :     // stage 1;
    5163           0 :     stage++;
    5164           0 :     bf1 = output;
    5165           0 :     bf1[0] = input[0];
    5166           0 :     bf1[1] = input[2];
    5167           0 :     bf1[2] = input[1];
    5168           0 :     bf1[3] = input[3];
    5169             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5170             : 
    5171             :     // stage 2
    5172           0 :     stage++;
    5173           0 :     bf0 = output;
    5174           0 :     bf1 = step;
    5175           0 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    5176           0 :     bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
    5177           0 :     bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
    5178           0 :     bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
    5179             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5180             : 
    5181             :     // stage 3
    5182           0 :     stage++;
    5183           0 :     bf0 = step;
    5184           0 :     bf1 = output;
    5185           0 :     bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
    5186           0 :     bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
    5187           0 :     bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
    5188           0 :     bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
    5189           0 : }
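/* Illustrative sketch, not part of this file: the half-butterfly used
 * throughout the stages above is, in essence, a weighted sum of two inputs
 * with rounding, scaled back down by cos_bit:
 *     half_btf(w0, in0, w1, in1, bit) ~= round((w0*in0 + w1*in1) / 2^bit)
 * A minimal stand-alone version under that assumption (names are illustrative): */
static int32_t example_half_btf(int32_t w0, int32_t in0,
                                int32_t w1, int32_t in1, int8_t bit) {
    const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
    return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);  /* round-to-nearest shift */
}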
    5190           0 : void eb_av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    5191             :     const int8_t *stage_range) {
    5192           0 :     assert(output != input);
    5193           0 :     const int32_t *cospi = cospi_arr(cos_bit);
    5194             : 
    5195           0 :     int32_t stage = 0;
    5196             :     int32_t *bf0, *bf1;
    5197             :     int32_t step[8];
    5198             : 
    5199             :     // stage 0;
    5200             : 
    5201             :     // stage 1;
    5202           0 :     stage++;
    5203           0 :     bf1 = output;
    5204           0 :     bf1[0] = input[0];
    5205           0 :     bf1[1] = input[4];
    5206           0 :     bf1[2] = input[2];
    5207           0 :     bf1[3] = input[6];
    5208           0 :     bf1[4] = input[1];
    5209           0 :     bf1[5] = input[5];
    5210           0 :     bf1[6] = input[3];
    5211           0 :     bf1[7] = input[7];
    5212             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5213             : 
    5214             :     // stage 2
    5215           0 :     stage++;
    5216           0 :     bf0 = output;
    5217           0 :     bf1 = step;
    5218           0 :     bf1[0] = bf0[0];
    5219           0 :     bf1[1] = bf0[1];
    5220           0 :     bf1[2] = bf0[2];
    5221           0 :     bf1[3] = bf0[3];
    5222           0 :     bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
    5223           0 :     bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
    5224           0 :     bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
    5225           0 :     bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
    5226             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5227             : 
    5228             :     // stage 3
    5229           0 :     stage++;
    5230           0 :     bf0 = step;
    5231           0 :     bf1 = output;
    5232           0 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    5233           0 :     bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
    5234           0 :     bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
    5235           0 :     bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
    5236           0 :     bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
    5237           0 :     bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
    5238           0 :     bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
    5239           0 :     bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
    5240             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5241             : 
    5242             :     // stage 4
    5243           0 :     stage++;
    5244           0 :     bf0 = output;
    5245           0 :     bf1 = step;
    5246           0 :     bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
    5247           0 :     bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
    5248           0 :     bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
    5249           0 :     bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
    5250           0 :     bf1[4] = bf0[4];
    5251           0 :     bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    5252           0 :     bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    5253           0 :     bf1[7] = bf0[7];
    5254             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5255             : 
    5256             :     // stage 5
    5257           0 :     stage++;
    5258           0 :     bf0 = step;
    5259           0 :     bf1 = output;
    5260           0 :     bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
    5261           0 :     bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
    5262           0 :     bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
    5263           0 :     bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
    5264           0 :     bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
    5265           0 :     bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
    5266           0 :     bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
    5267           0 :     bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
    5268           0 : }
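/* Note on the structure above (applies to all eb_av1_idct*_new kernels):
 * bf0/bf1 ping-pong between the caller's `output` array and the local `step`
 * array, so each stage reads the previous stage's results from one buffer and
 * writes its own results into the other; the number of stages is odd, so the
 * final stage always lands back in `output`. */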
    5269           0 : void eb_av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    5270             :     const int8_t *stage_range) {
    5271           0 :     assert(output != input);
    5272           0 :     const int32_t *cospi = cospi_arr(cos_bit);
    5273             : 
    5274           0 :     int32_t stage = 0;
    5275             :     int32_t *bf0, *bf1;
    5276             :     int32_t step[16];
    5277             : 
    5278             :     // stage 0;
    5279             : 
    5280             :     // stage 1;
    5281           0 :     stage++;
    5282           0 :     bf1 = output;
    5283           0 :     bf1[0] = input[0];
    5284           0 :     bf1[1] = input[8];
    5285           0 :     bf1[2] = input[4];
    5286           0 :     bf1[3] = input[12];
    5287           0 :     bf1[4] = input[2];
    5288           0 :     bf1[5] = input[10];
    5289           0 :     bf1[6] = input[6];
    5290           0 :     bf1[7] = input[14];
    5291           0 :     bf1[8] = input[1];
    5292           0 :     bf1[9] = input[9];
    5293           0 :     bf1[10] = input[5];
    5294           0 :     bf1[11] = input[13];
    5295           0 :     bf1[12] = input[3];
    5296           0 :     bf1[13] = input[11];
    5297           0 :     bf1[14] = input[7];
    5298           0 :     bf1[15] = input[15];
    5299             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5300             : 
    5301             :     // stage 2
    5302           0 :     stage++;
    5303           0 :     bf0 = output;
    5304           0 :     bf1 = step;
    5305           0 :     bf1[0] = bf0[0];
    5306           0 :     bf1[1] = bf0[1];
    5307           0 :     bf1[2] = bf0[2];
    5308           0 :     bf1[3] = bf0[3];
    5309           0 :     bf1[4] = bf0[4];
    5310           0 :     bf1[5] = bf0[5];
    5311           0 :     bf1[6] = bf0[6];
    5312           0 :     bf1[7] = bf0[7];
    5313           0 :     bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
    5314           0 :     bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
    5315           0 :     bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
    5316           0 :     bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
    5317           0 :     bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
    5318           0 :     bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
    5319           0 :     bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
    5320           0 :     bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
    5321             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5322             : 
    5323             :     // stage 3
    5324           0 :     stage++;
    5325           0 :     bf0 = step;
    5326           0 :     bf1 = output;
    5327           0 :     bf1[0] = bf0[0];
    5328           0 :     bf1[1] = bf0[1];
    5329           0 :     bf1[2] = bf0[2];
    5330           0 :     bf1[3] = bf0[3];
    5331           0 :     bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
    5332           0 :     bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
    5333           0 :     bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
    5334           0 :     bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
    5335           0 :     bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
    5336           0 :     bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
    5337           0 :     bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
    5338           0 :     bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
    5339           0 :     bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
    5340           0 :     bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
    5341           0 :     bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
    5342           0 :     bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
    5343             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5344             : 
    5345             :     // stage 4
    5346           0 :     stage++;
    5347           0 :     bf0 = output;
    5348           0 :     bf1 = step;
    5349           0 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    5350           0 :     bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
    5351           0 :     bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
    5352           0 :     bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
    5353           0 :     bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
    5354           0 :     bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
    5355           0 :     bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
    5356           0 :     bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
    5357           0 :     bf1[8] = bf0[8];
    5358           0 :     bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    5359           0 :     bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    5360           0 :     bf1[11] = bf0[11];
    5361           0 :     bf1[12] = bf0[12];
    5362           0 :     bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
    5363           0 :     bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
    5364           0 :     bf1[15] = bf0[15];
    5365             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5366             : 
    5367             :     // stage 5
    5368           0 :     stage++;
    5369           0 :     bf0 = step;
    5370           0 :     bf1 = output;
    5371           0 :     bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
    5372           0 :     bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
    5373           0 :     bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
    5374           0 :     bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
    5375           0 :     bf1[4] = bf0[4];
    5376           0 :     bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    5377           0 :     bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    5378           0 :     bf1[7] = bf0[7];
    5379           0 :     bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
    5380           0 :     bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
    5381           0 :     bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
    5382           0 :     bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
    5383           0 :     bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
    5384           0 :     bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
    5385           0 :     bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
    5386           0 :     bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
    5387             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5388             : 
    5389             :     // stage 6
    5390           0 :     stage++;
    5391           0 :     bf0 = output;
    5392           0 :     bf1 = step;
    5393           0 :     bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
    5394           0 :     bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
    5395           0 :     bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
    5396           0 :     bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
    5397           0 :     bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
    5398           0 :     bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
    5399           0 :     bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
    5400           0 :     bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
    5401           0 :     bf1[8] = bf0[8];
    5402           0 :     bf1[9] = bf0[9];
    5403           0 :     bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    5404           0 :     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    5405           0 :     bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    5406           0 :     bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    5407           0 :     bf1[14] = bf0[14];
    5408           0 :     bf1[15] = bf0[15];
    5409             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5410             : 
    5411             :     // stage 7
    5412           0 :     stage++;
    5413           0 :     bf0 = step;
    5414           0 :     bf1 = output;
    5415           0 :     bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
    5416           0 :     bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
    5417           0 :     bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
    5418           0 :     bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
    5419           0 :     bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
    5420           0 :     bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
    5421           0 :     bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
    5422           0 :     bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
    5423           0 :     bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
    5424           0 :     bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
    5425           0 :     bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
    5426           0 :     bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
    5427           0 :     bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
    5428           0 :     bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
    5429           0 :     bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
    5430           0 :     bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
    5431           0 : }
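/* Illustrative note, not part of this file: the stage-1 reordering above is
 * the bit-reversal permutation of the coefficient indices (for the 16-point
 * kernel, bf1[k] = input[bitrev(k, 4)]); the same pattern appears in the 4-,
 * 8- and 32-point kernels. A minimal sketch of the index mapping: */
static int32_t example_bitrev(int32_t k, int32_t nbits) {
    int32_t r = 0;
    for (int32_t i = 0; i < nbits; ++i)
        r |= ((k >> i) & 1) << (nbits - 1 - i);  /* mirror the low nbits bits */
    return r;  /* e.g. example_bitrev(1, 4) == 8, example_bitrev(3, 4) == 12 */
}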
    5432           0 : void eb_av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    5433             :     const int8_t *stage_range) {
    5434           0 :     assert(output != input);
    5435           0 :     const int32_t *cospi = cospi_arr(cos_bit);
    5436             : 
    5437           0 :     int32_t stage = 0;
    5438             :     int32_t *bf0, *bf1;
    5439             :     int32_t step[32];
    5440             : 
    5441             :     // stage 0;
    5442             : 
    5443             :     // stage 1;
    5444           0 :     stage++;
    5445           0 :     bf1 = output;
    5446           0 :     bf1[0] = input[0];
    5447           0 :     bf1[1] = input[16];
    5448           0 :     bf1[2] = input[8];
    5449           0 :     bf1[3] = input[24];
    5450           0 :     bf1[4] = input[4];
    5451           0 :     bf1[5] = input[20];
    5452           0 :     bf1[6] = input[12];
    5453           0 :     bf1[7] = input[28];
    5454           0 :     bf1[8] = input[2];
    5455           0 :     bf1[9] = input[18];
    5456           0 :     bf1[10] = input[10];
    5457           0 :     bf1[11] = input[26];
    5458           0 :     bf1[12] = input[6];
    5459           0 :     bf1[13] = input[22];
    5460           0 :     bf1[14] = input[14];
    5461           0 :     bf1[15] = input[30];
    5462           0 :     bf1[16] = input[1];
    5463           0 :     bf1[17] = input[17];
    5464           0 :     bf1[18] = input[9];
    5465           0 :     bf1[19] = input[25];
    5466           0 :     bf1[20] = input[5];
    5467           0 :     bf1[21] = input[21];
    5468           0 :     bf1[22] = input[13];
    5469           0 :     bf1[23] = input[29];
    5470           0 :     bf1[24] = input[3];
    5471           0 :     bf1[25] = input[19];
    5472           0 :     bf1[26] = input[11];
    5473           0 :     bf1[27] = input[27];
    5474           0 :     bf1[28] = input[7];
    5475           0 :     bf1[29] = input[23];
    5476           0 :     bf1[30] = input[15];
    5477           0 :     bf1[31] = input[31];
    5478             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5479             : 
    5480             :     // stage 2
    5481           0 :     stage++;
    5482           0 :     bf0 = output;
    5483           0 :     bf1 = step;
    5484           0 :     bf1[0] = bf0[0];
    5485           0 :     bf1[1] = bf0[1];
    5486           0 :     bf1[2] = bf0[2];
    5487           0 :     bf1[3] = bf0[3];
    5488           0 :     bf1[4] = bf0[4];
    5489           0 :     bf1[5] = bf0[5];
    5490           0 :     bf1[6] = bf0[6];
    5491           0 :     bf1[7] = bf0[7];
    5492           0 :     bf1[8] = bf0[8];
    5493           0 :     bf1[9] = bf0[9];
    5494           0 :     bf1[10] = bf0[10];
    5495           0 :     bf1[11] = bf0[11];
    5496           0 :     bf1[12] = bf0[12];
    5497           0 :     bf1[13] = bf0[13];
    5498           0 :     bf1[14] = bf0[14];
    5499           0 :     bf1[15] = bf0[15];
    5500           0 :     bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
    5501           0 :     bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
    5502           0 :     bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
    5503           0 :     bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
    5504           0 :     bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
    5505           0 :     bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
    5506           0 :     bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
    5507           0 :     bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
    5508           0 :     bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
    5509           0 :     bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
    5510           0 :     bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
    5511           0 :     bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
    5512           0 :     bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
    5513           0 :     bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
    5514           0 :     bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
    5515           0 :     bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
    5516             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5517             : 
    5518             :     // stage 3
    5519           0 :     stage++;
    5520           0 :     bf0 = step;
    5521           0 :     bf1 = output;
    5522           0 :     bf1[0] = bf0[0];
    5523           0 :     bf1[1] = bf0[1];
    5524           0 :     bf1[2] = bf0[2];
    5525           0 :     bf1[3] = bf0[3];
    5526           0 :     bf1[4] = bf0[4];
    5527           0 :     bf1[5] = bf0[5];
    5528           0 :     bf1[6] = bf0[6];
    5529           0 :     bf1[7] = bf0[7];
    5530           0 :     bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
    5531           0 :     bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
    5532           0 :     bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
    5533           0 :     bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
    5534           0 :     bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
    5535           0 :     bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
    5536           0 :     bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
    5537           0 :     bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
    5538           0 :     bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
    5539           0 :     bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
    5540           0 :     bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
    5541           0 :     bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
    5542           0 :     bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
    5543           0 :     bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
    5544           0 :     bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
    5545           0 :     bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
    5546           0 :     bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
    5547           0 :     bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
    5548           0 :     bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
    5549           0 :     bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
    5550           0 :     bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
    5551           0 :     bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
    5552           0 :     bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
    5553           0 :     bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
    5554             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5555             : 
    5556             :     // stage 4
    5557           0 :     stage++;
    5558           0 :     bf0 = output;
    5559           0 :     bf1 = step;
    5560           0 :     bf1[0] = bf0[0];
    5561           0 :     bf1[1] = bf0[1];
    5562           0 :     bf1[2] = bf0[2];
    5563           0 :     bf1[3] = bf0[3];
    5564           0 :     bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
    5565           0 :     bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
    5566           0 :     bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
    5567           0 :     bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
    5568           0 :     bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
    5569           0 :     bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
    5570           0 :     bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
    5571           0 :     bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
    5572           0 :     bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
    5573           0 :     bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
    5574           0 :     bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
    5575           0 :     bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
    5576           0 :     bf1[16] = bf0[16];
    5577           0 :     bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    5578           0 :     bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    5579           0 :     bf1[19] = bf0[19];
    5580           0 :     bf1[20] = bf0[20];
    5581           0 :     bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    5582           0 :     bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    5583           0 :     bf1[23] = bf0[23];
    5584           0 :     bf1[24] = bf0[24];
    5585           0 :     bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
    5586           0 :     bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
    5587           0 :     bf1[27] = bf0[27];
    5588           0 :     bf1[28] = bf0[28];
    5589           0 :     bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
    5590           0 :     bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
    5591           0 :     bf1[31] = bf0[31];
    5592             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5593             : 
    5594             :     // stage 5
    5595           0 :     stage++;
    5596           0 :     bf0 = step;
    5597           0 :     bf1 = output;
    5598           0 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    5599           0 :     bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
    5600           0 :     bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
    5601           0 :     bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
    5602           0 :     bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
    5603           0 :     bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
    5604           0 :     bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
    5605           0 :     bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
    5606           0 :     bf1[8] = bf0[8];
    5607           0 :     bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    5608           0 :     bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    5609           0 :     bf1[11] = bf0[11];
    5610           0 :     bf1[12] = bf0[12];
    5611           0 :     bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
    5612           0 :     bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
    5613           0 :     bf1[15] = bf0[15];
    5614           0 :     bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
    5615           0 :     bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
    5616           0 :     bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
    5617           0 :     bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
    5618           0 :     bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
    5619           0 :     bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
    5620           0 :     bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
    5621           0 :     bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
    5622           0 :     bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
    5623           0 :     bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
    5624           0 :     bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
    5625           0 :     bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
    5626           0 :     bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
    5627           0 :     bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
    5628           0 :     bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
    5629           0 :     bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
    5630             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5631             : 
    5632             :     // stage 6
    5633           0 :     stage++;
    5634           0 :     bf0 = output;
    5635           0 :     bf1 = step;
    5636           0 :     bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
    5637           0 :     bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
    5638           0 :     bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
    5639           0 :     bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
    5640           0 :     bf1[4] = bf0[4];
    5641           0 :     bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    5642           0 :     bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    5643           0 :     bf1[7] = bf0[7];
    5644           0 :     bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
    5645           0 :     bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
    5646           0 :     bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
    5647           0 :     bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
    5648           0 :     bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
    5649           0 :     bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
    5650           0 :     bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
    5651           0 :     bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
    5652           0 :     bf1[16] = bf0[16];
    5653           0 :     bf1[17] = bf0[17];
    5654           0 :     bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    5655           0 :     bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    5656           0 :     bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    5657           0 :     bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    5658           0 :     bf1[22] = bf0[22];
    5659           0 :     bf1[23] = bf0[23];
    5660           0 :     bf1[24] = bf0[24];
    5661           0 :     bf1[25] = bf0[25];
    5662           0 :     bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
    5663           0 :     bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
    5664           0 :     bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
    5665           0 :     bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
    5666           0 :     bf1[30] = bf0[30];
    5667           0 :     bf1[31] = bf0[31];
    5668             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5669             : 
    5670             :     // stage 7
    5671           0 :     stage++;
    5672           0 :     bf0 = step;
    5673           0 :     bf1 = output;
    5674           0 :     bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
    5675           0 :     bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
    5676           0 :     bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
    5677           0 :     bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
    5678           0 :     bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
    5679           0 :     bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
    5680           0 :     bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
    5681           0 :     bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
    5682           0 :     bf1[8] = bf0[8];
    5683           0 :     bf1[9] = bf0[9];
    5684           0 :     bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    5685           0 :     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    5686           0 :     bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    5687           0 :     bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    5688           0 :     bf1[14] = bf0[14];
    5689           0 :     bf1[15] = bf0[15];
    5690           0 :     bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
    5691           0 :     bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
    5692           0 :     bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
    5693           0 :     bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
    5694           0 :     bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
    5695           0 :     bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
    5696           0 :     bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
    5697           0 :     bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
    5698           0 :     bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
    5699           0 :     bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
    5700           0 :     bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
    5701           0 :     bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
    5702           0 :     bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
    5703           0 :     bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
    5704           0 :     bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
    5705           0 :     bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
    5706             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5707             : 
    5708             :     // stage 8
    5709           0 :     stage++;
    5710           0 :     bf0 = output;
    5711           0 :     bf1 = step;
    5712           0 :     bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
    5713           0 :     bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
    5714           0 :     bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
    5715           0 :     bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
    5716           0 :     bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
    5717           0 :     bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
    5718           0 :     bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
    5719           0 :     bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
    5720           0 :     bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
    5721           0 :     bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
    5722           0 :     bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
    5723           0 :     bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
    5724           0 :     bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
    5725           0 :     bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
    5726           0 :     bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
    5727           0 :     bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
    5728           0 :     bf1[16] = bf0[16];
    5729           0 :     bf1[17] = bf0[17];
    5730           0 :     bf1[18] = bf0[18];
    5731           0 :     bf1[19] = bf0[19];
    5732           0 :     bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    5733           0 :     bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    5734           0 :     bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    5735           0 :     bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    5736           0 :     bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    5737           0 :     bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    5738           0 :     bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    5739           0 :     bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    5740           0 :     bf1[28] = bf0[28];
    5741           0 :     bf1[29] = bf0[29];
    5742           0 :     bf1[30] = bf0[30];
    5743           0 :     bf1[31] = bf0[31];
    5744             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5745             : 
    5746             :     // stage 9
    5747           0 :     stage++;
    5748           0 :     bf0 = step;
    5749           0 :     bf1 = output;
    5750           0 :     bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
    5751           0 :     bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
    5752           0 :     bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
    5753           0 :     bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
    5754           0 :     bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
    5755           0 :     bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
    5756           0 :     bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
    5757           0 :     bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
    5758           0 :     bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
    5759           0 :     bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
    5760           0 :     bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
    5761           0 :     bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
    5762           0 :     bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
    5763           0 :     bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
    5764           0 :     bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
    5765           0 :     bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
    5766           0 :     bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
    5767           0 :     bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
    5768           0 :     bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
    5769           0 :     bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
    5770           0 :     bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
    5771           0 :     bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
    5772           0 :     bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
    5773           0 :     bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
    5774           0 :     bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
    5775           0 :     bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
    5776           0 :     bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
    5777           0 :     bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
    5778           0 :     bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
    5779           0 :     bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
    5780           0 :     bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
    5781           0 :     bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
    5782           0 : }
    5783           0 : void eb_av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    5784             :     const int8_t *stage_range) {
    5785             :     (void)stage_range;
    5786           0 :     int32_t bit = cos_bit;
    5787           0 :     const int32_t *sinpi = sinpi_arr(bit);
    5788             :     int32_t s0, s1, s2, s3, s4, s5, s6, s7;
    5789             : 
    5790           0 :     int32_t x0 = input[0];
    5791           0 :     int32_t x1 = input[1];
    5792           0 :     int32_t x2 = input[2];
    5793           0 :     int32_t x3 = input[3];
    5794             : 
    5795           0 :     if (!(x0 | x1 | x2 | x3)) {
    5796           0 :         output[0] = output[1] = output[2] = output[3] = 0;
    5797           0 :         return;
    5798             :     }
    5799             : 
    5800           0 :     assert(sinpi[1] + sinpi[2] == sinpi[4]);
    5801             : 
    5802             :     // stage 1
    5803             :     //s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit);
    5804             :     //s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit);
    5805             :     //s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit);
    5806             :     //s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit);
    5807             :     //s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit);
    5808             :     //s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit);
    5809             :     //s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit);
    5810             : 
    5811           0 :     s0 = sinpi[1] * x0;
    5812           0 :     s1 = sinpi[2] * x0;
    5813           0 :     s2 = sinpi[3] * x1;
    5814           0 :     s3 = sinpi[4] * x2;
    5815           0 :     s4 = sinpi[1] * x2;
    5816           0 :     s5 = sinpi[2] * x3;
    5817           0 :     s6 = sinpi[4] * x3;
    5818             : 
    5819             :     // stage 2
    5820             :     // NOTICE: (x0 - x2) here may use one extra bit compared to the
    5821             :     // opt_range_row/col specified in eb_av1_gen_inv_stage_range()
    5822             :     //s7 = range_check_value((x0 - x2) + x3, stage_range[2]);
    5823             : 
    5824             :     //// stage 3
    5825             :     //s0 = range_check_value(s0 + s3, stage_range[3] + bit);
    5826             :     //s1 = range_check_value(s1 - s4, stage_range[3] + bit);
    5827             :     //s3 = range_check_value(s2, stage_range[3] + bit);
    5828             :     //s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit);
    5829             : 
    5830             :     //// stage 4
    5831             :     //s0 = range_check_value(s0 + s5, stage_range[4] + bit);
    5832             :     //s1 = range_check_value(s1 - s6, stage_range[4] + bit);
    5833             : 
    5834             :     //// stage 5
    5835             :     //x0 = range_check_value(s0 + s3, stage_range[5] + bit);
    5836             :     //x1 = range_check_value(s1 + s3, stage_range[5] + bit);
    5837             :     //x2 = range_check_value(s2, stage_range[5] + bit);
    5838             :     //x3 = range_check_value(s0 + s1, stage_range[5] + bit);
    5839             : 
    5840             :     //// stage 6
    5841             :     //x3 = range_check_value(x3 - s3, stage_range[6] + bit);
    5842             : 
    5843           0 :     s7 = (x0 - x2) + x3;
    5844             : 
    5845             :     // stage 3
    5846           0 :     s0 = s0 + s3;
    5847           0 :     s1 = s1 - s4;
    5848           0 :     s3 = s2;
    5849           0 :     s2 = sinpi[3] * s7;
    5850             : 
    5851             :     // stage 4
    5852           0 :     s0 = s0 + s5;
    5853           0 :     s1 = s1 - s6;
    5854             : 
    5855             :     // stage 5
    5856           0 :     x0 = s0 + s3;
    5857           0 :     x1 = s1 + s3;
    5858           0 :     x2 = s2;
    5859           0 :     x3 = s0 + s1;
    5860             : 
    5861             :     // stage 6
    5862           0 :     x3 = x3 - s3;
    5863             : 
    5864           0 :     output[0] = round_shift(x0, bit);
    5865           0 :     output[1] = round_shift(x1, bit);
    5866           0 :     output[2] = round_shift(x2, bit);
    5867           0 :     output[3] = round_shift(x3, bit);
    5868             :     //range_check_buf(6, input, output, 4, stage_range[6]);
    5869             : }
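
Editor's note: eb_av1_iadst4_new above keeps its sinpi-weighted products at full precision and only rounds back down at the very end via round_shift. The sketch below is a standalone round-to-nearest right shift written from the usual (x + 2^(bit-1)) >> bit definition; the project's round_shift may differ in signature, and the weight 2896 is used purely as an illustration (roughly cos(pi/4) scaled by 2^12).

    #include <stdint.h>
    #include <stdio.h>

    /* Round-to-nearest arithmetic right shift: (x + 2^(bit-1)) >> bit. */
    static int32_t round_shift_example(int64_t value, int bit) {
        return (int32_t)((value + (1LL << (bit - 1))) >> bit);
    }

    int main(void) {
        /* 1000 * 2896 / 2^12 is about 707.03, so the rounded result is 707. */
        printf("%d\n", round_shift_example(1000LL * 2896, 12));
        return 0;
    }
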
    5870           0 : static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) {
    5871           0 :     for (int32_t i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit);
    5872           0 : }
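
Editor's note: clamp_buf simply applies the per-stage clamp across a whole intermediate buffer, with stage_range[stage] supplying that stage's bit budget; av1_iadst32_new below uses it after every stage instead of clamping each butterfly output individually. A minimal sketch follows, with a local clamp standing in for clamp_value and a hypothetical 17-bit budget chosen only for illustration.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative clamp to the signed range of 'bit' bits (stand-in for clamp_value). */
    static int32_t clamp_to_bits(int64_t v, int8_t bit) {
        const int64_t hi = (1LL << (bit - 1)) - 1;
        const int64_t lo = -(1LL << (bit - 1));
        return (int32_t)(v > hi ? hi : (v < lo ? lo : v));
    }

    int main(void) {
        int32_t buf[4] = {70000, -70000, 123, -123};
        const int8_t stage_bits = 17;   /* hypothetical per-stage bit budget */
        for (int i = 0; i < 4; ++i) buf[i] = clamp_to_bits(buf[i], stage_bits);
        /* The 17-bit signed range is [-65536, 65535], so the first two entries saturate. */
        for (int i = 0; i < 4; ++i) printf("%d ", buf[i]);
        printf("\n");
        return 0;
    }
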
    5873           0 : void eb_av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    5874             :     const int8_t *stage_range) {
    5875           0 :     assert(output != input);
    5876           0 :     const int32_t *cospi = cospi_arr(cos_bit);
    5877             : 
    5878           0 :     int32_t stage = 0;
    5879             :     int32_t *bf0, *bf1;
    5880             :     int32_t step[8];
    5881             : 
    5882             :     // stage 0;
    5883             : 
    5884             :     // stage 1;
    5885           0 :     stage++;
    5886           0 :     bf1 = output;
    5887           0 :     bf1[0] = input[7];
    5888           0 :     bf1[1] = input[0];
    5889           0 :     bf1[2] = input[5];
    5890           0 :     bf1[3] = input[2];
    5891           0 :     bf1[4] = input[3];
    5892           0 :     bf1[5] = input[4];
    5893           0 :     bf1[6] = input[1];
    5894           0 :     bf1[7] = input[6];
    5895             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5896             : 
    5897             :     // stage 2
    5898           0 :     stage++;
    5899           0 :     bf0 = output;
    5900           0 :     bf1 = step;
    5901           0 :     bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
    5902           0 :     bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
    5903           0 :     bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
    5904           0 :     bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
    5905           0 :     bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
    5906           0 :     bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
    5907           0 :     bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
    5908           0 :     bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
    5909             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5910             : 
    5911             :     // stage 3
    5912           0 :     stage++;
    5913           0 :     bf0 = step;
    5914           0 :     bf1 = output;
    5915           0 :     bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
    5916           0 :     bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
    5917           0 :     bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
    5918           0 :     bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
    5919           0 :     bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
    5920           0 :     bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
    5921           0 :     bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
    5922           0 :     bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
    5923             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5924             : 
    5925             :     // stage 4
    5926           0 :     stage++;
    5927           0 :     bf0 = output;
    5928           0 :     bf1 = step;
    5929           0 :     bf1[0] = bf0[0];
    5930           0 :     bf1[1] = bf0[1];
    5931           0 :     bf1[2] = bf0[2];
    5932           0 :     bf1[3] = bf0[3];
    5933           0 :     bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    5934           0 :     bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    5935           0 :     bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    5936           0 :     bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    5937             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5938             : 
    5939             :     // stage 5
    5940           0 :     stage++;
    5941           0 :     bf0 = step;
    5942           0 :     bf1 = output;
    5943           0 :     bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
    5944           0 :     bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
    5945           0 :     bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
    5946           0 :     bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
    5947           0 :     bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
    5948           0 :     bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
    5949           0 :     bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
    5950           0 :     bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
    5951             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5952             : 
    5953             :     // stage 6
    5954           0 :     stage++;
    5955           0 :     bf0 = output;
    5956           0 :     bf1 = step;
    5957           0 :     bf1[0] = bf0[0];
    5958           0 :     bf1[1] = bf0[1];
    5959           0 :     bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    5960           0 :     bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    5961           0 :     bf1[4] = bf0[4];
    5962           0 :     bf1[5] = bf0[5];
    5963           0 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    5964           0 :     bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    5965             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    5966             : 
    5967             :     // stage 7
    5968           0 :     stage++;
    5969           0 :     bf0 = step;
    5970           0 :     bf1 = output;
    5971           0 :     bf1[0] = bf0[0];
    5972           0 :     bf1[1] = -bf0[4];
    5973           0 :     bf1[2] = bf0[6];
    5974           0 :     bf1[3] = -bf0[2];
    5975           0 :     bf1[4] = bf0[3];
    5976           0 :     bf1[5] = -bf0[7];
    5977           0 :     bf1[6] = bf0[5];
    5978           0 :     bf1[7] = -bf0[1];
    5979           0 : }
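
Editor's note: every rotation stage above is built from half_btf, which forms a weighted sum of two lanes with cospi weights and rounds the result back to cos_bit precision. The sketch below re-implements that weighted-sum-and-round shape locally (the project's half_btf may differ in overflow handling); the weight 2896 is roughly cos(pi/4) * 2^12 and is used only as an example.

    #include <stdint.h>
    #include <stdio.h>

    /* Round-to-nearest right shift by 'bit'. */
    static int32_t round_shift_example(int64_t value, int bit) {
        return (int32_t)((value + (1LL << (bit - 1))) >> bit);
    }

    /* Weighted butterfly: round_shift(w0*in0 + w1*in1, bit), the shape used by half_btf. */
    static int32_t half_btf_example(int32_t w0, int32_t in0, int32_t w1, int32_t in1, int bit) {
        const int64_t acc = (int64_t)w0 * in0 + (int64_t)w1 * in1;
        return round_shift_example(acc, bit);
    }

    int main(void) {
        /* Rotate the pair (100, 50) by ~45 degrees at 12-bit weight precision. */
        int32_t a = half_btf_example(2896, 100, 2896, 50, 12);
        int32_t b = half_btf_example(2896, 100, -2896, 50, 12);
        printf("%d %d\n", a, b);
        return 0;
    }
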
    5980           0 : void eb_av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    5981             :     const int8_t *stage_range) {
    5982           0 :     assert(output != input);
    5983           0 :     const int32_t *cospi = cospi_arr(cos_bit);
    5984             : 
    5985           0 :     int32_t stage = 0;
    5986             :     int32_t *bf0, *bf1;
    5987             :     int32_t step[16];
    5988             : 
    5989             :     // stage 0;
    5990             : 
    5991             :     // stage 1;
    5992           0 :     stage++;
    5993           0 :     bf1 = output;
    5994           0 :     bf1[0] = input[15];
    5995           0 :     bf1[1] = input[0];
    5996           0 :     bf1[2] = input[13];
    5997           0 :     bf1[3] = input[2];
    5998           0 :     bf1[4] = input[11];
    5999           0 :     bf1[5] = input[4];
    6000           0 :     bf1[6] = input[9];
    6001           0 :     bf1[7] = input[6];
    6002           0 :     bf1[8] = input[7];
    6003           0 :     bf1[9] = input[8];
    6004           0 :     bf1[10] = input[5];
    6005           0 :     bf1[11] = input[10];
    6006           0 :     bf1[12] = input[3];
    6007           0 :     bf1[13] = input[12];
    6008           0 :     bf1[14] = input[1];
    6009           0 :     bf1[15] = input[14];
    6010             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6011             : 
    6012             :     // stage 2
    6013           0 :     stage++;
    6014           0 :     bf0 = output;
    6015           0 :     bf1 = step;
    6016           0 :     bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
    6017           0 :     bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
    6018           0 :     bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
    6019           0 :     bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
    6020           0 :     bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
    6021           0 :     bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
    6022           0 :     bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
    6023           0 :     bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
    6024           0 :     bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
    6025           0 :     bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
    6026           0 :     bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
    6027           0 :     bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
    6028           0 :     bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
    6029           0 :     bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
    6030           0 :     bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
    6031           0 :     bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
    6032             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6033             : 
    6034             :     // stage 3
    6035           0 :     stage++;
    6036           0 :     bf0 = step;
    6037           0 :     bf1 = output;
    6038           0 :     bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]);
    6039           0 :     bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]);
    6040           0 :     bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]);
    6041           0 :     bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]);
    6042           0 :     bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]);
    6043           0 :     bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]);
    6044           0 :     bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]);
    6045           0 :     bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]);
    6046           0 :     bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]);
    6047           0 :     bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]);
    6048           0 :     bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]);
    6049           0 :     bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]);
    6050           0 :     bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]);
    6051           0 :     bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
    6052           0 :     bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
    6053           0 :     bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
    6054             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6055             : 
    6056             :     // stage 4
    6057           0 :     stage++;
    6058           0 :     bf0 = output;
    6059           0 :     bf1 = step;
    6060           0 :     bf1[0] = bf0[0];
    6061           0 :     bf1[1] = bf0[1];
    6062           0 :     bf1[2] = bf0[2];
    6063           0 :     bf1[3] = bf0[3];
    6064           0 :     bf1[4] = bf0[4];
    6065           0 :     bf1[5] = bf0[5];
    6066           0 :     bf1[6] = bf0[6];
    6067           0 :     bf1[7] = bf0[7];
    6068           0 :     bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
    6069           0 :     bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
    6070           0 :     bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
    6071           0 :     bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
    6072           0 :     bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
    6073           0 :     bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
    6074           0 :     bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
    6075           0 :     bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
    6076             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6077             : 
    6078             :     // stage 5
    6079           0 :     stage++;
    6080           0 :     bf0 = step;
    6081           0 :     bf1 = output;
    6082           0 :     bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
    6083           0 :     bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
    6084           0 :     bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
    6085           0 :     bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
    6086           0 :     bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
    6087           0 :     bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
    6088           0 :     bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
    6089           0 :     bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
    6090           0 :     bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]);
    6091           0 :     bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]);
    6092           0 :     bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]);
    6093           0 :     bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]);
    6094           0 :     bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]);
    6095           0 :     bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
    6096           0 :     bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
    6097           0 :     bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
    6098             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6099             : 
    6100             :     // stage 6
    6101           0 :     stage++;
    6102           0 :     bf0 = output;
    6103           0 :     bf1 = step;
    6104           0 :     bf1[0] = bf0[0];
    6105           0 :     bf1[1] = bf0[1];
    6106           0 :     bf1[2] = bf0[2];
    6107           0 :     bf1[3] = bf0[3];
    6108           0 :     bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    6109           0 :     bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    6110           0 :     bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    6111           0 :     bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    6112           0 :     bf1[8] = bf0[8];
    6113           0 :     bf1[9] = bf0[9];
    6114           0 :     bf1[10] = bf0[10];
    6115           0 :     bf1[11] = bf0[11];
    6116           0 :     bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
    6117           0 :     bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
    6118           0 :     bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
    6119           0 :     bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
    6120             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6121             : 
    6122             :     // stage 7
    6123           0 :     stage++;
    6124           0 :     bf0 = step;
    6125           0 :     bf1 = output;
    6126           0 :     bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
    6127           0 :     bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
    6128           0 :     bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
    6129           0 :     bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
    6130           0 :     bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
    6131           0 :     bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
    6132           0 :     bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
    6133           0 :     bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
    6134           0 :     bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]);
    6135           0 :     bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]);
    6136           0 :     bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]);
    6137           0 :     bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]);
    6138           0 :     bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]);
    6139           0 :     bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
    6140           0 :     bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
    6141           0 :     bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
    6142             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6143             : 
    6144             :     // stage 8
    6145           0 :     stage++;
    6146           0 :     bf0 = output;
    6147           0 :     bf1 = step;
    6148           0 :     bf1[0] = bf0[0];
    6149           0 :     bf1[1] = bf0[1];
    6150           0 :     bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    6151           0 :     bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    6152           0 :     bf1[4] = bf0[4];
    6153           0 :     bf1[5] = bf0[5];
    6154           0 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    6155           0 :     bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    6156           0 :     bf1[8] = bf0[8];
    6157           0 :     bf1[9] = bf0[9];
    6158           0 :     bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
    6159           0 :     bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
    6160           0 :     bf1[12] = bf0[12];
    6161           0 :     bf1[13] = bf0[13];
    6162           0 :     bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
    6163           0 :     bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
    6164             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6165             : 
    6166             :     // stage 9
    6167           0 :     stage++;
    6168           0 :     bf0 = step;
    6169           0 :     bf1 = output;
    6170           0 :     bf1[0] = bf0[0];
    6171           0 :     bf1[1] = -bf0[8];
    6172           0 :     bf1[2] = bf0[12];
    6173           0 :     bf1[3] = -bf0[4];
    6174           0 :     bf1[4] = bf0[6];
    6175           0 :     bf1[5] = -bf0[14];
    6176           0 :     bf1[6] = bf0[10];
    6177           0 :     bf1[7] = -bf0[2];
    6178           0 :     bf1[8] = bf0[3];
    6179           0 :     bf1[9] = -bf0[11];
    6180           0 :     bf1[10] = bf0[15];
    6181           0 :     bf1[11] = -bf0[7];
    6182           0 :     bf1[12] = bf0[5];
    6183           0 :     bf1[13] = -bf0[13];
    6184           0 :     bf1[14] = bf0[9];
    6185           0 :     bf1[15] = -bf0[1];
    6186           0 : }
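
Editor's note: the stage-9 output shuffle of eb_av1_iadst16_new just above involves no arithmetic at all; it is a fixed permutation in which every odd output lane is negated. Written table-driven, with the source-lane table read directly off the assignments above, it is simply:

    #include <stdint.h>

    /* Table-driven form of the iadst16 stage-9 output shuffle above:
     * src[i] is the bf0 lane feeding output i, and odd outputs are negated. */
    void iadst16_output_shuffle(const int32_t *bf0, int32_t *bf1) {
        static const uint8_t src[16] = {0, 8, 12, 4, 6, 14, 10, 2,
                                        3, 11, 15, 7, 5, 13, 9, 1};
        for (int i = 0; i < 16; ++i)
            bf1[i] = (i & 1) ? -bf0[src[i]] : bf0[src[i]];
    }
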
    6187           0 : void av1_iadst32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    6188             :     const int8_t *stage_range) {
    6189           0 :     const int32_t size = 32;
    6190             :     const int32_t *cospi;
    6191             : 
    6192           0 :     int32_t stage = 0;
    6193             :     int32_t *bf0, *bf1;
    6194             :     int32_t step[32];
    6195             : 
    6196             :     // stage 0;
    6197           0 :     clamp_buf((int32_t *)input, size, stage_range[stage]); // casts away const: the input buffer is clamped in place at stage 0
    6198             : 
    6199             :     // stage 1;
    6200           0 :     stage++;
    6201           0 :     assert(output != input);
    6202           0 :     bf1 = output;
    6203           0 :     bf1[0] = input[0];
    6204           0 :     bf1[1] = -input[31];
    6205           0 :     bf1[2] = -input[15];
    6206           0 :     bf1[3] = input[16];
    6207           0 :     bf1[4] = -input[7];
    6208           0 :     bf1[5] = input[24];
    6209           0 :     bf1[6] = input[8];
    6210           0 :     bf1[7] = -input[23];
    6211           0 :     bf1[8] = -input[3];
    6212           0 :     bf1[9] = input[28];
    6213           0 :     bf1[10] = input[12];
    6214           0 :     bf1[11] = -input[19];
    6215           0 :     bf1[12] = input[4];
    6216           0 :     bf1[13] = -input[27];
    6217           0 :     bf1[14] = -input[11];
    6218           0 :     bf1[15] = input[20];
    6219           0 :     bf1[16] = -input[1];
    6220           0 :     bf1[17] = input[30];
    6221           0 :     bf1[18] = input[14];
    6222           0 :     bf1[19] = -input[17];
    6223           0 :     bf1[20] = input[6];
    6224           0 :     bf1[21] = -input[25];
    6225           0 :     bf1[22] = -input[9];
    6226           0 :     bf1[23] = input[22];
    6227           0 :     bf1[24] = input[2];
    6228           0 :     bf1[25] = -input[29];
    6229           0 :     bf1[26] = -input[13];
    6230           0 :     bf1[27] = input[18];
    6231           0 :     bf1[28] = -input[5];
    6232           0 :     bf1[29] = input[26];
    6233           0 :     bf1[30] = input[10];
    6234           0 :     bf1[31] = -input[21];
    6235           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6236             : 
    6237             :     // stage 2
    6238           0 :     stage++;
    6239           0 :     cospi = cospi_arr(cos_bit);
    6240           0 :     bf0 = output;
    6241           0 :     bf1 = step;
    6242           0 :     bf1[0] = bf0[0];
    6243           0 :     bf1[1] = bf0[1];
    6244           0 :     bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    6245           0 :     bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    6246           0 :     bf1[4] = bf0[4];
    6247           0 :     bf1[5] = bf0[5];
    6248           0 :     bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    6249           0 :     bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    6250           0 :     bf1[8] = bf0[8];
    6251           0 :     bf1[9] = bf0[9];
    6252           0 :     bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
    6253           0 :     bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
    6254           0 :     bf1[12] = bf0[12];
    6255           0 :     bf1[13] = bf0[13];
    6256           0 :     bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
    6257           0 :     bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
    6258           0 :     bf1[16] = bf0[16];
    6259           0 :     bf1[17] = bf0[17];
    6260           0 :     bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit);
    6261           0 :     bf1[19] = half_btf(cospi[32], bf0[18], -cospi[32], bf0[19], cos_bit);
    6262           0 :     bf1[20] = bf0[20];
    6263           0 :     bf1[21] = bf0[21];
    6264           0 :     bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit);
    6265           0 :     bf1[23] = half_btf(cospi[32], bf0[22], -cospi[32], bf0[23], cos_bit);
    6266           0 :     bf1[24] = bf0[24];
    6267           0 :     bf1[25] = bf0[25];
    6268           0 :     bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit);
    6269           0 :     bf1[27] = half_btf(cospi[32], bf0[26], -cospi[32], bf0[27], cos_bit);
    6270           0 :     bf1[28] = bf0[28];
    6271           0 :     bf1[29] = bf0[29];
    6272           0 :     bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit);
    6273           0 :     bf1[31] = half_btf(cospi[32], bf0[30], -cospi[32], bf0[31], cos_bit);
    6274           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6275             : 
    6276             :     // stage 3
    6277           0 :     stage++;
    6278           0 :     bf0 = step;
    6279           0 :     bf1 = output;
    6280           0 :     bf1[0] = bf0[0] + bf0[2];
    6281           0 :     bf1[1] = bf0[1] + bf0[3];
    6282           0 :     bf1[2] = bf0[0] - bf0[2];
    6283           0 :     bf1[3] = bf0[1] - bf0[3];
    6284           0 :     bf1[4] = bf0[4] + bf0[6];
    6285           0 :     bf1[5] = bf0[5] + bf0[7];
    6286           0 :     bf1[6] = bf0[4] - bf0[6];
    6287           0 :     bf1[7] = bf0[5] - bf0[7];
    6288           0 :     bf1[8] = bf0[8] + bf0[10];
    6289           0 :     bf1[9] = bf0[9] + bf0[11];
    6290           0 :     bf1[10] = bf0[8] - bf0[10];
    6291           0 :     bf1[11] = bf0[9] - bf0[11];
    6292           0 :     bf1[12] = bf0[12] + bf0[14];
    6293           0 :     bf1[13] = bf0[13] + bf0[15];
    6294           0 :     bf1[14] = bf0[12] - bf0[14];
    6295           0 :     bf1[15] = bf0[13] - bf0[15];
    6296           0 :     bf1[16] = bf0[16] + bf0[18];
    6297           0 :     bf1[17] = bf0[17] + bf0[19];
    6298           0 :     bf1[18] = bf0[16] - bf0[18];
    6299           0 :     bf1[19] = bf0[17] - bf0[19];
    6300           0 :     bf1[20] = bf0[20] + bf0[22];
    6301           0 :     bf1[21] = bf0[21] + bf0[23];
    6302           0 :     bf1[22] = bf0[20] - bf0[22];
    6303           0 :     bf1[23] = bf0[21] - bf0[23];
    6304           0 :     bf1[24] = bf0[24] + bf0[26];
    6305           0 :     bf1[25] = bf0[25] + bf0[27];
    6306           0 :     bf1[26] = bf0[24] - bf0[26];
    6307           0 :     bf1[27] = bf0[25] - bf0[27];
    6308           0 :     bf1[28] = bf0[28] + bf0[30];
    6309           0 :     bf1[29] = bf0[29] + bf0[31];
    6310           0 :     bf1[30] = bf0[28] - bf0[30];
    6311           0 :     bf1[31] = bf0[29] - bf0[31];
    6312           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6313             : 
    6314             :     // stage 4
    6315           0 :     stage++;
    6316           0 :     cospi = cospi_arr(cos_bit);
    6317           0 :     bf0 = output;
    6318           0 :     bf1 = step;
    6319           0 :     bf1[0] = bf0[0];
    6320           0 :     bf1[1] = bf0[1];
    6321           0 :     bf1[2] = bf0[2];
    6322           0 :     bf1[3] = bf0[3];
    6323           0 :     bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    6324           0 :     bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    6325           0 :     bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    6326           0 :     bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    6327           0 :     bf1[8] = bf0[8];
    6328           0 :     bf1[9] = bf0[9];
    6329           0 :     bf1[10] = bf0[10];
    6330           0 :     bf1[11] = bf0[11];
    6331           0 :     bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
    6332           0 :     bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
    6333           0 :     bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
    6334           0 :     bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
    6335           0 :     bf1[16] = bf0[16];
    6336           0 :     bf1[17] = bf0[17];
    6337           0 :     bf1[18] = bf0[18];
    6338           0 :     bf1[19] = bf0[19];
    6339           0 :     bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit);
    6340           0 :     bf1[21] = half_btf(cospi[48], bf0[20], -cospi[16], bf0[21], cos_bit);
    6341           0 :     bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit);
    6342           0 :     bf1[23] = half_btf(cospi[16], bf0[22], cospi[48], bf0[23], cos_bit);
    6343           0 :     bf1[24] = bf0[24];
    6344           0 :     bf1[25] = bf0[25];
    6345           0 :     bf1[26] = bf0[26];
    6346           0 :     bf1[27] = bf0[27];
    6347           0 :     bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit);
    6348           0 :     bf1[29] = half_btf(cospi[48], bf0[28], -cospi[16], bf0[29], cos_bit);
    6349           0 :     bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit);
    6350           0 :     bf1[31] = half_btf(cospi[16], bf0[30], cospi[48], bf0[31], cos_bit);
    6351           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6352             : 
    6353             :     // stage 5
    6354           0 :     stage++;
    6355           0 :     bf0 = step;
    6356           0 :     bf1 = output;
    6357           0 :     bf1[0] = bf0[0] + bf0[4];
    6358           0 :     bf1[1] = bf0[1] + bf0[5];
    6359           0 :     bf1[2] = bf0[2] + bf0[6];
    6360           0 :     bf1[3] = bf0[3] + bf0[7];
    6361           0 :     bf1[4] = bf0[0] - bf0[4];
    6362           0 :     bf1[5] = bf0[1] - bf0[5];
    6363           0 :     bf1[6] = bf0[2] - bf0[6];
    6364           0 :     bf1[7] = bf0[3] - bf0[7];
    6365           0 :     bf1[8] = bf0[8] + bf0[12];
    6366           0 :     bf1[9] = bf0[9] + bf0[13];
    6367           0 :     bf1[10] = bf0[10] + bf0[14];
    6368           0 :     bf1[11] = bf0[11] + bf0[15];
    6369           0 :     bf1[12] = bf0[8] - bf0[12];
    6370           0 :     bf1[13] = bf0[9] - bf0[13];
    6371           0 :     bf1[14] = bf0[10] - bf0[14];
    6372           0 :     bf1[15] = bf0[11] - bf0[15];
    6373           0 :     bf1[16] = bf0[16] + bf0[20];
    6374           0 :     bf1[17] = bf0[17] + bf0[21];
    6375           0 :     bf1[18] = bf0[18] + bf0[22];
    6376           0 :     bf1[19] = bf0[19] + bf0[23];
    6377           0 :     bf1[20] = bf0[16] - bf0[20];
    6378           0 :     bf1[21] = bf0[17] - bf0[21];
    6379           0 :     bf1[22] = bf0[18] - bf0[22];
    6380           0 :     bf1[23] = bf0[19] - bf0[23];
    6381           0 :     bf1[24] = bf0[24] + bf0[28];
    6382           0 :     bf1[25] = bf0[25] + bf0[29];
    6383           0 :     bf1[26] = bf0[26] + bf0[30];
    6384           0 :     bf1[27] = bf0[27] + bf0[31];
    6385           0 :     bf1[28] = bf0[24] - bf0[28];
    6386           0 :     bf1[29] = bf0[25] - bf0[29];
    6387           0 :     bf1[30] = bf0[26] - bf0[30];
    6388           0 :     bf1[31] = bf0[27] - bf0[31];
    6389           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6390             : 
    6391             :     // stage 6
    6392           0 :     stage++;
    6393           0 :     cospi = cospi_arr(cos_bit);
    6394           0 :     bf0 = output;
    6395           0 :     bf1 = step;
    6396           0 :     bf1[0] = bf0[0];
    6397           0 :     bf1[1] = bf0[1];
    6398           0 :     bf1[2] = bf0[2];
    6399           0 :     bf1[3] = bf0[3];
    6400           0 :     bf1[4] = bf0[4];
    6401           0 :     bf1[5] = bf0[5];
    6402           0 :     bf1[6] = bf0[6];
    6403           0 :     bf1[7] = bf0[7];
    6404           0 :     bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
    6405           0 :     bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
    6406           0 :     bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
    6407           0 :     bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
    6408           0 :     bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
    6409           0 :     bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
    6410           0 :     bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
    6411           0 :     bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
    6412           0 :     bf1[16] = bf0[16];
    6413           0 :     bf1[17] = bf0[17];
    6414           0 :     bf1[18] = bf0[18];
    6415           0 :     bf1[19] = bf0[19];
    6416           0 :     bf1[20] = bf0[20];
    6417           0 :     bf1[21] = bf0[21];
    6418           0 :     bf1[22] = bf0[22];
    6419           0 :     bf1[23] = bf0[23];
    6420           0 :     bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit);
    6421           0 :     bf1[25] = half_btf(cospi[56], bf0[24], -cospi[8], bf0[25], cos_bit);
    6422           0 :     bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit);
    6423           0 :     bf1[27] = half_btf(cospi[24], bf0[26], -cospi[40], bf0[27], cos_bit);
    6424           0 :     bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit);
    6425           0 :     bf1[29] = half_btf(cospi[8], bf0[28], cospi[56], bf0[29], cos_bit);
    6426           0 :     bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit);
    6427           0 :     bf1[31] = half_btf(cospi[40], bf0[30], cospi[24], bf0[31], cos_bit);
    6428           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6429             : 
    6430             :     // stage 7
    6431           0 :     stage++;
    6432           0 :     bf0 = step;
    6433           0 :     bf1 = output;
    6434           0 :     bf1[0] = bf0[0] + bf0[8];
    6435           0 :     bf1[1] = bf0[1] + bf0[9];
    6436           0 :     bf1[2] = bf0[2] + bf0[10];
    6437           0 :     bf1[3] = bf0[3] + bf0[11];
    6438           0 :     bf1[4] = bf0[4] + bf0[12];
    6439           0 :     bf1[5] = bf0[5] + bf0[13];
    6440           0 :     bf1[6] = bf0[6] + bf0[14];
    6441           0 :     bf1[7] = bf0[7] + bf0[15];
    6442           0 :     bf1[8] = bf0[0] - bf0[8];
    6443           0 :     bf1[9] = bf0[1] - bf0[9];
    6444           0 :     bf1[10] = bf0[2] - bf0[10];
    6445           0 :     bf1[11] = bf0[3] - bf0[11];
    6446           0 :     bf1[12] = bf0[4] - bf0[12];
    6447           0 :     bf1[13] = bf0[5] - bf0[13];
    6448           0 :     bf1[14] = bf0[6] - bf0[14];
    6449           0 :     bf1[15] = bf0[7] - bf0[15];
    6450           0 :     bf1[16] = bf0[16] + bf0[24];
    6451           0 :     bf1[17] = bf0[17] + bf0[25];
    6452           0 :     bf1[18] = bf0[18] + bf0[26];
    6453           0 :     bf1[19] = bf0[19] + bf0[27];
    6454           0 :     bf1[20] = bf0[20] + bf0[28];
    6455           0 :     bf1[21] = bf0[21] + bf0[29];
    6456           0 :     bf1[22] = bf0[22] + bf0[30];
    6457           0 :     bf1[23] = bf0[23] + bf0[31];
    6458           0 :     bf1[24] = bf0[16] - bf0[24];
    6459           0 :     bf1[25] = bf0[17] - bf0[25];
    6460           0 :     bf1[26] = bf0[18] - bf0[26];
    6461           0 :     bf1[27] = bf0[19] - bf0[27];
    6462           0 :     bf1[28] = bf0[20] - bf0[28];
    6463           0 :     bf1[29] = bf0[21] - bf0[29];
    6464           0 :     bf1[30] = bf0[22] - bf0[30];
    6465           0 :     bf1[31] = bf0[23] - bf0[31];
    6466           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6467             : 
    6468             :     // stage 8
    6469           0 :     stage++;
    6470           0 :     cospi = cospi_arr(cos_bit);
    6471           0 :     bf0 = output;
    6472           0 :     bf1 = step;
    6473           0 :     bf1[0] = bf0[0];
    6474           0 :     bf1[1] = bf0[1];
    6475           0 :     bf1[2] = bf0[2];
    6476           0 :     bf1[3] = bf0[3];
    6477           0 :     bf1[4] = bf0[4];
    6478           0 :     bf1[5] = bf0[5];
    6479           0 :     bf1[6] = bf0[6];
    6480           0 :     bf1[7] = bf0[7];
    6481           0 :     bf1[8] = bf0[8];
    6482           0 :     bf1[9] = bf0[9];
    6483           0 :     bf1[10] = bf0[10];
    6484           0 :     bf1[11] = bf0[11];
    6485           0 :     bf1[12] = bf0[12];
    6486           0 :     bf1[13] = bf0[13];
    6487           0 :     bf1[14] = bf0[14];
    6488           0 :     bf1[15] = bf0[15];
    6489           0 :     bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit);
    6490           0 :     bf1[17] = half_btf(cospi[60], bf0[16], -cospi[4], bf0[17], cos_bit);
    6491           0 :     bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit);
    6492           0 :     bf1[19] = half_btf(cospi[44], bf0[18], -cospi[20], bf0[19], cos_bit);
    6493           0 :     bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit);
    6494           0 :     bf1[21] = half_btf(cospi[28], bf0[20], -cospi[36], bf0[21], cos_bit);
    6495           0 :     bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit);
    6496           0 :     bf1[23] = half_btf(cospi[12], bf0[22], -cospi[52], bf0[23], cos_bit);
    6497           0 :     bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit);
    6498           0 :     bf1[25] = half_btf(cospi[4], bf0[24], cospi[60], bf0[25], cos_bit);
    6499           0 :     bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit);
    6500           0 :     bf1[27] = half_btf(cospi[20], bf0[26], cospi[44], bf0[27], cos_bit);
    6501           0 :     bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit);
    6502           0 :     bf1[29] = half_btf(cospi[36], bf0[28], cospi[28], bf0[29], cos_bit);
    6503           0 :     bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit);
    6504           0 :     bf1[31] = half_btf(cospi[52], bf0[30], cospi[12], bf0[31], cos_bit);
    6505           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6506             : 
    6507             :     // stage 9
    6508           0 :     stage++;
    6509           0 :     bf0 = step;
    6510           0 :     bf1 = output;
    6511           0 :     bf1[0] = bf0[0] + bf0[16];
    6512           0 :     bf1[1] = bf0[1] + bf0[17];
    6513           0 :     bf1[2] = bf0[2] + bf0[18];
    6514           0 :     bf1[3] = bf0[3] + bf0[19];
    6515           0 :     bf1[4] = bf0[4] + bf0[20];
    6516           0 :     bf1[5] = bf0[5] + bf0[21];
    6517           0 :     bf1[6] = bf0[6] + bf0[22];
    6518           0 :     bf1[7] = bf0[7] + bf0[23];
    6519           0 :     bf1[8] = bf0[8] + bf0[24];
    6520           0 :     bf1[9] = bf0[9] + bf0[25];
    6521           0 :     bf1[10] = bf0[10] + bf0[26];
    6522           0 :     bf1[11] = bf0[11] + bf0[27];
    6523           0 :     bf1[12] = bf0[12] + bf0[28];
    6524           0 :     bf1[13] = bf0[13] + bf0[29];
    6525           0 :     bf1[14] = bf0[14] + bf0[30];
    6526           0 :     bf1[15] = bf0[15] + bf0[31];
    6527           0 :     bf1[16] = bf0[0] - bf0[16];
    6528           0 :     bf1[17] = bf0[1] - bf0[17];
    6529           0 :     bf1[18] = bf0[2] - bf0[18];
    6530           0 :     bf1[19] = bf0[3] - bf0[19];
    6531           0 :     bf1[20] = bf0[4] - bf0[20];
    6532           0 :     bf1[21] = bf0[5] - bf0[21];
    6533           0 :     bf1[22] = bf0[6] - bf0[22];
    6534           0 :     bf1[23] = bf0[7] - bf0[23];
    6535           0 :     bf1[24] = bf0[8] - bf0[24];
    6536           0 :     bf1[25] = bf0[9] - bf0[25];
    6537           0 :     bf1[26] = bf0[10] - bf0[26];
    6538           0 :     bf1[27] = bf0[11] - bf0[27];
    6539           0 :     bf1[28] = bf0[12] - bf0[28];
    6540           0 :     bf1[29] = bf0[13] - bf0[29];
    6541           0 :     bf1[30] = bf0[14] - bf0[30];
    6542           0 :     bf1[31] = bf0[15] - bf0[31];
    6543           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6544             : 
    6545             :     // stage 10
    6546           0 :     stage++;
    6547           0 :     cospi = cospi_arr(cos_bit);
    6548           0 :     bf0 = output;
    6549           0 :     bf1 = step;
    6550           0 :     bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit);
    6551           0 :     bf1[1] = half_btf(cospi[63], bf0[0], -cospi[1], bf0[1], cos_bit);
    6552           0 :     bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit);
    6553           0 :     bf1[3] = half_btf(cospi[59], bf0[2], -cospi[5], bf0[3], cos_bit);
    6554           0 :     bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit);
    6555           0 :     bf1[5] = half_btf(cospi[55], bf0[4], -cospi[9], bf0[5], cos_bit);
    6556           0 :     bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit);
    6557           0 :     bf1[7] = half_btf(cospi[51], bf0[6], -cospi[13], bf0[7], cos_bit);
    6558           0 :     bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit);
    6559           0 :     bf1[9] = half_btf(cospi[47], bf0[8], -cospi[17], bf0[9], cos_bit);
    6560           0 :     bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit);
    6561           0 :     bf1[11] = half_btf(cospi[43], bf0[10], -cospi[21], bf0[11], cos_bit);
    6562           0 :     bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit);
    6563           0 :     bf1[13] = half_btf(cospi[39], bf0[12], -cospi[25], bf0[13], cos_bit);
    6564           0 :     bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit);
    6565           0 :     bf1[15] = half_btf(cospi[35], bf0[14], -cospi[29], bf0[15], cos_bit);
    6566           0 :     bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit);
    6567           0 :     bf1[17] = half_btf(cospi[31], bf0[16], -cospi[33], bf0[17], cos_bit);
    6568           0 :     bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit);
    6569           0 :     bf1[19] = half_btf(cospi[27], bf0[18], -cospi[37], bf0[19], cos_bit);
    6570           0 :     bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit);
    6571           0 :     bf1[21] = half_btf(cospi[23], bf0[20], -cospi[41], bf0[21], cos_bit);
    6572           0 :     bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit);
    6573           0 :     bf1[23] = half_btf(cospi[19], bf0[22], -cospi[45], bf0[23], cos_bit);
    6574           0 :     bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit);
    6575           0 :     bf1[25] = half_btf(cospi[15], bf0[24], -cospi[49], bf0[25], cos_bit);
    6576           0 :     bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit);
    6577           0 :     bf1[27] = half_btf(cospi[11], bf0[26], -cospi[53], bf0[27], cos_bit);
    6578           0 :     bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit);
    6579           0 :     bf1[29] = half_btf(cospi[7], bf0[28], -cospi[57], bf0[29], cos_bit);
    6580           0 :     bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit);
    6581           0 :     bf1[31] = half_btf(cospi[3], bf0[30], -cospi[61], bf0[31], cos_bit);
    6582           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6583             : 
    6584             :     // stage 11
    6585           0 :     stage++;
    6586           0 :     bf0 = step;
    6587           0 :     bf1 = output;
    6588           0 :     bf1[0] = bf0[1];
    6589           0 :     bf1[1] = bf0[30];
    6590           0 :     bf1[2] = bf0[3];
    6591           0 :     bf1[3] = bf0[28];
    6592           0 :     bf1[4] = bf0[5];
    6593           0 :     bf1[5] = bf0[26];
    6594           0 :     bf1[6] = bf0[7];
    6595           0 :     bf1[7] = bf0[24];
    6596           0 :     bf1[8] = bf0[9];
    6597           0 :     bf1[9] = bf0[22];
    6598           0 :     bf1[10] = bf0[11];
    6599           0 :     bf1[11] = bf0[20];
    6600           0 :     bf1[12] = bf0[13];
    6601           0 :     bf1[13] = bf0[18];
    6602           0 :     bf1[14] = bf0[15];
    6603           0 :     bf1[15] = bf0[16];
    6604           0 :     bf1[16] = bf0[17];
    6605           0 :     bf1[17] = bf0[14];
    6606           0 :     bf1[18] = bf0[19];
    6607           0 :     bf1[19] = bf0[12];
    6608           0 :     bf1[20] = bf0[21];
    6609           0 :     bf1[21] = bf0[10];
    6610           0 :     bf1[22] = bf0[23];
    6611           0 :     bf1[23] = bf0[8];
    6612           0 :     bf1[24] = bf0[25];
    6613           0 :     bf1[25] = bf0[6];
    6614           0 :     bf1[26] = bf0[27];
    6615           0 :     bf1[27] = bf0[4];
    6616           0 :     bf1[28] = bf0[29];
    6617           0 :     bf1[29] = bf0[2];
    6618           0 :     bf1[30] = bf0[31];
    6619           0 :     bf1[31] = bf0[0];
    6620           0 :     clamp_buf(bf1, size, stage_range[stage]);
    6621           0 : }
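
Editor's note: all of these 1-D kernels share the signature (const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range). On a 2-D block they are applied separably, one pass over rows and one over columns. The sketch below shows only that row/column driving pattern; it is not the project's actual 2-D inverse-transform driver, and the per-pass shifts, rectangular scaling, and distinct row/column stage ranges used in practice are deliberately omitted.

    #include <stdint.h>
    #include <string.h>

    typedef void (*txfm1d_fn)(const int32_t *input, int32_t *output, int8_t cos_bit,
                              const int8_t *stage_range);

    /* Conceptual separable 2-D pass: run 'row_fn' over every row of an n x n block,
     * then 'col_fn' over every column (n <= 64). Real drivers also apply per-pass
     * shifts and clamping between the two passes. */
    void inv_txfm2d_sketch(int32_t *blk, int n, txfm1d_fn row_fn, txfm1d_fn col_fn,
                           int8_t cos_bit, const int8_t *stage_range) {
        int32_t in[64], out[64];
        for (int r = 0; r < n; ++r) {
            row_fn(&blk[r * n], out, cos_bit, stage_range);
            memcpy(&blk[r * n], out, (size_t)n * sizeof(int32_t));
        }
        for (int c = 0; c < n; ++c) {
            for (int r = 0; r < n; ++r) in[r] = blk[r * n + c];   /* gather a column */
            col_fn(in, out, cos_bit, stage_range);
            for (int r = 0; r < n; ++r) blk[r * n + c] = out[r];  /* scatter it back */
        }
    }
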
    6622           0 : void eb_av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
    6623             :     const int8_t *stage_range) {
    6624           0 :     assert(output != input);
    6625           0 :     const int32_t *cospi = cospi_arr(cos_bit);
    6626             : 
    6627           0 :     int32_t stage = 0;
    6628             :     int32_t *bf0, *bf1;
    6629             :     int32_t step[64];
    6630             : 
    6631             :     // stage 0;
    6632             : 
    6633             :     // stage 1;
    6634           0 :     stage++;
    6635           0 :     bf1 = output;
    6636           0 :     bf1[0] = input[0];
    6637           0 :     bf1[1] = input[32];
    6638           0 :     bf1[2] = input[16];
    6639           0 :     bf1[3] = input[48];
    6640           0 :     bf1[4] = input[8];
    6641           0 :     bf1[5] = input[40];
    6642           0 :     bf1[6] = input[24];
    6643           0 :     bf1[7] = input[56];
    6644           0 :     bf1[8] = input[4];
    6645           0 :     bf1[9] = input[36];
    6646           0 :     bf1[10] = input[20];
    6647           0 :     bf1[11] = input[52];
    6648           0 :     bf1[12] = input[12];
    6649           0 :     bf1[13] = input[44];
    6650           0 :     bf1[14] = input[28];
    6651           0 :     bf1[15] = input[60];
    6652           0 :     bf1[16] = input[2];
    6653           0 :     bf1[17] = input[34];
    6654           0 :     bf1[18] = input[18];
    6655           0 :     bf1[19] = input[50];
    6656           0 :     bf1[20] = input[10];
    6657           0 :     bf1[21] = input[42];
    6658           0 :     bf1[22] = input[26];
    6659           0 :     bf1[23] = input[58];
    6660           0 :     bf1[24] = input[6];
    6661           0 :     bf1[25] = input[38];
    6662           0 :     bf1[26] = input[22];
    6663           0 :     bf1[27] = input[54];
    6664           0 :     bf1[28] = input[14];
    6665           0 :     bf1[29] = input[46];
    6666           0 :     bf1[30] = input[30];
    6667           0 :     bf1[31] = input[62];
    6668           0 :     bf1[32] = input[1];
    6669           0 :     bf1[33] = input[33];
    6670           0 :     bf1[34] = input[17];
    6671           0 :     bf1[35] = input[49];
    6672           0 :     bf1[36] = input[9];
    6673           0 :     bf1[37] = input[41];
    6674           0 :     bf1[38] = input[25];
    6675           0 :     bf1[39] = input[57];
    6676           0 :     bf1[40] = input[5];
    6677           0 :     bf1[41] = input[37];
    6678           0 :     bf1[42] = input[21];
    6679           0 :     bf1[43] = input[53];
    6680           0 :     bf1[44] = input[13];
    6681           0 :     bf1[45] = input[45];
    6682           0 :     bf1[46] = input[29];
    6683           0 :     bf1[47] = input[61];
    6684           0 :     bf1[48] = input[3];
    6685           0 :     bf1[49] = input[35];
    6686           0 :     bf1[50] = input[19];
    6687           0 :     bf1[51] = input[51];
    6688           0 :     bf1[52] = input[11];
    6689           0 :     bf1[53] = input[43];
    6690           0 :     bf1[54] = input[27];
    6691           0 :     bf1[55] = input[59];
    6692           0 :     bf1[56] = input[7];
    6693           0 :     bf1[57] = input[39];
    6694           0 :     bf1[58] = input[23];
    6695           0 :     bf1[59] = input[55];
    6696           0 :     bf1[60] = input[15];
    6697           0 :     bf1[61] = input[47];
    6698           0 :     bf1[62] = input[31];
    6699           0 :     bf1[63] = input[63];
    6700             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6701             : 
    6702             :     // stage 2
    6703           0 :     stage++;
    6704           0 :     bf0 = output;
    6705           0 :     bf1 = step;
    6706           0 :     bf1[0] = bf0[0];
    6707           0 :     bf1[1] = bf0[1];
    6708           0 :     bf1[2] = bf0[2];
    6709           0 :     bf1[3] = bf0[3];
    6710           0 :     bf1[4] = bf0[4];
    6711           0 :     bf1[5] = bf0[5];
    6712           0 :     bf1[6] = bf0[6];
    6713           0 :     bf1[7] = bf0[7];
    6714           0 :     bf1[8] = bf0[8];
    6715           0 :     bf1[9] = bf0[9];
    6716           0 :     bf1[10] = bf0[10];
    6717           0 :     bf1[11] = bf0[11];
    6718           0 :     bf1[12] = bf0[12];
    6719           0 :     bf1[13] = bf0[13];
    6720           0 :     bf1[14] = bf0[14];
    6721           0 :     bf1[15] = bf0[15];
    6722           0 :     bf1[16] = bf0[16];
    6723           0 :     bf1[17] = bf0[17];
    6724           0 :     bf1[18] = bf0[18];
    6725           0 :     bf1[19] = bf0[19];
    6726           0 :     bf1[20] = bf0[20];
    6727           0 :     bf1[21] = bf0[21];
    6728           0 :     bf1[22] = bf0[22];
    6729           0 :     bf1[23] = bf0[23];
    6730           0 :     bf1[24] = bf0[24];
    6731           0 :     bf1[25] = bf0[25];
    6732           0 :     bf1[26] = bf0[26];
    6733           0 :     bf1[27] = bf0[27];
    6734           0 :     bf1[28] = bf0[28];
    6735           0 :     bf1[29] = bf0[29];
    6736           0 :     bf1[30] = bf0[30];
    6737           0 :     bf1[31] = bf0[31];
    6738           0 :     bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit);
    6739           0 :     bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit);
    6740           0 :     bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit);
    6741           0 :     bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit);
    6742           0 :     bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit);
    6743           0 :     bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit);
    6744           0 :     bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit);
    6745           0 :     bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit);
    6746           0 :     bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit);
    6747           0 :     bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit);
    6748           0 :     bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit);
    6749           0 :     bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit);
    6750           0 :     bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit);
    6751           0 :     bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit);
    6752           0 :     bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit);
    6753           0 :     bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit);
    6754           0 :     bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit);
    6755           0 :     bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit);
    6756           0 :     bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit);
    6757           0 :     bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit);
    6758           0 :     bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit);
    6759           0 :     bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit);
    6760           0 :     bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit);
    6761           0 :     bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit);
    6762           0 :     bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit);
    6763           0 :     bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit);
    6764           0 :     bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit);
    6765           0 :     bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit);
    6766           0 :     bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit);
    6767           0 :     bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
    6768           0 :     bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
    6769           0 :     bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
    6770             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6771             : 
    6772             :     // stage 3
    6773           0 :     stage++;
    6774           0 :     bf0 = step;
    6775           0 :     bf1 = output;
    6776           0 :     bf1[0] = bf0[0];
    6777           0 :     bf1[1] = bf0[1];
    6778           0 :     bf1[2] = bf0[2];
    6779           0 :     bf1[3] = bf0[3];
    6780           0 :     bf1[4] = bf0[4];
    6781           0 :     bf1[5] = bf0[5];
    6782           0 :     bf1[6] = bf0[6];
    6783           0 :     bf1[7] = bf0[7];
    6784           0 :     bf1[8] = bf0[8];
    6785           0 :     bf1[9] = bf0[9];
    6786           0 :     bf1[10] = bf0[10];
    6787           0 :     bf1[11] = bf0[11];
    6788           0 :     bf1[12] = bf0[12];
    6789           0 :     bf1[13] = bf0[13];
    6790           0 :     bf1[14] = bf0[14];
    6791           0 :     bf1[15] = bf0[15];
    6792           0 :     bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
    6793           0 :     bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
    6794           0 :     bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
    6795           0 :     bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
    6796           0 :     bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
    6797           0 :     bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
    6798           0 :     bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
    6799           0 :     bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
    6800           0 :     bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
    6801           0 :     bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
    6802           0 :     bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
    6803           0 :     bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
    6804           0 :     bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
    6805           0 :     bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
    6806           0 :     bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
    6807           0 :     bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
    6808           0 :     bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]);
    6809           0 :     bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]);
    6810           0 :     bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]);
    6811           0 :     bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]);
    6812           0 :     bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]);
    6813           0 :     bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]);
    6814           0 :     bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]);
    6815           0 :     bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]);
    6816           0 :     bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]);
    6817           0 :     bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]);
    6818           0 :     bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]);
    6819           0 :     bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]);
    6820           0 :     bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]);
    6821           0 :     bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]);
    6822           0 :     bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]);
    6823           0 :     bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]);
    6824           0 :     bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]);
    6825           0 :     bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]);
    6826           0 :     bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]);
    6827           0 :     bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]);
    6828           0 :     bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]);
    6829           0 :     bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]);
    6830           0 :     bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]);
    6831           0 :     bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]);
    6832           0 :     bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]);
    6833           0 :     bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]);
    6834           0 :     bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]);
    6835           0 :     bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]);
    6836           0 :     bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]);
    6837           0 :     bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
    6838           0 :     bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
    6839           0 :     bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
    6840             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6841             : 
    6842             :     // stage 4
    6843           0 :     stage++;
    6844           0 :     bf0 = output;
    6845           0 :     bf1 = step;
    6846           0 :     bf1[0] = bf0[0];
    6847           0 :     bf1[1] = bf0[1];
    6848           0 :     bf1[2] = bf0[2];
    6849           0 :     bf1[3] = bf0[3];
    6850           0 :     bf1[4] = bf0[4];
    6851           0 :     bf1[5] = bf0[5];
    6852           0 :     bf1[6] = bf0[6];
    6853           0 :     bf1[7] = bf0[7];
    6854           0 :     bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
    6855           0 :     bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
    6856           0 :     bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
    6857           0 :     bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
    6858           0 :     bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
    6859           0 :     bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
    6860           0 :     bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
    6861           0 :     bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
    6862           0 :     bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
    6863           0 :     bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
    6864           0 :     bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
    6865           0 :     bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
    6866           0 :     bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
    6867           0 :     bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
    6868           0 :     bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
    6869           0 :     bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
    6870           0 :     bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
    6871           0 :     bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
    6872           0 :     bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
    6873           0 :     bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
    6874           0 :     bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
    6875           0 :     bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
    6876           0 :     bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
    6877           0 :     bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
    6878           0 :     bf1[32] = bf0[32];
    6879           0 :     bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
    6880           0 :     bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
    6881           0 :     bf1[35] = bf0[35];
    6882           0 :     bf1[36] = bf0[36];
    6883           0 :     bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
    6884           0 :     bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
    6885           0 :     bf1[39] = bf0[39];
    6886           0 :     bf1[40] = bf0[40];
    6887           0 :     bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
    6888           0 :     bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
    6889           0 :     bf1[43] = bf0[43];
    6890           0 :     bf1[44] = bf0[44];
    6891           0 :     bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
    6892           0 :     bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
    6893           0 :     bf1[47] = bf0[47];
    6894           0 :     bf1[48] = bf0[48];
    6895           0 :     bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit);
    6896           0 :     bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit);
    6897           0 :     bf1[51] = bf0[51];
    6898           0 :     bf1[52] = bf0[52];
    6899           0 :     bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit);
    6900           0 :     bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit);
    6901           0 :     bf1[55] = bf0[55];
    6902           0 :     bf1[56] = bf0[56];
    6903           0 :     bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit);
    6904           0 :     bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit);
    6905           0 :     bf1[59] = bf0[59];
    6906           0 :     bf1[60] = bf0[60];
    6907           0 :     bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
    6908           0 :     bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
    6909           0 :     bf1[63] = bf0[63];
    6910             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6911             : 
    6912             :     // stage 5
    6913           0 :     stage++;
    6914           0 :     bf0 = step;
    6915           0 :     bf1 = output;
    6916           0 :     bf1[0] = bf0[0];
    6917           0 :     bf1[1] = bf0[1];
    6918           0 :     bf1[2] = bf0[2];
    6919           0 :     bf1[3] = bf0[3];
    6920           0 :     bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
    6921           0 :     bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
    6922           0 :     bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
    6923           0 :     bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
    6924           0 :     bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
    6925           0 :     bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
    6926           0 :     bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
    6927           0 :     bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
    6928           0 :     bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
    6929           0 :     bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
    6930           0 :     bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
    6931           0 :     bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
    6932           0 :     bf1[16] = bf0[16];
    6933           0 :     bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    6934           0 :     bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    6935           0 :     bf1[19] = bf0[19];
    6936           0 :     bf1[20] = bf0[20];
    6937           0 :     bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    6938           0 :     bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    6939           0 :     bf1[23] = bf0[23];
    6940           0 :     bf1[24] = bf0[24];
    6941           0 :     bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
    6942           0 :     bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
    6943           0 :     bf1[27] = bf0[27];
    6944           0 :     bf1[28] = bf0[28];
    6945           0 :     bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
    6946           0 :     bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
    6947           0 :     bf1[31] = bf0[31];
    6948           0 :     bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]);
    6949           0 :     bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]);
    6950           0 :     bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]);
    6951           0 :     bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]);
    6952           0 :     bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]);
    6953           0 :     bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]);
    6954           0 :     bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]);
    6955           0 :     bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]);
    6956           0 :     bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]);
    6957           0 :     bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]);
    6958           0 :     bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]);
    6959           0 :     bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]);
    6960           0 :     bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]);
    6961           0 :     bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]);
    6962           0 :     bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]);
    6963           0 :     bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]);
    6964           0 :     bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]);
    6965           0 :     bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]);
    6966           0 :     bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]);
    6967           0 :     bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]);
    6968           0 :     bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]);
    6969           0 :     bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]);
    6970           0 :     bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]);
    6971           0 :     bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]);
    6972           0 :     bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]);
    6973           0 :     bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]);
    6974           0 :     bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]);
    6975           0 :     bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]);
    6976           0 :     bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]);
    6977           0 :     bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
    6978           0 :     bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
    6979           0 :     bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
    6980             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    6981             : 
    6982             :     // stage 6
    6983           0 :     stage++;
    6984           0 :     bf0 = output;
    6985           0 :     bf1 = step;
    6986           0 :     bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    6987           0 :     bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
    6988           0 :     bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
    6989           0 :     bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
    6990           0 :     bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
    6991           0 :     bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
    6992           0 :     bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
    6993           0 :     bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
    6994           0 :     bf1[8] = bf0[8];
    6995           0 :     bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    6996           0 :     bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    6997           0 :     bf1[11] = bf0[11];
    6998           0 :     bf1[12] = bf0[12];
    6999           0 :     bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
    7000           0 :     bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
    7001           0 :     bf1[15] = bf0[15];
    7002           0 :     bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
    7003           0 :     bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
    7004           0 :     bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
    7005           0 :     bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
    7006           0 :     bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
    7007           0 :     bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
    7008           0 :     bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
    7009           0 :     bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
    7010           0 :     bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
    7011           0 :     bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
    7012           0 :     bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
    7013           0 :     bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
    7014           0 :     bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
    7015           0 :     bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
    7016           0 :     bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
    7017           0 :     bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
    7018           0 :     bf1[32] = bf0[32];
    7019           0 :     bf1[33] = bf0[33];
    7020           0 :     bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
    7021           0 :     bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
    7022           0 :     bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
    7023           0 :     bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
    7024           0 :     bf1[38] = bf0[38];
    7025           0 :     bf1[39] = bf0[39];
    7026           0 :     bf1[40] = bf0[40];
    7027           0 :     bf1[41] = bf0[41];
    7028           0 :     bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
    7029           0 :     bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
    7030           0 :     bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
    7031           0 :     bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
    7032           0 :     bf1[46] = bf0[46];
    7033           0 :     bf1[47] = bf0[47];
    7034           0 :     bf1[48] = bf0[48];
    7035           0 :     bf1[49] = bf0[49];
    7036           0 :     bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit);
    7037           0 :     bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit);
    7038           0 :     bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit);
    7039           0 :     bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit);
    7040           0 :     bf1[54] = bf0[54];
    7041           0 :     bf1[55] = bf0[55];
    7042           0 :     bf1[56] = bf0[56];
    7043           0 :     bf1[57] = bf0[57];
    7044           0 :     bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit);
    7045           0 :     bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit);
    7046           0 :     bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit);
    7047           0 :     bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
    7048           0 :     bf1[62] = bf0[62];
    7049           0 :     bf1[63] = bf0[63];
    7050             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    7051             : 
    7052             :     // stage 7
    7053           0 :     stage++;
    7054           0 :     bf0 = step;
    7055           0 :     bf1 = output;
    7056           0 :     bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
    7057           0 :     bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
    7058           0 :     bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
    7059           0 :     bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
    7060           0 :     bf1[4] = bf0[4];
    7061           0 :     bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    7062           0 :     bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    7063           0 :     bf1[7] = bf0[7];
    7064           0 :     bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
    7065           0 :     bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
    7066           0 :     bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
    7067           0 :     bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
    7068           0 :     bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
    7069           0 :     bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
    7070           0 :     bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
    7071           0 :     bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
    7072           0 :     bf1[16] = bf0[16];
    7073           0 :     bf1[17] = bf0[17];
    7074           0 :     bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    7075           0 :     bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    7076           0 :     bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    7077           0 :     bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    7078           0 :     bf1[22] = bf0[22];
    7079           0 :     bf1[23] = bf0[23];
    7080           0 :     bf1[24] = bf0[24];
    7081           0 :     bf1[25] = bf0[25];
    7082           0 :     bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
    7083           0 :     bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
    7084           0 :     bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
    7085           0 :     bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
    7086           0 :     bf1[30] = bf0[30];
    7087           0 :     bf1[31] = bf0[31];
    7088           0 :     bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]);
    7089           0 :     bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]);
    7090           0 :     bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]);
    7091           0 :     bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]);
    7092           0 :     bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]);
    7093           0 :     bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]);
    7094           0 :     bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]);
    7095           0 :     bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]);
    7096           0 :     bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]);
    7097           0 :     bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]);
    7098           0 :     bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]);
    7099           0 :     bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]);
    7100           0 :     bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]);
    7101           0 :     bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]);
    7102           0 :     bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]);
    7103           0 :     bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]);
    7104           0 :     bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]);
    7105           0 :     bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]);
    7106           0 :     bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]);
    7107           0 :     bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]);
    7108           0 :     bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]);
    7109           0 :     bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]);
    7110           0 :     bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]);
    7111           0 :     bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]);
    7112           0 :     bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]);
    7113           0 :     bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]);
    7114           0 :     bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]);
    7115           0 :     bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]);
    7116           0 :     bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]);
    7117           0 :     bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
    7118           0 :     bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
    7119           0 :     bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
    7120             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    7121             : 
    7122             :     // stage 8
    7123           0 :     stage++;
    7124           0 :     bf0 = output;
    7125           0 :     bf1 = step;
    7126           0 :     bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
    7127           0 :     bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
    7128           0 :     bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
    7129           0 :     bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
    7130           0 :     bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
    7131           0 :     bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
    7132           0 :     bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
    7133           0 :     bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
    7134           0 :     bf1[8] = bf0[8];
    7135           0 :     bf1[9] = bf0[9];
    7136           0 :     bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    7137           0 :     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    7138           0 :     bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    7139           0 :     bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    7140           0 :     bf1[14] = bf0[14];
    7141           0 :     bf1[15] = bf0[15];
    7142           0 :     bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
    7143           0 :     bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
    7144           0 :     bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
    7145           0 :     bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
    7146           0 :     bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
    7147           0 :     bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
    7148           0 :     bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
    7149           0 :     bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
    7150           0 :     bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
    7151           0 :     bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
    7152           0 :     bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
    7153           0 :     bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
    7154           0 :     bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
    7155           0 :     bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
    7156           0 :     bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
    7157           0 :     bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
    7158           0 :     bf1[32] = bf0[32];
    7159           0 :     bf1[33] = bf0[33];
    7160           0 :     bf1[34] = bf0[34];
    7161           0 :     bf1[35] = bf0[35];
    7162           0 :     bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
    7163           0 :     bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
    7164           0 :     bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
    7165           0 :     bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
    7166           0 :     bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
    7167           0 :     bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
    7168           0 :     bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
    7169           0 :     bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
    7170           0 :     bf1[44] = bf0[44];
    7171           0 :     bf1[45] = bf0[45];
    7172           0 :     bf1[46] = bf0[46];
    7173           0 :     bf1[47] = bf0[47];
    7174           0 :     bf1[48] = bf0[48];
    7175           0 :     bf1[49] = bf0[49];
    7176           0 :     bf1[50] = bf0[50];
    7177           0 :     bf1[51] = bf0[51];
    7178           0 :     bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit);
    7179           0 :     bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit);
    7180           0 :     bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit);
    7181           0 :     bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit);
    7182           0 :     bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit);
    7183           0 :     bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit);
    7184           0 :     bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit);
    7185           0 :     bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit);
    7186           0 :     bf1[60] = bf0[60];
    7187           0 :     bf1[61] = bf0[61];
    7188           0 :     bf1[62] = bf0[62];
    7189           0 :     bf1[63] = bf0[63];
    7190             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    7191             : 
    7192             :     // stage 9
    7193           0 :     stage++;
    7194           0 :     bf0 = step;
    7195           0 :     bf1 = output;
    7196           0 :     bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
    7197           0 :     bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
    7198           0 :     bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
    7199           0 :     bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
    7200           0 :     bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
    7201           0 :     bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
    7202           0 :     bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
    7203           0 :     bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
    7204           0 :     bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
    7205           0 :     bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
    7206           0 :     bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
    7207           0 :     bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
    7208           0 :     bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
    7209           0 :     bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
    7210           0 :     bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
    7211           0 :     bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
    7212           0 :     bf1[16] = bf0[16];
    7213           0 :     bf1[17] = bf0[17];
    7214           0 :     bf1[18] = bf0[18];
    7215           0 :     bf1[19] = bf0[19];
    7216           0 :     bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    7217           0 :     bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    7218           0 :     bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    7219           0 :     bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    7220           0 :     bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    7221           0 :     bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    7222           0 :     bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    7223           0 :     bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    7224           0 :     bf1[28] = bf0[28];
    7225           0 :     bf1[29] = bf0[29];
    7226           0 :     bf1[30] = bf0[30];
    7227           0 :     bf1[31] = bf0[31];
    7228           0 :     bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]);
    7229           0 :     bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]);
    7230           0 :     bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]);
    7231           0 :     bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]);
    7232           0 :     bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]);
    7233           0 :     bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]);
    7234           0 :     bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]);
    7235           0 :     bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]);
    7236           0 :     bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]);
    7237           0 :     bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]);
    7238           0 :     bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]);
    7239           0 :     bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]);
    7240           0 :     bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]);
    7241           0 :     bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]);
    7242           0 :     bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]);
    7243           0 :     bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]);
    7244           0 :     bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]);
    7245           0 :     bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]);
    7246           0 :     bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]);
    7247           0 :     bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]);
    7248           0 :     bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]);
    7249           0 :     bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]);
    7250           0 :     bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]);
    7251           0 :     bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]);
    7252           0 :     bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]);
    7253           0 :     bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]);
    7254           0 :     bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]);
    7255           0 :     bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]);
    7256           0 :     bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]);
    7257           0 :     bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
    7258           0 :     bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
    7259           0 :     bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
    7260             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    7261             : 
    7262             :     // stage 10
    7263           0 :     stage++;
    7264           0 :     bf0 = output;
    7265           0 :     bf1 = step;
    7266           0 :     bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
    7267           0 :     bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
    7268           0 :     bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
    7269           0 :     bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
    7270           0 :     bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
    7271           0 :     bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
    7272           0 :     bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
    7273           0 :     bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
    7274           0 :     bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
    7275           0 :     bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
    7276           0 :     bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
    7277           0 :     bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
    7278           0 :     bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
    7279           0 :     bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
    7280           0 :     bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
    7281           0 :     bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
    7282           0 :     bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
    7283           0 :     bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
    7284           0 :     bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
    7285           0 :     bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
    7286           0 :     bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
    7287           0 :     bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
    7288           0 :     bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
    7289           0 :     bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
    7290           0 :     bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
    7291           0 :     bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
    7292           0 :     bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
    7293           0 :     bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
    7294           0 :     bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
    7295           0 :     bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
    7296           0 :     bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
    7297           0 :     bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
    7298           0 :     bf1[32] = bf0[32];
    7299           0 :     bf1[33] = bf0[33];
    7300           0 :     bf1[34] = bf0[34];
    7301           0 :     bf1[35] = bf0[35];
    7302           0 :     bf1[36] = bf0[36];
    7303           0 :     bf1[37] = bf0[37];
    7304           0 :     bf1[38] = bf0[38];
    7305           0 :     bf1[39] = bf0[39];
    7306           0 :     bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
    7307           0 :     bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
    7308           0 :     bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
    7309           0 :     bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
    7310           0 :     bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
    7311           0 :     bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
    7312           0 :     bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
    7313           0 :     bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
    7314           0 :     bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
    7315           0 :     bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
    7316           0 :     bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
    7317           0 :     bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
    7318           0 :     bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
    7319           0 :     bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
    7320           0 :     bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
    7321           0 :     bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
    7322           0 :     bf1[56] = bf0[56];
    7323           0 :     bf1[57] = bf0[57];
    7324           0 :     bf1[58] = bf0[58];
    7325           0 :     bf1[59] = bf0[59];
    7326           0 :     bf1[60] = bf0[60];
    7327           0 :     bf1[61] = bf0[61];
    7328           0 :     bf1[62] = bf0[62];
    7329           0 :     bf1[63] = bf0[63];
    7330             :     //range_check_buf(stage, input, bf1, size, stage_range[stage]);
    7331             : 
    7332             :     // stage 11
    7333           0 :     stage++;
    7334           0 :     bf0 = step;
    7335           0 :     bf1 = output;
    7336           0 :     bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]);
    7337           0 :     bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]);
    7338           0 :     bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]);
    7339           0 :     bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]);
    7340           0 :     bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]);
    7341           0 :     bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]);
    7342           0 :     bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]);
    7343           0 :     bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]);
    7344           0 :     bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]);
    7345           0 :     bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]);
    7346           0 :     bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]);
    7347           0 :     bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]);
    7348           0 :     bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]);
    7349           0 :     bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]);
    7350           0 :     bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]);
    7351           0 :     bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]);
    7352           0 :     bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]);
    7353           0 :     bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]);
    7354           0 :     bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]);
    7355           0 :     bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]);
    7356           0 :     bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]);
    7357           0 :     bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]);
    7358           0 :     bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]);
    7359           0 :     bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]);
    7360           0 :     bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]);
    7361           0 :     bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]);
    7362           0 :     bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]);
    7363           0 :     bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]);
    7364           0 :     bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]);
    7365           0 :     bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]);
    7366           0 :     bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]);
    7367           0 :     bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]);
    7368           0 :     bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]);
    7369           0 :     bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]);
    7370           0 :     bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]);
    7371           0 :     bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]);
    7372           0 :     bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]);
    7373           0 :     bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]);
    7374           0 :     bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]);
    7375           0 :     bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]);
    7376           0 :     bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]);
    7377           0 :     bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]);
    7378           0 :     bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]);
    7379           0 :     bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]);
    7380           0 :     bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]);
    7381           0 :     bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]);
    7382           0 :     bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]);
    7383           0 :     bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]);
    7384           0 :     bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]);
    7385           0 :     bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]);
    7386           0 :     bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]);
    7387           0 :     bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]);
    7388           0 :     bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]);
    7389           0 :     bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]);
    7390           0 :     bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]);
    7391           0 :     bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]);
    7392           0 :     bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]);
    7393           0 :     bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]);
    7394           0 :     bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]);
    7395           0 :     bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]);
    7396           0 :     bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]);
    7397           0 :     bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]);
    7398           0 :     bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]);
    7399           0 :     bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]);
    7400           0 : }
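                     : // Note on the butterfly stages above: each stage ping-pongs between the
                     : // output and step buffers (bf0 = stage input, bf1 = stage output), and every
                     : // line is one of two primitives. Assuming the usual libaom-style helpers
                     : // (sketched here for readability, not the authoritative definitions):
                     : //   half_btf(w0, in0, w1, in1, bit) -> round_shift(w0 * in0 + w1 * in1, bit),
                     : //       a two-tap rotation by the cospi[] cosine weights;
                     : //   clamp_value(v, b) -> saturate v to the signed b-bit range, with
                     : //       b = stage_range[stage].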
    7401           0 : void eb_av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
    7402             :     const int8_t *stage_range) {
    7403             :     (void)cos_bit;
    7404             :     (void)stage_range;
    7405           0 :     for (int32_t i = 0; i < 4; ++i) {
    7406             :         // Normal input should fit into 32-bit. Cast to 64-bit here to avoid
    7407             :         // overflow with corrupted/fuzzed input. The same applies to eb_av1_iidentity16_c and av1_iidentity64_c.
    7408           0 :         output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits);
    7409             :     }
    7410           0 :     assert(stage_range[0] + NewSqrt2Bits <= 32);
    7411           0 : }
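                     : // Worked example of the scaling above (assuming the libaom constants
                     : // NewSqrt2 = 5793 and NewSqrt2Bits = 12, so NewSqrt2 / 2^12 ~= sqrt(2)):
                     : // for input[i] = 100,
                     : //   round_shift((int64_t)5793 * 100, 12) = (579300 + 2048) >> 12 = 141 ~= 100 * sqrt(2).
                     : // The 64-bit cast only matters for out-of-range (corrupted/fuzzed) input,
                     : // where NewSqrt2 * input[i] could otherwise overflow 32 bits.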
    7412           0 : void eb_av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
    7413             :     const int8_t *stage_range) {
    7414             :     (void)cos_bit;
    7415             :     (void)stage_range;
    7416           0 :     for (int32_t i = 0; i < 8; ++i) output[i] = (int32_t)((int64_t)input[i] * 2);
    7417           0 : }
    7418           0 : void eb_av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
    7419             :     const int8_t *stage_range) {
    7420             :     (void)cos_bit;
    7421             :     (void)stage_range;
    7422           0 :     for (int32_t i = 0; i < 16; ++i)
    7423           0 :         output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits);
    7424           0 :     assert(stage_range[0] + NewSqrt2Bits <= 32);
    7425           0 : }
    7426           0 : void eb_av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
    7427             :     const int8_t *stage_range) {
    7428             :     (void)cos_bit;
    7429             :     (void)stage_range;
    7430           0 :     for (int32_t i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4);
    7431           0 : }
    7432           0 : void av1_iidentity64_c(const int32_t *input, int32_t *output, int8_t cos_bit,
    7433             :     const int8_t *stage_range) {
    7434             :     (void)cos_bit;
    7435             :     (void)stage_range;
    7436           0 :     for (int32_t i = 0; i < 64; ++i)
    7437           0 :         output[i] = round_shift((int64_t)NewSqrt2 * 4 * input[i], NewSqrt2Bits);
    7438           0 :     assert(stage_range[0] + NewSqrt2Bits <= 32);
    7439           0 : }
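                     : // Taken together, the identity "transforms" above are pure scalings: with
                     : // NewSqrt2 / 2^NewSqrt2Bits ~= sqrt(2), an N-point identity multiplies each
                     : // coefficient by sqrt(N / 2), i.e. sqrt(2), 2, 2*sqrt(2), 4 and 4*sqrt(2)
                     : // for N = 4, 8, 16, 32 and 64 respectively.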
    7440           0 : static INLINE TxfmFunc inv_txfm_type_to_func(TxfmType TxfmType) {
    7441           0 :     switch (TxfmType) {
    7442           0 :     case TXFM_TYPE_DCT4: return eb_av1_idct4_new;
    7443           0 :     case TXFM_TYPE_DCT8: return eb_av1_idct8_new;
    7444           0 :     case TXFM_TYPE_DCT16: return eb_av1_idct16_new;
    7445           0 :     case TXFM_TYPE_DCT32: return eb_av1_idct32_new;
    7446           0 :     case TXFM_TYPE_DCT64: return eb_av1_idct64_new;
    7447           0 :     case TXFM_TYPE_ADST4: return eb_av1_iadst4_new;
    7448           0 :     case TXFM_TYPE_ADST8: return eb_av1_iadst8_new;
    7449           0 :     case TXFM_TYPE_ADST16: return eb_av1_iadst16_new;
    7450           0 :     case TXFM_TYPE_ADST32: return av1_iadst32_new;
    7451           0 :     case TXFM_TYPE_IDENTITY4: return eb_av1_iidentity4_c;
    7452           0 :     case TXFM_TYPE_IDENTITY8: return eb_av1_iidentity8_c;
    7453           0 :     case TXFM_TYPE_IDENTITY16: return eb_av1_iidentity16_c;
    7454           0 :     case TXFM_TYPE_IDENTITY32: return eb_av1_iidentity32_c;
    7455           0 :     case TXFM_TYPE_IDENTITY64: return av1_iidentity64_c;
    7456           0 :     default: assert(0); return NULL;
    7457             :     }
    7458             : }
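                     : // Usage sketch: the returned TxfmFunc is applied once per row and once per
                     : // column of the 2-D inverse transform, exactly as in the loops further below,
                     : // e.g.
                     : //   const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row);
                     : //   txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
                     : // Each call inverts one length-N 1-D transform from an int32_t input vector
                     : // into an int32_t output vector.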
    7459             : 
    7460             : //void eb_av1_round_shift_array_c(int32_t *arr, int32_t size, int32_t bit) {
    7461             : //    int32_t i;
    7462             : //    if (bit == 0) {
    7463             : //        return;
    7464             : //    }
    7465             : //    else {
    7466             : //        if (bit > 0) {
    7467             : //            for (i = 0; i < size; i++) {
    7468             : //                arr[i] = round_shift(arr[i], bit);
    7469             : //            }
    7470             : //        }
    7471             : //        else {
    7472             : //            for (i = 0; i < size; i++) {
    7473             : //                arr[i] = arr[i] * (1 << (-bit));
    7474             : //            }
    7475             : //        }
    7476             : //    }
    7477             : //}
    7478           0 : static INLINE TranHigh check_range(TranHigh input, int32_t bd) {
    7479             :     // AV1 TX case
    7480             :     // - 8 bit: signed 16 bit integer
    7481             :     // - 10 bit: signed 18 bit integer
    7482             :     // - 12 bit: signed 20 bit integer
    7483             :     // - max quantization error = 1828 << (bd - 8)
    7484           0 :     const int32_t int_max = (1 << (7 + bd)) - 1 + (914 << (bd - 7));
    7485           0 :     const int32_t int_min = -int_max - 1;
    7486             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    7487             :     assert(int_min <= input);
    7488             :     assert(input <= int_max);
    7489             : #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    7490           0 :     return (TranHigh)clamp64(input, int_min, int_max);
    7491             : }
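                     : // For example, at bd = 8 the admissible range works out to
                     : //   int_max = (1 << 15) - 1 + (914 << 1) = 32767 + 1828 = 34595,  int_min = -34596,
                     : // i.e. the signed 16-bit transform range widened by the maximum quantization
                     : // error of 1828 noted above.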
    7492             : #define HIGHBD_WRAPLOW(x, bd) ((int32_t)check_range((x), bd))
    7493           0 : static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, TranHigh trans,
    7494             :     int32_t bd) {
    7495           0 :     trans = HIGHBD_WRAPLOW(trans, bd);
    7496           0 :     return clip_pixel_highbd(dest + (int32_t)trans, bd);
    7497             : }
    7498           0 : static INLINE void Av1InverseTransformTwoDCore_c(
    7499             :     const int32_t *input,
    7500             :     int32_t inputStride,
    7501             :     TranLow *output,
    7502             :     int32_t ouputStride,
    7503             :     Txfm2DFlipCfg *cfg,
    7504             :     int32_t *txfm_buf,
    7505             :     TxSize tx_size,
    7506             :     int32_t bd)
    7507             : {
    7508             :     // Note when assigning txfm_size_col, we use the txfm_size from the
    7509             :     // row configuration and vice versa. This is intentionally done to
    7510             :     // accurately perform rectangular transforms. When the transform is
    7511             :     // rectangular, the number of columns will be the same as the
    7512             :     // txfm_size stored in the row cfg struct. It will make no difference
    7513             :     // for square transforms.
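                     :     // e.g. for a 16x8 transform (16 wide, 8 high): txfm_size_col = 16 and
                     :     // txfm_size_row = 8, so each of the 8 row passes below is a 16-point 1-D
                     :     // transform and each of the 16 column passes is an 8-point 1-D transform.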
    7514           0 :     const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
    7515           0 :     const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
    7516             :     // Take the shift from the larger dimension in the rectangular case.
    7517           0 :     const int8_t *shift = cfg->shift;
    7518           0 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    7519             :     int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
    7520             :     int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
    7521           0 :     assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
    7522           0 :     assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
    7523           0 :     eb_av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd);
    7524             : 
    7525           0 :     const int8_t cos_bit_col = cfg->cos_bit_col;
    7526           0 :     const int8_t cos_bit_row = cfg->cos_bit_row;
    7527           0 :     const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col);
    7528           0 :     const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row);
    7529             :     ASSERT(txfm_func_col);
    7530             :     ASSERT(txfm_func_row);
    7531             :     // txfm_buf's length is txfm_size_row * txfm_size_col + 2 *
    7532             :     // AOMMAX(txfm_size_row, txfm_size_col)
    7533             :     // it is used for intermediate data buffering
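                     :     // e.g. for TX_4X4 this is 4 * 4 + 2 * 4 = 24 entries, matching the
                     :     // intermediateInverseTransformBuffer[4 * 4 + 4 + 4] declared by the 4x4
                     :     // wrapper below; it is split into temp_in, temp_out and buf.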
    7534           0 :     const int32_t buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
    7535           0 :     int32_t *temp_in = txfm_buf;
    7536           0 :     int32_t *temp_out = temp_in + buf_offset;
    7537           0 :     int32_t *buf = temp_out + buf_offset;
    7538           0 :     int32_t *buf_ptr = buf;
    7539             :     int32_t c, r;
    7540             : 
    7541             :     // Rows
    7542           0 :     for (r = 0; r < txfm_size_row; ++r) {
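                     :         // |rect_type| == 1 means the two block dimensions differ by a factor of
                     :         // two (e.g. 8x16); round_shift(x * NewInvSqrt2, NewSqrt2Bits) pre-scales
                     :         // the row inputs by roughly 1/sqrt(2) so rectangular sizes keep the same
                     :         // overall normalization as the square sizes.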
    7543           0 :         if (abs(rect_type) == 1) {
    7544           0 :             for (c = 0; c < txfm_size_col; ++c)
    7545           0 :                 temp_in[c] = round_shift((int64_t)input[c] * NewInvSqrt2, NewSqrt2Bits);
    7546           0 :             clamp_buf(temp_in, txfm_size_col, (int8_t)(bd + 8));
    7547           0 :             txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
    7548             :         }
    7549             :         else {
    7550           0 :             for (c = 0; c < txfm_size_col; ++c)
    7551           0 :                 temp_in[c] = input[c];
    7552           0 :             clamp_buf(temp_in, txfm_size_col, (int8_t)(bd + 8));
    7553           0 :             txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
    7554             :         }
    7555           0 :         eb_av1_round_shift_array_c(buf_ptr, txfm_size_col, -shift[0]);
    7556           0 :         input += inputStride; // txfm_size_col;
    7557           0 :         buf_ptr += txfm_size_col;
    7558             :     }
    7559             :     // Columns
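                     :     // lr_flip / ud_flip come from set_flip_cfg(); the FLIPADST transform types
                     :     // are realized as the corresponding ADST plus a horizontal or vertical flip
                     :     // of the data, applied in the loops below.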
    7560           0 :     for (c = 0; c < txfm_size_col; ++c) {
    7561           0 :         if (cfg->lr_flip == 0) {
    7562           0 :             for (r = 0; r < txfm_size_row; ++r)
    7563           0 :                 temp_in[r] = buf[r * txfm_size_col + c];
    7564             :         }
    7565             :         else {
    7566             :             // flip left right
    7567           0 :             for (r = 0; r < txfm_size_row; ++r)
    7568           0 :                 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
    7569             :         }
    7570           0 :         clamp_buf(temp_in, txfm_size_row, (int8_t)AOMMAX(bd + 6, 16));
    7571           0 :         txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
    7572           0 :         eb_av1_round_shift_array_c(temp_out, txfm_size_row, -shift[1]);
    7573           0 :         if (cfg->ud_flip == 0) {
    7574           0 :             for (r = 0; r < txfm_size_row; ++r)
    7575           0 :                 output[r * outputStride + c] = temp_out[r];
    7576             :         }
    7577             :         else {
    7578             :             // flip upside down
    7579           0 :             for (r = 0; r < txfm_size_row; ++r)
    7580           0 :                 output[r * outputStride + c] = temp_out[txfm_size_row - r - 1];
    7581             :         }
    7582             :     }
    7583           0 : }
    7584             : 
    7585           0 : void Av1InverseTransformTwoD_4x4_c(
    7586             :     int32_t        *input,
    7587             :     uint32_t         input_stride,
    7588             :     int32_t        *output,
    7589             :     uint32_t         outputStride,
    7590             :     TxType        transform_type,
    7591             :     uint8_t          bit_depth)
    7592             : {
    7593             :     DECLARE_ALIGNED(32, int32_t, intermediateInverseTransformBuffer[4 * 4 + 4 + 4]);
    7594             :     Txfm2DFlipCfg cfg;
    7595             : 
    7596           0 :     Av1InverseTransformConfig(
    7597             :         transform_type,
    7598             :         TX_4X4,
    7599             :         &cfg);
    7600             :     // Forward shift sum uses larger square size, to be consistent with what
    7601             :     // eb_av1_gen_inv_stage_range() does for inverse shifts.
    7602           0 :     Av1InverseTransformTwoDCore_c(
    7603             :         input,
    7604             :         input_stride,
    7605             :         output,
    7606             :         outputStride,
    7607             :         &cfg,
    7608             :         intermediateInverseTransformBuffer,
    7609             :         TX_4X4,
    7610             :         bit_depth);
    7611           0 : }
    7612             : 
    7613           0 : void Av1InverseTransformTwoD_8x8_c(
    7614             :     int32_t        *input,
    7615             :     uint32_t         input_stride,
    7616             :     int32_t        *output,
    7617             :     uint32_t         outputStride,
    7618             :     TxType        transform_type,
    7619             :     uint8_t          bit_depth)
    7620             : {
    7621             :     DECLARE_ALIGNED(32, int32_t, intermediateInverseTransformBuffer[8 * 8 + 8 + 8]);
    7622             :     Txfm2DFlipCfg cfg;
    7623             : 
    7624           0 :     Av1InverseTransformConfig(
    7625             :         transform_type,
    7626             :         TX_8X8,
    7627             :         &cfg);
    7628             :     // Forward shift sum uses larger square size, to be consistent with what
    7629             :     // eb_av1_gen_inv_stage_range() does for inverse shifts.
    7630           0 :     Av1InverseTransformTwoDCore_c(
    7631             :         input,
    7632             :         input_stride,
    7633             :         output,
    7634             :         outputStride,
    7635             :         &cfg,
    7636             :         intermediateInverseTransformBuffer,
    7637             :         TX_8X8,
    7638             :         bit_depth);
    7639           0 : }
    7640             : 
    7641           0 : void Av1InverseTransformTwoD_16x16_c(
    7642             :     int32_t        *input,
    7643             :     uint32_t         input_stride,
    7644             :     int32_t        *output,
    7645             :     uint32_t         outputStride,
    7646             :     TxType        transform_type,
    7647             :     uint8_t          bit_depth)
    7648             : {
    7649             :     DECLARE_ALIGNED(32, int32_t, intermediateInverseTransformBuffer[16 * 16 + 16 + 16]);
    7650             :     Txfm2DFlipCfg cfg;
    7651             : 
    7652           0 :     Av1InverseTransformConfig(
    7653             :         transform_type,
    7654             :         TX_16X16,
    7655             :         &cfg);
    7656             :     // Forward shift sum uses larger square size, to be consistent with what
    7657             :     // eb_av1_gen_inv_stage_range() does for inverse shifts.
    7658           0 :     Av1InverseTransformTwoDCore_c(
    7659             :         input,
    7660             :         input_stride,
    7661             :         output,
    7662             :         outputStride,
    7663             :         &cfg,
    7664             :         intermediateInverseTransformBuffer,
    7665             :         TX_16X16,
    7666             :         bit_depth);
    7667           0 : }
    7668             : 
    7669           0 : void Av1InverseTransformTwoD_32x32_c(
    7670             :     int32_t        *input,
    7671             :     uint32_t         input_stride,
    7672             :     int32_t        *output,
    7673             :     uint32_t         outputStride,
    7674             :     TxType        transform_type,
    7675             :     uint8_t          bit_depth)
    7676             : {
    7677             :     DECLARE_ALIGNED(32, int32_t, intermediateInverseTransformBuffer[32 * 32 + 32 + 32]);
    7678             :     Txfm2DFlipCfg cfg;
    7679             : 
    7680           0 :     Av1InverseTransformConfig(
    7681             :         transform_type,
    7682             :         TX_32X32,
    7683             :         &cfg);
    7684             :     // Forward shift sum uses larger square size, to be consistent with what
    7685             :     // eb_av1_gen_inv_stage_range() does for inverse shifts.
    7686           0 :     Av1InverseTransformTwoDCore_c(
    7687             :         input,
    7688             :         input_stride,
    7689             :         output,
    7690             :         outputStride,
    7691             :         &cfg,
    7692             :         intermediateInverseTransformBuffer,
    7693             :         TX_32X32,
    7694             :         bit_depth);
    7695           0 : }
    7696             : 
    7697           0 : void Av1InverseTransformTwoD_64x64_c(
    7698             :     int32_t        *input,
    7699             :     uint32_t         input_stride,
    7700             :     int32_t        *output,
    7701             :     uint32_t         outputStride,
    7702             :     TxType        transform_type,
    7703             :     uint8_t          bit_depth)
    7704             : {
    7705             :     (void)input_stride;
    7706             :     // TODO(urvang): Can the same array be reused, instead of using a new array?
    7707             :     // Remap 32x32 input into a modified 64x64 by:
    7708             :     // - Copying over these values in top-left 32x32 locations.
    7709             :     // - Setting the rest of the locations to 0.
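                     :     // Note: AV1 signals at most the lowest 32 coefficients per 64-sample
                     :     // dimension, so the remaining positions are implicitly zero and are
                     :     // zero-filled here before running the full 64x64 inverse transform.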
    7710             :     uint32_t row;
    7711             :     int32_t mod_input[64 * 64];
    7712             : 
    7713           0 :     for (row = 0; row < 32; ++row) {
    7714           0 :         memcpy(mod_input + row * 64, input + row * 64, 32 * sizeof(*mod_input));
    7715           0 :         memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
    7716             :     }
    7717           0 :     memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input));
    7718             : 
    7719             :     DECLARE_ALIGNED(32, int32_t, intermediateInverseTransformBuffer[64 * 64 + 64 + 64]);
    7720             : 
    7721             :     Txfm2DFlipCfg cfg;
    7722             : 
    7723           0 :     Av1InverseTransformConfig(
    7724             :         transform_type,
    7725             :         TX_64X64,
    7726             :         &cfg);
    7727             :     // Forward shift sum uses larger square size, to be consistent with what
    7728             :     // eb_av1_gen_inv_stage_range() does for inverse shifts.
    7729           0 :     Av1InverseTransformTwoDCore_c(
    7730             :         mod_input,
    7731             :         64,
    7732             :         output,
    7733             :         outputStride,
    7734             :         &cfg,
    7735             :         intermediateInverseTransformBuffer,
    7736             :         TX_64X64,
    7737             :         bit_depth);
    7738           0 : }
    7739             : 
    7740             : /*********************************************************************
    7741             : * Estimate Inverse Transform
    7742             : *********************************************************************/
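                     : // Only the square transform sizes (4x4 .. 64x64) are dispatched here, and a
                     : // zero eob is treated as a no-op.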
    7743           0 : EbErrorType av1_estimate_inv_transform(
    7744             :     int32_t      *coeff_buffer,
    7745             :     uint32_t      coeff_stride,
    7746             :     int32_t      *recon_buffer,
    7747             :     uint32_t      recon_stride,
    7748             :     TxSize        transform_size,
    7749             :     int16_t      *transform_inner_array_ptr,
    7750             :     uint32_t      bit_increment,
    7751             :     TxType        transform_type,
    7752             :     uint32_t      eob,
    7753             :     uint32_t      partial_frequency_n2_flag)
    7754             : {
    7755           0 :     EbErrorType return_error = EB_ErrorNone;
    7756             : 
    7757             :     // Nader inverse transform
    7758             :     (void)transform_inner_array_ptr;
    7759             :     (void)partial_frequency_n2_flag;
    7760             : 
    7761             :     //TxSetType  transformSetType = transform_type == DCT_DCT ? EXT_TX_SET_DCTONLY : /*ADST_ADST*/ EXT_TX_SET_DTT4_IDTX ; // NM - Set to zero for the moment
    7762             : 
    7763           0 :     uint8_t      bit_depth = bit_increment ? 10 : 8; // NM - only 8-bit and 10-bit are handled for the moment
    7764             : 
    7765           0 :     if (eob) {
    7766             :         //    assert(av1_ext_tx_used[transformSetType][transform_type]);
    7767             : 
    7768           0 :         switch (transform_size) {
    7769           0 :         case TX_32X32:
    7770           0 :             Av1InverseTransformTwoD_32x32_c(
    7771             :                 coeff_buffer,
    7772             :                 coeff_stride,
    7773             :                 recon_buffer,
    7774             :                 recon_stride,
    7775             :                 transform_type,
    7776             :                 bit_depth);
    7777           0 :             break;
    7778           0 :         case TX_16X16:
    7779           0 :             Av1InverseTransformTwoD_16x16_c(
    7780             :                 coeff_buffer,
    7781             :                 coeff_stride,
    7782             :                 recon_buffer,
    7783             :                 recon_stride,
    7784             :                 transform_type,
    7785             :                 bit_depth);
    7786           0 :             break;
    7787           0 :         case TX_8X8:
    7788           0 :             Av1InverseTransformTwoD_8x8_c(
    7789             :                 coeff_buffer,
    7790             :                 coeff_stride,
    7791             :                 recon_buffer,
    7792             :                 recon_stride,
    7793             :                 transform_type,
    7794             :                 bit_depth);
    7795           0 :             break;
    7796           0 :         case TX_64X64:
    7797           0 :             Av1InverseTransformTwoD_64x64_c(
    7798             :                 coeff_buffer,
    7799             :                 coeff_stride,
    7800             :                 recon_buffer,
    7801             :                 recon_stride,
    7802             :                 transform_type,
    7803             :                 bit_depth);
    7804           0 :             break;
    7805           0 :         case TX_4X4:
    7806             :             // this is like av1_short_idct4x4 but has a special case around eob<=1
    7807             :             // which is significant (not just an optimization) for the lossless
    7808             :             // case.
    7809           0 :             Av1InverseTransformTwoD_4x4_c(
    7810             :                 coeff_buffer,
    7811             :                 coeff_stride,
    7812             :                 recon_buffer,
    7813             :                 recon_stride,
    7814             :                 transform_type,
    7815             :                 bit_depth);
    7816           0 :             break;
    7817             : 
    7819           0 :         default: assert(0 && "Invalid transform size"); break;
    7820             :         }
    7821           0 :     }
    7822             : 
    7823           0 :     return return_error;
    7824             : }
    7825             : 
    7826           0 : static const int32_t *cast_to_int32(const TranLow *input) {
    7827             :     assert(sizeof(int32_t) == sizeof(TranLow));
    7828           0 :     return (const int32_t *)input;
    7829             : }
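                     : // Fills a Txfm2DFlipCfg for the given (tx_type, tx_size): flip flags, the
                     : // per-pass shifts, cosine-table bit precisions, and the 1-D column/row
                     : // transform types with their stage counts.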
    7830           0 : void eb_av1_get_inv_txfm_cfg(TxType tx_type, TxSize tx_size,
    7831             :     Txfm2DFlipCfg *cfg) {
    7832           0 :     assert(cfg != NULL);
    7833           0 :     cfg->tx_size = tx_size;
    7834           0 :     set_flip_cfg(tx_type, cfg);
    7835           0 :     av1_zero(cfg->stage_range_col);
    7836           0 :     av1_zero(cfg->stage_range_row);
    7838           0 :     const TxType1D tx_type_1d_col = vtx_tab[tx_type];
    7839           0 :     const TxType1D tx_type_1d_row = htx_tab[tx_type];
    7840           0 :     cfg->shift = eb_inv_txfm_shift_ls[tx_size];
    7841           0 :     const int32_t txw_idx = get_txw_idx(tx_size);
    7842           0 :     const int32_t txh_idx = get_txh_idx(tx_size);
    7843           0 :     cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    7844           0 :     cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    7845           0 :     cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
    7846           0 :     if (cfg->txfm_type_col == TXFM_TYPE_ADST4)
    7847           0 :         memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range));
    7848           0 :     cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
    7849           0 :     if (cfg->txfm_type_row == TXFM_TYPE_ADST4)
    7850           0 :         memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range));
    7851           0 :     cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
    7852           0 :     cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
    7853           0 : }
    7854           0 : static INLINE void inv_txfm2d_add_c(const int32_t *input,
    7855             :     uint16_t *output_r, int32_t stride_r,
    7856             :     uint16_t *output_w, int32_t stride_w,
    7857             :     Txfm2DFlipCfg *cfg,
    7858             :     int32_t *txfm_buf, TxSize tx_size,
    7859             :     int32_t bd) {
    7860             :     // Note when assigning txfm_size_col, we use the txfm_size from the
    7861             :     // row configuration and vice versa. This is intentionally done to
    7862             :     // accurately perform rectangular transforms. When the transform is
    7863             :     // rectangular, the number of columns will be the same as the
    7864             :     // txfm_size stored in the row cfg struct. It will make no difference
    7865             :     // for square transforms.
    7866           0 :     const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
    7867           0 :     const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
    7868             :     // Take the shift from the larger dimension in the rectangular case.
    7869           0 :     const int8_t *shift = cfg->shift;
    7870           0 :     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    7871             :     int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
    7872             :     int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
    7873           0 :     assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
    7874           0 :     assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
    7875           0 :     eb_av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd);
    7876             : 
    7877           0 :     const int8_t cos_bit_col = cfg->cos_bit_col;
    7878           0 :     const int8_t cos_bit_row = cfg->cos_bit_row;
    7879           0 :     const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col);
    7880           0 :     const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row);
    7881             :     ASSERT(txfm_func_col);
    7882             :     ASSERT(txfm_func_row);
    7883             :     // txfm_buf's length is txfm_size_row * txfm_size_col + 2 *
    7884             :     // AOMMAX(txfm_size_row, txfm_size_col)
    7885             :     // it is used for intermediate data buffering
    7886           0 :     const int32_t buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
    7887           0 :     int32_t *temp_in = txfm_buf;
    7888           0 :     int32_t *temp_out = temp_in + buf_offset;
    7889           0 :     int32_t *buf = temp_out + buf_offset;
    7890           0 :     int32_t *buf_ptr = buf;
    7891             :     int32_t c, r;
    7892             : 
    7893             :     // Rows
    7894           0 :     for (r = 0; r < txfm_size_row; ++r) {
    7895           0 :         if (abs(rect_type) == 1) {
    7896           0 :             for (c = 0; c < txfm_size_col; ++c)
    7897           0 :                 temp_in[c] = round_shift((int64_t)input[c] * NewInvSqrt2, NewSqrt2Bits);
    7898           0 :             clamp_buf(temp_in, txfm_size_col, (int8_t)(bd + 8));
    7899           0 :             txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
    7900             :         }
    7901             :         else {
    7902           0 :             for (c = 0; c < txfm_size_col; ++c)
    7903           0 :                 temp_in[c] = input[c];
    7904           0 :             clamp_buf(temp_in, txfm_size_col, (int8_t)(bd + 8));
    7905           0 :             txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
    7906             :         }
    7907           0 :         eb_av1_round_shift_array_c(buf_ptr, txfm_size_col, -shift[0]);
    7908           0 :         input += txfm_size_col;
    7909           0 :         buf_ptr += txfm_size_col;
    7910             :     }
    7911             : 
    7912             :     // Columns
    7913           0 :     for (c = 0; c < txfm_size_col; ++c) {
    7914           0 :         if (cfg->lr_flip == 0) {
    7915           0 :             for (r = 0; r < txfm_size_row; ++r)
    7916           0 :                 temp_in[r] = buf[r * txfm_size_col + c];
    7917             :         }
    7918             :         else {
    7919             :             // flip left right
    7920           0 :             for (r = 0; r < txfm_size_row; ++r)
    7921           0 :                 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
    7922             :         }
    7923           0 :         clamp_buf(temp_in, txfm_size_row, (int8_t)(AOMMAX(bd + 6, 16)));
    7924           0 :         txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
    7925           0 :         eb_av1_round_shift_array_c(temp_out, txfm_size_row, -shift[1]);
    7926           0 :         if (cfg->ud_flip == 0) {
    7927           0 :             for (r = 0; r < txfm_size_row; ++r) {
    7928           0 :                 output_w[r * stride_w + c] =
    7929           0 :                     highbd_clip_pixel_add(output_r[r * stride_r + c], temp_out[r], bd);
    7930             :             }
    7931             :         }
    7932             :         else {
    7933             :             // flip upside down
    7934           0 :             for (r = 0; r < txfm_size_row; ++r) {
    7935           0 :                 output_w[r * stride_w + c] = highbd_clip_pixel_add(
    7936           0 :                     output_r[r * stride_r + c], temp_out[txfm_size_row - r - 1], bd);
    7937             :             }
    7938             :         }
    7939             :     }
    7940           0 : }
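                     : // Builds the inverse-transform configuration and forwards to inv_txfm2d_add_c.
                     : // The separate output_r/stride_r and output_w/stride_w arguments allow the
                     : // buffer that is read and the buffer that is written to differ.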
    7941           0 : static INLINE void inv_txfm2d_add_facade(const int32_t *input,
    7942             :     uint16_t *output_r, int32_t stride_r,
    7943             :     uint16_t *output_w, int32_t stride_w,
    7944             :     int32_t *txfm_buf,
    7945             :     TxType tx_type, TxSize tx_size,
    7946             :     int32_t bd) {
    7947             :     Txfm2DFlipCfg cfg;
    7948           0 :     eb_av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg);
    7949             :     // Forward shift sum uses larger square size, to be consistent with what
    7950             :     // eb_av1_gen_inv_stage_range() does for inverse shifts.
    7951           0 :     inv_txfm2d_add_c(input, output_r, stride_r, output_w, stride_w,
    7952             :         &cfg, txfm_buf, tx_size, bd);
    7953           0 : }
    7954           0 : void eb_av1_inv_txfm2d_add_4x4_c(const int32_t *input,
    7955             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w,
    7956             :     int32_t stride_w, TxType tx_type, int32_t bd) {
    7957             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 4 + 4 + 4]);
    7958           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    7959             :         txfm_buf, tx_type, TX_4X4, bd);
    7960           0 : }
    7961           0 : void eb_av1_inv_txfm2d_add_8x8_c(const int32_t *input,
    7962             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    7963             :     TxType tx_type, int32_t bd) {
    7964             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 8 + 8 + 8]);
    7965           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    7966             :         txfm_buf, tx_type, TX_8X8, bd);
    7967           0 : }
    7968           0 : void eb_av1_inv_txfm2d_add_16x16_c(const int32_t *input,
    7969             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    7970             :     TxType tx_type, int32_t bd) {
    7971             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 16 + 16 + 16]);
    7972           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    7973             :         txfm_buf, tx_type, TX_16X16, bd);
    7974           0 : }
    7975             : 
    7976           0 : void eb_av1_inv_txfm2d_add_32x32_c(const int32_t *input,
    7977             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    7978             :     TxType tx_type, int32_t bd) {
    7979             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 32 + 32 + 32]);
    7980           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    7981             :         txfm_buf, tx_type, TX_32X32, bd);
    7982           0 : }
    7983             : 
    7984           0 : void eb_av1_inv_txfm2d_add_64x64_c(const int32_t *input,
    7985             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    7986             :     TxType tx_type, int32_t bd) {
    7987             :     // TODO(urvang): Can the same array be reused, instead of using a new array?
    7988             :     // Remap 32x32 input into a modified 64x64 by:
    7989             :     // - Copying over these values in top-left 32x32 locations.
    7990             :     // - Setting the rest of the locations to 0.
    7991             :     int32_t mod_input[64 * 64];
    7992           0 :     for (int32_t row = 0; row < 32; ++row) {
    7993           0 :         memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
    7994           0 :         memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
    7995             :     }
    7996           0 :     memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input));
    7997             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 64 + 64 + 64]);
    7998           0 :     inv_txfm2d_add_facade(mod_input, output_r, stride_r, output_w, stride_w,
    7999             :         txfm_buf, tx_type, TX_64X64, bd);
    8000           0 : }
    8001             : 
    8002           0 : void eb_av1_inv_txfm2d_add_4x8_c(const int32_t *input,
    8003             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8004             :     TxType tx_type, TxSize tx_size, int32_t bd) {
    8005             :     (void)tx_size;
    8006             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8 + 8 + 8]);
    8007           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    8008             :         txfm_buf, tx_type, TX_4X8, bd);
    8009           0 : }
    8010             : 
    8011           0 : void eb_av1_inv_txfm2d_add_8x4_c(const int32_t *input,
    8012             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8013             :     TxType tx_type, TxSize tx_size, int32_t bd) {
    8014             :     (void)tx_size;
    8015             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 4 + 8 + 8]);
    8016           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    8017             :         txfm_buf, tx_type, TX_8X4, bd);
    8018           0 : }
    8019             : 
    8020           0 : void eb_av1_inv_txfm2d_add_8x16_c(const int32_t *input,
    8021             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8022             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd) {
    8023             :     UNUSED(tx_size);
    8024             :     UNUSED(eob);
    8025             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16 + 16 + 16]);
    8026           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    8027             :         txfm_buf, tx_type, TX_8X16, bd);
    8028           0 : }
    8029             : 
    8030           0 : void eb_av1_inv_txfm2d_add_16x8_c(const int32_t *input,
    8031             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8032             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd) {
    8033             :     UNUSED(tx_size);
    8034             :     UNUSED(eob);
    8035             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 8 + 16 + 16]);
    8036           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    8037             :         txfm_buf, tx_type, TX_16X8, bd);
    8038           0 : }
    8039             : 
    8040           0 : void eb_av1_inv_txfm2d_add_16x32_c(const int32_t *input,
    8041             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8042             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd) {
    8043             :     UNUSED(tx_size);
    8044             :     UNUSED(eob);
    8045             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32 + 32 + 32]);
    8046           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    8047             :         txfm_buf, tx_type, TX_16X32, bd);
    8048           0 : }
    8049             : 
    8050           0 : void eb_av1_inv_txfm2d_add_32x16_c(const int32_t *input,
    8051             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8052             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd) {
    8053             :     UNUSED(tx_size);
    8054             :     UNUSED(eob);
    8055             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 16 + 32 + 32]);
    8056           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    8057             :         txfm_buf, tx_type, TX_32X16, bd);
    8058           0 : }
    8059             : 
    8060           0 : void eb_av1_inv_txfm2d_add_64x32_c(const int32_t *input,
    8061             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8062             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd) {
    8063             :     UNUSED(tx_size);
    8064             :     UNUSED(eob);
    8065             :     // Remap 32x32 input into a modified 64x32 by:
    8066             :     // - Copying over these values in top-left 32x32 locations.
    8067             :     // - Setting the rest of the locations to 0.
    8068             :     int32_t mod_input[64 * 32];
    8069           0 :     for (int32_t row = 0; row < 32; ++row) {
    8070           0 :         memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
    8071           0 :         memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
    8072             :     }
    8073             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 32 + 64 + 64]);
    8074           0 :     inv_txfm2d_add_facade(mod_input, output_r, stride_r, output_w, stride_w,
    8075             :         txfm_buf, tx_type, TX_64X32,
    8076             :         bd);
    8077           0 : }
    8078             : 
    8079           0 : void eb_av1_inv_txfm2d_add_32x64_c(const int32_t *input,
    8080             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8081             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd)  {
    8082             :     UNUSED(tx_size);
    8083             :     UNUSED(eob);
    8084             :     // Remap 32x32 input into a modified 32x64 input by:
    8085             :     // - Copying over these values in top-left 32x32 locations.
    8086             :     // - Setting the rest of the locations to 0.
    8087             :     int32_t mod_input[32 * 64];
    8088           0 :     memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input));
    8089           0 :     memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input));
    8090             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 32 + 64 + 64]);
    8091           0 :     inv_txfm2d_add_facade(mod_input, output_r, stride_r, output_w, stride_w,
    8092             :         txfm_buf, tx_type, TX_32X64, bd);
    8093           0 : }
    8094             : 
    8095           0 : void eb_av1_inv_txfm2d_add_16x64_c(const int32_t *input,
    8096             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8097             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd)  {
    8098             :     UNUSED(tx_size);
    8099             :     UNUSED(eob);
    8100             :     // Remap 16x32 input into a modified 16x64 input by:
    8101             :     // - Copying over these values in top-left 16x32 locations.
    8102             :     // - Setting the rest of the locations to 0.
    8103             :     int32_t mod_input[16 * 64];
    8104           0 :     memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input));
    8105           0 :     memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input));
    8106             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 64 + 64 + 64]);
    8107           0 :     inv_txfm2d_add_facade(mod_input, output_r, stride_r, output_w, stride_w,
    8108             :         txfm_buf, tx_type, TX_16X64, bd);
    8109           0 : }
    8110             : 
    8111           0 : void eb_av1_inv_txfm2d_add_64x16_c(const int32_t *input,
    8112             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8113             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd) {
    8114             :     UNUSED(tx_size);
    8115             :     UNUSED(eob);
    8116             :     // Remap 32x16 input into a modified 64x16 by:
    8117             :     // - Copying over these values in top-left 32x16 locations.
    8118             :     // - Setting the rest of the locations to 0.
    8119             :     int32_t mod_input[64 * 16];
    8120           0 :     for (int32_t row = 0; row < 16; ++row) {
    8121           0 :         memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
    8122           0 :         memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
    8123             :     }
    8124             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 64 + 64 + 64]);
    8125           0 :     inv_txfm2d_add_facade(mod_input, output_r, stride_r, output_w, stride_w,
    8126             :         txfm_buf, tx_type, TX_64X16, bd);
    8127           0 : }
    8128             : 
    8129           0 : void eb_av1_inv_txfm2d_add_4x16_c(const int32_t *input,
    8130             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8131             :     TxType tx_type, TxSize tx_size, int32_t bd)  {
    8132             :     UNUSED(tx_size);
    8133             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16 + 16 + 16]);
    8134           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    8135             :         txfm_buf, tx_type, TX_4X16, bd);
    8136           0 : }
    8137             : 
    8138           0 : void eb_av1_inv_txfm2d_add_16x4_c(const int32_t *input,
    8139             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8140             :     TxType tx_type, TxSize tx_size, int32_t bd)  {
    8141             :     UNUSED(tx_size);
    8142             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16 + 16 + 16]);
    8143           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    8144             :         txfm_buf, tx_type, TX_16X4, bd);
    8145           0 : }
    8146             : 
    8147           0 : void eb_av1_inv_txfm2d_add_8x32_c(const int32_t *input,
    8148             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8149             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd)  {
    8150             :     UNUSED(tx_size);
    8151             :     UNUSED(eob);
    8152             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 32 + 32 + 32]);
    8153           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    8154             :         txfm_buf, tx_type, TX_8X32, bd);
    8155           0 : }
    8156             : 
    8157           0 : void eb_av1_inv_txfm2d_add_32x8_c(const int32_t *input,
    8158             :     uint16_t *output_r, int32_t stride_r, uint16_t *output_w, int32_t stride_w,
    8159             :     TxType tx_type, TxSize tx_size, int32_t eob, int32_t bd) {
    8160             :     UNUSED(tx_size);
    8161             :     UNUSED(eob);
    8162             :     DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 32 + 32 + 32]);
    8163           0 :     inv_txfm2d_add_facade(input, output_r, stride_r, output_w, stride_w,
    8164             :         txfm_buf, tx_type, TX_32X8, bd);
    8165           0 : }
    8166             : 
    8167           0 : static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
    8168             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    8169             :     const int64_t max_value = (1LL << (bit - 1)) - 1;
    8170             :     const int64_t min_value = -(1LL << (bit - 1));
    8171             :     if (value < min_value || value > max_value) {
    8172             :         fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit);
    8173             :         assert(0);
    8174             :     }
    8175             : #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    8176             : #if DO_RANGE_CHECK_CLAMP
    8177             :     bit = AOMMIN(bit, 31);
    8178             :     return clamp(value, (1 << (bit - 1)) - 1, -(1 << (bit - 1)));
    8179             : #endif  // DO_RANGE_CHECK_CLAMP
    8180             :     (void)bit;
    8181           0 :     return value;
    8182             : }
    8183             : 
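                     : // Inverse 4x4 Walsh-Hadamard transform used by the lossless path (see
                     : // highbd_iwht4x4_add() and the lossless branch in
                     : // eb_av1_highbd_inv_txfm_add_4x4() below).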
    8184           0 : void eb_av1_highbd_iwht4x4_16_add_c(const TranLow *input,
    8185             :     uint8_t *dest8_r, int32_t stride_r, uint8_t *dest8_w, int32_t stride_w,
    8186             :     int32_t bd) {
    8187             :     /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
    8188             :        0.5 shifts per pixel. */
    8189             :     int32_t i;
    8190             :     TranLow output[16];
    8191             :     TranLow a1, b1, c1, d1, e1;
    8192           0 :     const TranLow *ip = input;
    8193           0 :     TranLow *op = output;
    8194           0 :     uint16_t *dest_r = CONVERT_TO_SHORTPTR(dest8_r);
    8195           0 :     uint16_t *dest_w = CONVERT_TO_SHORTPTR(dest8_w);
    8196             : 
    8197           0 :     for (i = 0; i < 4; i++) {
    8198           0 :         a1 = ip[0] >> UNIT_QUANT_SHIFT;
    8199           0 :         c1 = ip[1] >> UNIT_QUANT_SHIFT;
    8200           0 :         d1 = ip[2] >> UNIT_QUANT_SHIFT;
    8201           0 :         b1 = ip[3] >> UNIT_QUANT_SHIFT;
    8202           0 :         a1 += c1;
    8203           0 :         d1 -= b1;
    8204           0 :         e1 = (a1 - d1) >> 1;
    8205           0 :         b1 = e1 - b1;
    8206           0 :         c1 = e1 - c1;
    8207           0 :         a1 -= b1;
    8208           0 :         d1 += c1;
    8209           0 :         op[0] = a1;
    8210           0 :         op[1] = b1;
    8211           0 :         op[2] = c1;
    8212           0 :         op[3] = d1;
    8213           0 :         ip += 4;
    8214           0 :         op += 4;
    8215             :     }
    8216             : 
    8217           0 :     ip = output;
    8218           0 :     for (i = 0; i < 4; i++) {
    8219           0 :         a1 = ip[4 * 0];
    8220           0 :         c1 = ip[4 * 1];
    8221           0 :         d1 = ip[4 * 2];
    8222           0 :         b1 = ip[4 * 3];
    8223           0 :         a1 += c1;
    8224           0 :         d1 -= b1;
    8225           0 :         e1 = (a1 - d1) >> 1;
    8226           0 :         b1 = e1 - b1;
    8227           0 :         c1 = e1 - c1;
    8228           0 :         a1 -= b1;
    8229           0 :         d1 += c1;
    8230           0 :         range_check_value(a1, (int8_t)(bd + 1));
    8231           0 :         range_check_value(b1, (int8_t)(bd + 1));
    8232           0 :         range_check_value(c1, (int8_t)(bd + 1));
    8233           0 :         range_check_value(d1, (int8_t)(bd + 1));
    8234             : 
    8235           0 :         dest_w[stride_w * 0] = highbd_clip_pixel_add(dest_r[stride_r * 0], a1, bd);
    8236           0 :         dest_w[stride_w * 1] = highbd_clip_pixel_add(dest_r[stride_r * 1], b1, bd);
    8237           0 :         dest_w[stride_w * 2] = highbd_clip_pixel_add(dest_r[stride_r * 2], c1, bd);
    8238           0 :         dest_w[stride_w * 3] = highbd_clip_pixel_add(dest_r[stride_r * 3], d1, bd);
    8239             : 
    8240           0 :         ip++;
    8241           0 :         dest_r++;
    8242           0 :         dest_w++;
    8243             :     }
    8244           0 : }
    8245             : 
    8246           0 : void eb_av1_highbd_iwht4x4_1_add_c(const TranLow *in,
    8247             :     uint8_t *dest8_r, int32_t dest_stride_r,
    8248             :     uint8_t *dest8_w, int32_t dest_stride_w,
    8249             :     int32_t bd) {
    8250             :     int32_t i;
    8251             :     TranLow a1, e1;
    8252             :     TranLow tmp[4];
    8253           0 :     const TranLow *ip = in;
    8254           0 :     TranLow *op = tmp;
    8255           0 :     uint16_t *dest_r = CONVERT_TO_SHORTPTR(dest8_r);
    8256           0 :     uint16_t *dest_w = CONVERT_TO_SHORTPTR(dest8_w);
    8257             :     (void)bd;
    8258             : 
    8259           0 :     a1 = ip[0] >> UNIT_QUANT_SHIFT;
    8260           0 :     e1 = a1 >> 1;
    8261           0 :     a1 -= e1;
    8262           0 :     op[0] = a1;
    8263           0 :     op[1] = op[2] = op[3] = e1;
    8264             : 
    8265           0 :     ip = tmp;
    8266           0 :     for (i = 0; i < 4; i++) {
    8267           0 :         e1 = ip[0] >> 1;
    8268           0 :         a1 = ip[0] - e1;
    8269           0 :         dest_w[dest_stride_w * 0] =
    8270           0 :             highbd_clip_pixel_add(dest_r[dest_stride_r * 0], a1, bd);
    8271           0 :         dest_w[dest_stride_w * 1] =
    8272           0 :             highbd_clip_pixel_add(dest_r[dest_stride_r * 1], e1, bd);
    8273           0 :         dest_w[dest_stride_w * 2] =
    8274           0 :             highbd_clip_pixel_add(dest_r[dest_stride_r * 2], e1, bd);
    8275           0 :         dest_w[dest_stride_w * 3] =
    8276           0 :             highbd_clip_pixel_add(dest_r[dest_stride_r * 3], e1, bd);
    8277           0 :         ip++;
    8278           0 :         dest_r++;
    8279           0 :         dest_w++;
    8280             :     }
    8281           0 : }
    8282           0 : static void highbd_iwht4x4_add(const TranLow *input,
    8283             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8284             :     int32_t eob, int32_t bd) {
    8285           0 :     if (eob > 1)
    8286           0 :         eb_av1_highbd_iwht4x4_16_add_c(input,
    8287             :             dest_r, stride_r, dest_w, stride_w, bd);
    8288             :     else
    8289           0 :         eb_av1_highbd_iwht4x4_1_add_c(input,
    8290             :             dest_r, stride_r, dest_w, stride_w, bd);
    8291           0 : }
    8292           0 : void eb_av1_highbd_inv_txfm_add_4x4(const TranLow *input,
    8293             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8294             :     const TxfmParam *txfm_param) {
    8295             :     // assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    8296           0 :     int32_t eob = txfm_param->eob;
    8297           0 :     int32_t bd = txfm_param->bd;
    8298           0 :     int32_t lossless = txfm_param->lossless;
    8299           0 :     const int32_t *src = cast_to_int32(input);
    8300           0 :     const TxType tx_type = txfm_param->tx_type;
    8301           0 :     if (lossless) {
    8302           0 :         assert(tx_type == DCT_DCT);
    8303           0 :         highbd_iwht4x4_add(input,
    8304             :             dest_r, stride_r, dest_w, stride_w, eob, bd);
    8305           0 :         return;
    8306             :     }
    8307           0 :     eb_av1_inv_txfm2d_add_4x4(src,
    8308           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8309           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8310             :         tx_type, bd);
    8311             : }
    8312           0 : static void highbd_inv_txfm_add_8x8(const TranLow *input,
    8313             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8314             :     const TxfmParam *txfm_param) {
    8315           0 :     int32_t bd = txfm_param->bd;
    8316           0 :     const TxType tx_type = txfm_param->tx_type;
    8317           0 :     const int32_t *src = cast_to_int32(input);
    8318           0 :     eb_av1_inv_txfm2d_add_8x8(src,
    8319           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8320           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8321             :         tx_type, bd);
    8322           0 : }
    8323             : 
    8324           0 : static void highbd_inv_txfm_add_16x16(const TranLow *input,
    8325             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8326             :     const TxfmParam *txfm_param) {
    8327           0 :     int32_t bd = txfm_param->bd;
    8328           0 :     const TxType tx_type = txfm_param->tx_type;
    8329           0 :     const int32_t *src = cast_to_int32(input);
    8330           0 :     eb_av1_inv_txfm2d_add_16x16(src,
    8331           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8332           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8333             :         tx_type, bd);
    8334           0 : }
    8335             : 
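                     : // Only DCT_DCT and IDTX are valid 32x32 transform types in AV1; any other
                     : // type asserts in the switch below.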
    8336           0 : static void highbd_inv_txfm_add_32x32(const TranLow *input,
    8337             :     uint8_t *dest_r, int32_t stride_r,
    8338             :     uint8_t *dest_w, int32_t stride_w,
    8339             :     const TxfmParam *txfm_param) {
    8340           0 :     const int32_t bd = txfm_param->bd;
    8341           0 :     const TxType tx_type = txfm_param->tx_type;
    8342           0 :     const int32_t *src = cast_to_int32(input);
    8343           0 :     switch (tx_type) {
    8344           0 :     case DCT_DCT:
    8345             :     case IDTX:
    8346           0 :         eb_av1_inv_txfm2d_add_32x32(src,
    8347           0 :             CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8348           0 :             CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8349             :             tx_type, bd);
    8350           0 :         break;
    8351           0 :     default:
    8352           0 :         assert(0);
    8353             :     }
    8354           0 : }
    8355             : 
    8356           0 : static void highbd_inv_txfm_add_64x64(const TranLow *input,
    8357             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8358             :     const TxfmParam *txfm_param) {
    8359           0 :     const int32_t bd = txfm_param->bd;
    8360           0 :     const TxType tx_type = txfm_param->tx_type;
    8361           0 :     const int32_t *src = cast_to_int32(input);
    8362           0 :     assert(tx_type == DCT_DCT);
    8363           0 :     eb_av1_inv_txfm2d_add_64x64(src,
    8364           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8365           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8366             :         tx_type, bd);
    8367           0 : }
    8368             : 
    8369           0 : static void highbd_inv_txfm_add_4x8(const TranLow *input,
    8370             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8371             :     const TxfmParam *txfm_param) {
    8372             :     //TODO: add this assert once we fill tx_set_type    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    8373           0 :     const int32_t *src = cast_to_int32(input);
    8374           0 :     eb_av1_inv_txfm2d_add_4x8(src,
    8375           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8376           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8377           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->bd);
    8378           0 : }
    8379             : 
    8380           0 : static void highbd_inv_txfm_add_8x4(const TranLow *input,
    8381             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8382             :     const TxfmParam *txfm_param) {
    8383             :     //TODO: add this assert once we fill tx_set_type    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    8384           0 :     const int32_t *src = cast_to_int32(input);
    8385           0 :     eb_av1_inv_txfm2d_add_8x4(src,
    8386           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8387           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8388           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->bd);
    8389           0 : }
    8390             : 
    8391           0 : static void highbd_inv_txfm_add_8x16(const TranLow *input,
    8392             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8393             :     const TxfmParam *txfm_param) {
    8394           0 :     const int32_t *src = cast_to_int32(input);
    8395           0 :     eb_av1_inv_txfm2d_add_8x16(src,
    8396           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8397           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8398           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd);
    8399           0 : }
    8400             : 
    8401           0 : static void highbd_inv_txfm_add_16x8(const TranLow *input,
    8402             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8403             :     const TxfmParam *txfm_param) {
    8404           0 :     const int32_t *src = cast_to_int32(input);
    8405           0 :     eb_av1_inv_txfm2d_add_16x8(src,
    8406           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8407           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8408           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd);
    8409           0 : }
    8410             : 
    8411           0 : static void highbd_inv_txfm_add_16x32(const TranLow *input,
    8412             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8413             :     const TxfmParam *txfm_param) {
    8414           0 :     const int32_t *src = cast_to_int32(input);
    8415           0 :     eb_av1_inv_txfm2d_add_16x32(src,
    8416           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8417           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8418           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd);
    8419           0 : }
    8420             : 
    8421           0 : static void highbd_inv_txfm_add_32x16(const TranLow *input,
    8422             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8423             :     const TxfmParam *txfm_param) {
    8424           0 :     const int32_t *src = cast_to_int32(input);
    8425           0 :     eb_av1_inv_txfm2d_add_32x16(src,
    8426           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8427           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8428           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd);
    8429           0 : }
    8430             : 
    8431           0 : static void highbd_inv_txfm_add_16x4(const TranLow *input,
    8432             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8433             :     const TxfmParam *txfm_param) {
    8434           0 :     const int32_t *src = cast_to_int32(input);
    8435           0 :     eb_av1_inv_txfm2d_add_16x4(src,
    8436           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8437           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8438           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->bd);
    8439           0 : }
    8440             : 
    8441           0 : static void highbd_inv_txfm_add_4x16(const TranLow *input,
    8442             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8443             :     const TxfmParam *txfm_param) {
    8444           0 :     const int32_t *src = cast_to_int32(input);
    8445           0 :     eb_av1_inv_txfm2d_add_4x16(src,
    8446           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8447           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8448           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->bd);
    8449           0 : }
    8450             : 
    8451           0 : static void highbd_inv_txfm_add_32x8(const TranLow *input,
    8452             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8453             :     const TxfmParam *txfm_param) {
    8454           0 :     const int32_t *src = cast_to_int32(input);
    8455           0 :     eb_av1_inv_txfm2d_add_32x8(src,
    8456           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8457           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8458           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd);
    8459           0 : }
    8460             : 
    8461           0 : static void highbd_inv_txfm_add_8x32(const TranLow *input,
    8462             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8463             :     const TxfmParam *txfm_param) {
    8464           0 :     const int32_t *src = cast_to_int32(input);
    8465           0 :     eb_av1_inv_txfm2d_add_8x32(src,
    8466           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8467           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8468           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd);
    8469           0 : }
    8470             : 
    8471           0 : static void highbd_inv_txfm_add_32x64(const TranLow *input,
    8472             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8473             :     const TxfmParam *txfm_param) {
    8474           0 :     const int32_t *src = cast_to_int32(input);
    8475           0 :     eb_av1_inv_txfm2d_add_32x64(src,
    8476           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8477           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8478           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd);
    8479           0 : }
    8480             : 
    8481           0 : static void highbd_inv_txfm_add_64x32(const TranLow *input,
    8482             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8483             :     const TxfmParam *txfm_param) {
    8484           0 :     const int32_t *src = cast_to_int32(input);
    8485           0 :     eb_av1_inv_txfm2d_add_64x32(src,
    8486           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8487           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8488           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd);
    8489           0 : }
    8490             : 
    8491           0 : static void highbd_inv_txfm_add_16x64(const TranLow *input,
    8492             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8493             :     const TxfmParam *txfm_param) {
    8494           0 :     const int32_t *src = cast_to_int32(input);
    8495           0 :     eb_av1_inv_txfm2d_add_16x64(src,
    8496           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8497           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8498           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd);
    8499           0 : }
    8500             : 
    8501           0 : static void highbd_inv_txfm_add_64x16(const TranLow *input,
    8502             :     uint8_t *dest_r, int32_t stride_r, uint8_t *dest_w, int32_t stride_w,
    8503             :     const TxfmParam *txfm_param) {
    8504           0 :     const int32_t *src = cast_to_int32(input);
    8505           0 :     eb_av1_inv_txfm2d_add_64x16(src,
    8506           0 :         CONVERT_TO_SHORTPTR(dest_r), stride_r,
    8507           0 :         CONVERT_TO_SHORTPTR(dest_w), stride_w,
    8508           0 :         txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd);
    8509           0 : }
    8510             : 
    8511           0 : static void highbd_inv_txfm_add(const TranLow *input,
    8512             :     uint8_t *dest_r, int32_t stride_r,
    8513             :     uint8_t *dest_w, int32_t stride_w,
    8514             :     const TxfmParam *txfm_param) {
    8515             :     //assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    8516           0 :     const TxSize tx_size = txfm_param->tx_size;
    8517           0 :     switch (tx_size) {
    8518           0 :     case TX_32X32:
    8519           0 :         highbd_inv_txfm_add_32x32(input, dest_r, stride_r, dest_w, stride_w,
    8520             :             txfm_param);
    8521           0 :         break;
    8522           0 :     case TX_16X16:
    8523           0 :         highbd_inv_txfm_add_16x16(input, dest_r, stride_r, dest_w, stride_w,
    8524             :             txfm_param);
    8525           0 :         break;
    8526           0 :     case TX_8X8:
    8527           0 :         highbd_inv_txfm_add_8x8(input, dest_r, stride_r, dest_w, stride_w,
    8528             :             txfm_param);
    8529           0 :         break;
    8530           0 :     case TX_4X8:
    8531           0 :         highbd_inv_txfm_add_4x8(input, dest_r, stride_r, dest_w, stride_w,
    8532             :             txfm_param);
    8533           0 :         break;
    8534           0 :     case TX_8X4:
    8535           0 :         highbd_inv_txfm_add_8x4(input, dest_r, stride_r, dest_w, stride_w,
    8536             :             txfm_param);
    8537           0 :         break;
    8538           0 :     case TX_8X16:
    8539           0 :         highbd_inv_txfm_add_8x16(input, dest_r, stride_r, dest_w, stride_w,
    8540             :             txfm_param);
    8541           0 :         break;
    8542           0 :     case TX_16X8:
    8543           0 :         highbd_inv_txfm_add_16x8(input, dest_r, stride_r, dest_w, stride_w,
    8544             :             txfm_param);
    8545           0 :         break;
    8546           0 :     case TX_16X32:
    8547           0 :         highbd_inv_txfm_add_16x32(input, dest_r, stride_r, dest_w, stride_w,
    8548             :             txfm_param);
    8549           0 :         break;
    8550           0 :     case TX_32X16:
    8551           0 :         highbd_inv_txfm_add_32x16(input, dest_r, stride_r, dest_w, stride_w,
    8552             :             txfm_param);
    8553           0 :         break;
    8554           0 :     case TX_64X64:
    8555           0 :         highbd_inv_txfm_add_64x64(input, dest_r, stride_r, dest_w, stride_w,
    8556             :             txfm_param);
    8557           0 :         break;
    8558           0 :     case TX_32X64:
    8559           0 :         highbd_inv_txfm_add_32x64(input, dest_r, stride_r, dest_w, stride_w,
    8560             :             txfm_param);
    8561           0 :         break;
    8562           0 :     case TX_64X32:
    8563           0 :         highbd_inv_txfm_add_64x32(input, dest_r, stride_r, dest_w, stride_w,
    8564             :             txfm_param);
    8565           0 :         break;
    8566           0 :     case TX_16X64:
    8567           0 :         highbd_inv_txfm_add_16x64(input, dest_r, stride_r, dest_w, stride_w,
    8568             :             txfm_param);
    8569           0 :         break;
    8570           0 :     case TX_64X16:
    8571           0 :         highbd_inv_txfm_add_64x16(input, dest_r, stride_r, dest_w, stride_w,
    8572             :             txfm_param);
    8573           0 :         break;
    8574           0 :     case TX_4X4:
    8575             :         // this is like av1_short_idct4x4 but has a special case around eob<=1
    8576             :         // which is significant (not just an optimization) for the lossless
    8577             :         // case.
    8578           0 :         eb_av1_highbd_inv_txfm_add_4x4(input,
    8579             :             dest_r, stride_r, dest_w, stride_w,
    8580             :             txfm_param);
    8581           0 :         break;
    8582           0 :     case TX_16X4:
    8583           0 :         highbd_inv_txfm_add_16x4(input, dest_r, stride_r, dest_w, stride_w,
    8584             :             txfm_param);
    8585           0 :         break;
    8586           0 :     case TX_4X16:
    8587           0 :         highbd_inv_txfm_add_4x16(input, dest_r, stride_r, dest_w, stride_w,
    8588             :             txfm_param);
    8589           0 :         break;
    8590           0 :     case TX_8X32:
    8591           0 :         highbd_inv_txfm_add_8x32(input, dest_r, stride_r, dest_w, stride_w,
    8592             :             txfm_param);
    8593           0 :         break;
    8594           0 :     case TX_32X8:
    8595           0 :         highbd_inv_txfm_add_32x8(input, dest_r, stride_r, dest_w, stride_w,
    8596             :             txfm_param);
    8597           0 :         break;
    8598           0 :     default: assert(0 && "Invalid transform size"); break;
    8599             :     }
    8600           0 : }
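/*
 * Editorial sketch: the dispatcher above is a switch over tx_size that routes
 * to one per-size kernel (with TX_4X4 sent to eb_av1_highbd_inv_txfm_add_4x4
 * because of its eob <= 1 lossless special case). The same shape of dispatch
 * can also be written as a lookup table indexed by the size enum; the toy
 * types and names below are placeholders, not part of this codebase.
 */
#include <assert.h>

typedef enum { TOY_TX_4X4, TOY_TX_8X8, TOY_TX_16X16, TOY_TX_SIZES } ToyTxSize;
typedef void (*ToyInvTxfmFn)(const int *coeff, unsigned char *dst, int stride);

static void toy_add_4x4(const int *c, unsigned char *d, int s)   { (void)c; (void)d; (void)s; }
static void toy_add_8x8(const int *c, unsigned char *d, int s)   { (void)c; (void)d; (void)s; }
static void toy_add_16x16(const int *c, unsigned char *d, int s) { (void)c; (void)d; (void)s; }

/* One table entry per transform size replaces the switch. */
static const ToyInvTxfmFn toy_inv_txfm_table[TOY_TX_SIZES] = {
    toy_add_4x4, toy_add_8x8, toy_add_16x16,
};

static void toy_inv_txfm_add(ToyTxSize tx_size, const int *coeff,
                             unsigned char *dst, int stride) {
    assert(tx_size < TOY_TX_SIZES && "Invalid transform size");
    toy_inv_txfm_table[tx_size](coeff, dst, stride);
}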
    8601             : 
    8602           0 : void eb_av1_inv_txfm_add_c(const TranLow *dqcoeff,
    8603             :     uint8_t *dst_r, int32_t stride_r,
    8604             :     uint8_t *dst_w, int32_t stride_w,
    8605             :     const TxfmParam *txfm_param) {
    8606           0 :     const TxSize tx_size = txfm_param->tx_size;
    8607             :     DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
    8608           0 :     int32_t tmp_stride = MAX_TX_SIZE;
    8609           0 :     int32_t w = tx_size_wide[tx_size];
    8610           0 :     int32_t h = tx_size_high[tx_size];
    8611           0 :     for (int32_t r = 0; r < h; ++r) {
    8612           0 :         for (int32_t c = 0; c < w; ++c)
    8613           0 :             tmp[r * tmp_stride + c] = dst_r[r * stride_r + c];
    8614             :     }
    8615             : 
    8616           0 :     highbd_inv_txfm_add(dqcoeff,
    8617           0 :         CONVERT_TO_BYTEPTR(tmp), tmp_stride,
    8618           0 :         CONVERT_TO_BYTEPTR(tmp), tmp_stride,
    8619             :         txfm_param);
    8620             : 
    8621           0 :     for (int32_t r = 0; r < h; ++r) {
    8622           0 :         for (int32_t c = 0; c < w; ++c)
    8623           0 :             dst_w[r * stride_w + c] = (uint8_t)tmp[r * tmp_stride + c];
    8624             :     }
    8625           0 : }
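/*
 * Editorial note: eb_av1_inv_txfm_add_c runs the high-bit-depth kernel on an
 * 8-bit surface by staging the pixels in an aligned uint16_t scratch buffer
 * and handing it to the kernel through CONVERT_TO_BYTEPTR / CONVERT_TO_SHORTPTR.
 * Minimal sketch of that round trip, assuming those macros are plain pointer
 * reinterpretations (as in libaom builds without a pointer offset); the TOY_*
 * names are placeholders.
 */
#include <stdint.h>

#define TOY_TO_BYTEPTR(p)  ((uint8_t *)(p))   /* opaque handle for the uint8_t* API */
#define TOY_TO_SHORTPTR(p) ((uint16_t *)(p))  /* recover the 16-bit pixel view      */

static void toy_highbd_passthrough(uint16_t *pixels16, int n) {
    uint8_t  *opaque = TOY_TO_BYTEPTR(pixels16);
    uint16_t *view   = TOY_TO_SHORTPTR(opaque);
    for (int i = 0; i < n; ++i)
        view[i] &= 0x3FF;                     /* e.g. keep a 10-bit sample range   */
}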
    8626             : 
    8627           0 : EbErrorType av1_inv_transform_recon(
    8628             :     int32_t      *coeff_buffer,//1D buffer
    8629             :     uint8_t      *recon_buffer_r,
    8630             :     uint32_t      recon_stride_r,
    8631             :     uint8_t      *recon_buffer_w,
    8632             :     uint32_t      recon_stride_w,
    8633             :     TxSize        txsize,
    8634             :     uint32_t      bit_increment,
    8635             :     TxType        transform_type,
    8636             :     PlaneType     component_type,
    8637             :     uint32_t      eob,
    8638             :     uint8_t       lossless)
    8639             : {
    8640             :     UNUSED(component_type);
    8641           0 :     EbErrorType return_error = EB_ErrorNone;
    8642             :     TxfmParam txfm_param;
    8643           0 :     txfm_param.tx_type = transform_type;
    8644           0 :     txfm_param.tx_size = txsize;
    8645           0 :     txfm_param.eob = eob;
    8646           0 :     txfm_param.lossless = lossless;
    8647           0 :     txfm_param.bd = bit_increment + EB_8BIT;
    8648           0 :     txfm_param.is_hbd = 1;
    8649             :     //TxfmParam.tx_set_type = av1_get_ext_tx_set_type(   txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
    8650             : 
    8651           0 :     if (recon_buffer_r != recon_buffer_w) {
    8652             :         /* When the read and write output pointers differ, the kernel
    8653             :          * also copies the whole block from the read to the write buffer,
    8654             :          * so processing cannot be limited by the end-of-block (EOB) count. */
    8655           0 :         txfm_param.eob = av1_get_max_eob(txsize);
    8656             :     }
    8657             : 
    8658           0 :     highbd_inv_txfm_add((const TranLow *)coeff_buffer,
    8659             :         recon_buffer_r, recon_stride_r,
    8660             :         recon_buffer_w, recon_stride_w,
    8661             :         &txfm_param);
    8662             : 
    8663           0 :     return return_error;
    8664             : }
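/*
 * Editorial sketch: the comment inside av1_inv_transform_recon (and its 8-bit
 * variant below) explains that when the read and write reconstruction buffers
 * differ, the kernel must also carry the untouched pixels across, so eob is
 * raised to av1_get_max_eob(txsize) and the whole block is processed. The toy
 * routine below illustrates why: every destination pixel must be written, not
 * only those up to the last non-zero coefficient. Types and names are
 * placeholders, not this codebase's API.
 */
#include <stdint.h>

static void toy_recon_add(const int32_t *residual,   /* full w*h, zero past eob */
                          const uint8_t *read_buf, int read_stride,
                          uint8_t *write_buf, int write_stride,
                          int w, int h) {
    for (int r = 0; r < h; ++r)
        for (int c = 0; c < w; ++c) {
            const int v = read_buf[r * read_stride + c] + residual[r * w + c];
            /* Stopping early at eob would leave stale pixels in write_buf
             * wherever write_buf != read_buf. */
            write_buf[r * write_stride + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
}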
    8665             : 
    8666    44090500 : EbErrorType av1_inv_transform_recon8bit(
    8667             :     int32_t       *coeff_buffer,//1D buffer
    8668             :     uint8_t       *recon_buffer_r,
    8669             :     uint32_t       recon_stride_r,
    8670             :     uint8_t       *recon_buffer_w,
    8671             :     uint32_t       recon_stride_w,
    8672             :     TxSize         txsize,
    8673             :     TxType         transform_type,
    8674             :     PlaneType     component_type,
    8675             :     uint32_t       eob,
    8676             :     uint8_t        lossless
    8677             : )
    8678             : {
    8679             :     UNUSED(component_type);
    8680    44090500 :     EbErrorType return_error = EB_ErrorNone;
    8681             :     TxfmParam txfm_param;
    8682    44090500 :     txfm_param.tx_type = transform_type;
    8683    44090500 :     txfm_param.tx_size = txsize;
    8684    44090500 :     txfm_param.eob = eob;
    8685    44090500 :     txfm_param.lossless = lossless;
    8686    44090500 :     txfm_param.bd = 8;
    8687    44090500 :     txfm_param.is_hbd = 1;
    8688             :     //TxfmParam.tx_set_type = av1_get_ext_tx_set_type(   txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
    8689             : 
    8690    44090500 :     if (recon_buffer_r != recon_buffer_w) {
    8691             :         /* When the read and write output pointers differ, the kernel
    8692             :          * also copies the whole block from the read to the write buffer,
    8693             :          * so processing cannot be limited by the end-of-block (EOB) count. */
    8694    44056800 :         txfm_param.eob = av1_get_max_eob(txsize);
    8695             :     }
    8696             : 
    8697    44099300 :     eb_av1_inv_txfm_add((const TranLow *)coeff_buffer,
    8698             :         recon_buffer_r, recon_stride_r,
    8699             :         recon_buffer_w, recon_stride_w,
    8700             :         &txfm_param);
    8701             : 
    8702    44078500 :     return return_error;
    8703             : }
    8704             : 
    8705             : /*********************************************************************
    8706             :  * Map Chroma QP
    8707             :  *********************************************************************/
    8708           0 : uint8_t map_chroma_qp(
    8709             :     uint8_t  qp)
    8710             : {
    8711           0 :     return qp;
    8712             : }
    8713             : 
    8714           0 : uint8_t ConstructPmTransCoeffShapingKnob(const uint16_t *masking_matrix, uint8_t txb_size) // M_Processing is a function of type uint16_t
    8715             : {
    8716           0 :     uint8_t  stride = txb_size;
    8717           0 :     uint8_t  strideN2 = stride >> 1;
    8718           0 :     uint8_t  strideN4 = stride >> 2;
    8719             : 
    8720             :     uint16_t index, row_index, columnIndex;
    8721           0 :     uint64_t h1 = 0, h2 = 0, h3 = 0, q1 = 0, q2 = 0, q3 = 0, dc = 0;
    8722             : 
    8723           0 :     for (index = 0; index < txb_size*txb_size; index++)
    8724             :     {
    8725           0 :         row_index = index / stride;
    8726           0 :         columnIndex = index % stride;
    8727           0 :         if ((columnIndex >= strideN2) && (row_index < strideN2))
    8728           0 :             h1 += masking_matrix[index];
    8729           0 :         else if ((row_index >= strideN2) && (columnIndex < strideN2))
    8730           0 :             h2 += masking_matrix[index];
    8731           0 :         else if ((row_index > strideN2) && (columnIndex > strideN2))
    8732           0 :             h3 += masking_matrix[index];
    8733           0 :         else if ((columnIndex >= strideN4) && (row_index < strideN4))
    8734           0 :             q1 += masking_matrix[index];
    8735           0 :         else if ((row_index >= strideN4) && (columnIndex < strideN4))
    8736           0 :             q2 += masking_matrix[index];
    8737           0 :         else if ((row_index > strideN4) && (columnIndex > strideN4))
    8738           0 :             q3 += masking_matrix[index];
    8739           0 :         else if ((row_index != 0) && (columnIndex != 0))
    8740           0 :             dc += masking_matrix[index];
    8741             :     }
    8742             : 
    8743           0 :     if ((h1 == 0) && (h2 == 0) && (h3 == 0)) {
    8744           0 :         if ((q1 == 0) && (q2 == 0) && (q3 == 0))
    8745             :         {
    8746             :             {
    8747             :                 // SHAPE_N4 not supported for TU 4x4
    8748           0 :                 if (txb_size == 4)
    8749           0 :                     return(0);
    8750             :                 // SHAPE_N4 not supported for TU 8x8
    8751           0 :                 else if (txb_size == 8)
    8752           0 :                     return(1);
    8753             :                 else
    8754           0 :                     return(2);
    8755             :             }
    8756             :         }
    8757             :         else {
    8758             :             // SHAPE_N2 not supported for TU 4x4
    8759           0 :             if (txb_size == 4)
    8760           0 :                 return(0);
    8761             :             else
    8762           0 :                 return(1);
    8763             :         }
    8764             :     }
    8765             :     else
    8766           0 :         return(0);
    8767             : }
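/*
 * Editorial note: the knob computed above appears to classify the masking
 * matrix by where its non-zero weights live: 0 when weights exist outside the
 * top-left half (keep the full block), 1 when they are confined to the
 * top-left N/2 x N/2 region, and 2 when they are confined to the top-left
 * N/4 x N/4 region, subject to the minimum-TU restrictions noted in the code.
 * Hedged sketch of applying such a knob by zeroing coefficients outside the
 * retained region; the interpretation and the names are assumptions, not
 * taken from this file.
 */
#include <stdint.h>

static void toy_apply_shaping(int32_t *coeff, int txb_size, uint8_t knob) {
    const int keep = (knob == 2) ? txb_size / 4
                   : (knob == 1) ? txb_size / 2
                                 : txb_size;      /* knob == 0: keep everything */
    for (int r = 0; r < txb_size; ++r)
        for (int c = 0; c < txb_size; ++c)
            if (r >= keep || c >= keep)
                coeff[r * txb_size + c] = 0;      /* drop coefficients outside the kept corner */
}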
    8768           0 : void construct_pm_trans_coeff_shaping(
    8769             :     SequenceControlSet  *sequence_control_set_ptr)
    8770             : {
    8771             :     uint8_t resolutionIndex;                     // 4K or 1080p Index
    8772             :     uint8_t levelIndex;                        // PM Level Index
    8773             :     uint8_t tuSizeIndex;                         // TU Size Index
    8774           0 :     uint8_t arrayLength[4] = { 4, 8, 16, 32 }; // TU Size LUT
    8775             : 
    8776           0 :     for (resolutionIndex = 0; resolutionIndex < 2; resolutionIndex++) {
    8777           0 :         for (levelIndex = 0; levelIndex < 8; levelIndex++) {
    8778           0 :             for (tuSizeIndex = 0; tuSizeIndex < 4; tuSizeIndex++)
    8779           0 :                 sequence_control_set_ptr->trans_coeff_shape_array[resolutionIndex][levelIndex][tuSizeIndex] = ConstructPmTransCoeffShapingKnob(masking_matrix[resolutionIndex][levelIndex][tuSizeIndex], arrayLength[tuSizeIndex]);
    8780             :         }
    8781             :     }
    8782           0 : }

Generated by: LCOV version 1.14