LCOV - code coverage report
Current view: top level - Codec - EbProductCodingLoop.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 2303 4587 50.2 %
Date: 2019-11-25 17:12:20 Functions: 68 97 70.1 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : 
       6             : /*
       7             : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       8             : *
       9             : * This source code is subject to the terms of the BSD 2 Clause License and
      10             : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      11             : * was not distributed with this source code in the LICENSE file, you can
      12             : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      13             : * Media Patent License 1.0 was not distributed with this source code in the
      14             : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      15             : */
      16             : 
      17             : #include <stdlib.h>
      18             : 
      19             : #include "EbDefinitions.h"
      20             : #include "EbUtility.h"
      21             : #include "EbTransformUnit.h"
      22             : #include "EbRateDistortionCost.h"
      23             : #include "EbFullLoop.h"
      24             : #include "EbPictureOperators.h"
      25             : 
      26             : #include "EbModeDecisionProcess.h"
      27             : #include "EbComputeSAD.h"
      28             : #include "EbTransforms.h"
      29             : #include "EbMeSadCalculation.h"
      30             : #include "EbMotionEstimation.h"
      31             : #include "EbAvcStyleMcp.h"
      32             : #include "aom_dsp_rtcd.h"
      33             : #include "EbCodingLoop.h"
      34             : 
      35             : #define PREDICTIVE_ME_MAX_MVP_CANIDATES  4 // (sic) "CANIDATES"; uses are outside this view, so the spelling is kept
      36             : #define PREDICTIVE_ME_DEVIATION_TH      50
      37             : #define FULL_PEL_REF_WINDOW_WIDTH        7 // presumably refinement search-window sizes per MV precision - uses not visible here; TODO confirm
      38             : #define FULL_PEL_REF_WINDOW_HEIGHT       5
      39             : #define HALF_PEL_REF_WINDOW              3
      40             : #define QUARTER_PEL_REF_WINDOW           3
      41             : #if EIGHT_PEL_PREDICTIVE_ME
      42             : #define EIGHT_PEL_REF_WINDOW          3 // only defined when EIGHT_PEL_PREDICTIVE_ME is enabled
      43             : #endif
      44             : EbErrorType generate_md_stage_0_cand(
      45             :     LargestCodingUnit   *sb_ptr,
      46             :     ModeDecisionContext *context_ptr,
      47             :     SsMeContext         *ss_mecontext,
      48             :     uint32_t            *fast_candidate_total_count,
      49             :     PictureControlSet   *picture_control_set_ptr);
      50             : 
      51             : #if II_COMP_FLAG
      52      811366 : static INLINE int is_interintra_allowed_bsize(const BlockSize bsize) { // Returns 1 when bsize compares within [BLOCK_8X8, BLOCK_32X32], else 0
      53      811366 :     return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32); // relies on the ordering of the BlockSize values (declared outside this view)
      54             : }
      55             : void precompute_intra_pred_for_inter_intra(
      56             :     PictureControlSet            *picture_control_set_ptr,
      57             :     ModeDecisionContext          *context_ptr);
      58             : #endif
      59             : 
      60             : #if PAL_SUP
      61             : int svt_av1_allow_palette(int allow_palette,
      62             :     BlockSize sb_type);
      63             : #endif
      64             : /*******************************************
      65             : * set Penalize Skip Flag
      66             : *
      67             : * Summary: Set the penalize_skipflag to true
      68             : * When there is luminance/chrominance change
      69             : * or in noisy clip with low motion at medium
      70             : * variance area
      71             : *
      72             : *******************************************/
      73             : 
      74             : const EbPredictionFunc  ProductPredictionFunTable[3] = { NULL, inter_pu_prediction_av1, eb_av1_intra_prediction_cl}; // same slot layout as the cost tables below: [0] NULL, [1] inter, [2] intra
      75             : // The three tables share one index; presumably it is the candidate type (INTER = 1, INTRA = 2) - the indexing code is outside this view, TODO confirm
      76             : const EbFastCostFunc   Av1ProductFastCostFuncTable[3] =
      77             : {
      78             :     NULL, /* slot 0 has no handler */
      79             :     av1_inter_fast_cost, /*INTER */
      80             :     av1_intra_fast_cost /*INTRA */
      81             : };
      82             : 
      83             : const EbAv1FullCostFunc   Av1ProductFullCostFuncTable[3] =
      84             : {
      85             :     NULL, /* slot 0 has no handler */
      86             :     av1_inter_full_cost, /*INTER */
      87             :     av1_intra_full_cost/*INTRA */
      88             : };
      89             : 
      90             : /***************************************************
      91             : * Update Mode Decision Neighbor Arrays (mode info, DC sign levels, transform sizes, recon samples)
      92             : ***************************************************/
      93     1217070 : void mode_decision_update_neighbor_arrays( // Writes the current block's decided mode data, DC sign levels, transform dims and recon edges into the MD neighbor arrays
      94             :     PictureControlSet     *picture_control_set_ptr,
      95             :     ModeDecisionContext   *context_ptr,
      96             :     uint32_t               index_mds, // index into context_ptr->md_cu_arr_nsq; used below to load mv_unit
      97             :     EbBool                 intraMdOpenLoop, // recon-sample arrays are written only when this is EB_FALSE (see the NOTE in the 16-bit branch)
      98             :     EbBool                 intra4x4Selected){ // unused; explicitly voided below
      99     1217070 :     uint32_t  bwdith = context_ptr->blk_geom->bwidth; // block width (local name is misspelled "bwdith")
     100     1217070 :     uint32_t  bheight = context_ptr->blk_geom->bheight;
     101             : 
     102     1217070 :     uint32_t                   origin_x = context_ptr->cu_origin_x;
     103     1217070 :     uint32_t                   origin_y = context_ptr->cu_origin_y;
     104             :     (void)intra4x4Selected;
     105             :     // UV-plane position and size: the rounded block origin halved, and the geometry's *_uv dimensions
     106     1217070 :     uint32_t  cu_origin_x_uv = context_ptr->round_origin_x >> 1;
     107     1217070 :     uint32_t  cu_origin_y_uv = context_ptr->round_origin_y >> 1;
     108     1217070 :     uint32_t  bwdith_uv = context_ptr->blk_geom->bwidth_uv;
     109     1217070 :     uint32_t  bwheight_uv = context_ptr->blk_geom->bheight_uv;
     110             : 
     111     1217070 :     uint8_t modeType = context_ptr->cu_ptr->prediction_mode_flag;
     112     1217070 :     uint8_t intra_luma_mode = (uint8_t)context_ptr->cu_ptr->pred_mode;
     113     1217070 :     uint8_t chroma_mode = (uint8_t)context_ptr->cu_ptr->prediction_unit_array->intra_chroma_mode;
     114     1217070 :     uint8_t skip_flag = (uint8_t)context_ptr->cu_ptr->skip_flag;
     115             :     // Load the prediction direction and both reference-list MVs of block index_mds into context_ptr->mv_unit
     116     1217070 :     context_ptr->mv_unit.pred_direction = (uint8_t)(context_ptr->md_cu_arr_nsq[index_mds].prediction_unit_array[0].inter_pred_direction_index);
     117     1217070 :     context_ptr->mv_unit.mv[REF_LIST_0].mv_union = context_ptr->md_cu_arr_nsq[index_mds].prediction_unit_array[0].mv[REF_LIST_0].mv_union;
     118     1217070 :     context_ptr->mv_unit.mv[REF_LIST_1].mv_union = context_ptr->md_cu_arr_nsq[index_mds].prediction_unit_array[0].mv[REF_LIST_1].mv_union;
     119     1217070 :     uint8_t                    inter_pred_direction_index = (uint8_t)context_ptr->cu_ptr->prediction_unit_array->inter_pred_direction_index;
     120     1217070 :     uint8_t                    ref_frame_type = (uint8_t)context_ptr->cu_ptr->prediction_unit_array[0].ref_frame_type;
     121             :     // The interpolation-filter array is written only when the interpolation search is not IT_SEARCH_OFF
     122     1217070 :     if (picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level != IT_SEARCH_OFF)
     123     1132860 :     neighbor_array_unit_mode_write32(
     124             :         context_ptr->interpolation_type_neighbor_array,
     125     1132860 :         context_ptr->cu_ptr->interp_filters,
     126             :         origin_x,
     127             :         origin_y,
     128             :         bwdith,
     129             :         bheight,
     130             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     131             : 
     132             :     {
     133             :         struct PartitionContext partition;
     134     1217040 :         partition.above = partition_context_lookup[context_ptr->blk_geom->bsize].above;
     135     1217040 :         partition.left = partition_context_lookup[context_ptr->blk_geom->bsize].left;
     136             : 
     137     1217040 :         neighbor_array_unit_mode_write(
     138             :             context_ptr->leaf_partition_neighbor_array,
     139             :             (uint8_t*)(&partition), // NaderM
     140             :             origin_x,
     141             :             origin_y,
     142             :             bwdith,
     143             :             bheight,
     144             :             NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     145             : 
     146             :         // Mode Type Update
     147     1217130 :         neighbor_array_unit_mode_write(
     148             :             context_ptr->mode_type_neighbor_array,
     149             :             &modeType,
     150             :             origin_x,
     151             :             origin_y,
     152             :             bwdith,
     153             :             bheight,
     154             :             NEIGHBOR_ARRAY_UNIT_FULL_MASK);
     155     1217140 :         if (picture_control_set_ptr->parent_pcs_ptr->skip_sub_blks)
     156             :         // Leaf depth update (stores bsize); this single call is the entire body of the brace-less if above
     157           0 :         neighbor_array_unit_mode_write(
     158             :             context_ptr->leaf_depth_neighbor_array,
     159           0 :             (uint8_t*)&context_ptr->blk_geom->bsize,//(uint8_t*)luma_mode,
     160             :             origin_x,
     161             :             origin_y,
     162             :             bwdith,
     163             :             bheight,
     164             :             NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     165             :         // Intra Luma Mode Update
     166     1217140 :         neighbor_array_unit_mode_write(
     167             :             context_ptr->intra_luma_mode_neighbor_array,
     168             :             &intra_luma_mode,//(uint8_t*)luma_mode,
     169             :             origin_x,
     170             :             origin_y,
     171             :             bwdith,
     172             :             bheight,
     173             :             NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     174             :         // Luma DC sign level: one write per transform block at the block's selected tx_depth
     175     1217130 :         uint16_t txb_count = context_ptr->blk_geom->txb_count[context_ptr->cu_ptr->tx_depth];
     176     2490580 :         for (uint8_t txb_itr = 0; txb_itr < txb_count; txb_itr++)
     177             :         {
     178     1273460 :             uint8_t dc_sign_level_coeff = (int32_t)context_ptr->cu_ptr->quantized_dc[0][txb_itr]; // NOTE(review): cast to int32_t, then narrowed to uint8_t by the assignment
     179             : 
     180     1273460 :             neighbor_array_unit_mode_write(
     181             :                 context_ptr->luma_dc_sign_level_coeff_neighbor_array,
     182             :                 (uint8_t*)&dc_sign_level_coeff,
     183     1273460 :                 context_ptr->sb_origin_x + context_ptr->blk_geom->tx_org_x[context_ptr->cu_ptr->tx_depth][txb_itr],
     184     1273460 :                 context_ptr->sb_origin_y + context_ptr->blk_geom->tx_org_y[context_ptr->cu_ptr->tx_depth][txb_itr],
     185     1273460 :                 context_ptr->blk_geom->tx_width[context_ptr->cu_ptr->tx_depth][txb_itr],
     186     1273460 :                 context_ptr->blk_geom->tx_height[context_ptr->cu_ptr->tx_depth][txb_itr],
     187             :                 NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     188             :             // The same value is also written to the tx-depth-1 array at the same position and size
     189     1273450 :             neighbor_array_unit_mode_write(
     190             :                 picture_control_set_ptr->md_tx_depth_1_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
     191             :                 (uint8_t*)&dc_sign_level_coeff,
     192     1273450 :                 context_ptr->sb_origin_x + context_ptr->blk_geom->tx_org_x[context_ptr->cu_ptr->tx_depth][txb_itr],
     193     1273450 :                 context_ptr->sb_origin_y + context_ptr->blk_geom->tx_org_y[context_ptr->cu_ptr->tx_depth][txb_itr],
     194     1273450 :                 context_ptr->blk_geom->tx_width[context_ptr->cu_ptr->tx_depth][txb_itr],
     195     1273450 :                 context_ptr->blk_geom->tx_height[context_ptr->cu_ptr->tx_depth][txb_itr],
     196             :                 NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     197             :         }
     198             :     }
     199             : 
     200             :     // Hsan: chroma mode rate estimation is kept even for chroma blind
     201     1217120 :     if (context_ptr->blk_geom->has_uv) {
     202             :         // Intra Chroma Mode Update
     203      818364 :         neighbor_array_unit_mode_write(
     204             :             context_ptr->intra_chroma_mode_neighbor_array,
     205             :             &chroma_mode,
     206             :             cu_origin_x_uv,
     207             :             cu_origin_y_uv,
     208             :             bwdith_uv,
     209             :             bwheight_uv,
     210             :             NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     211             :     }
     212             : 
     213     1217140 :     neighbor_array_unit_mode_write(
     214             :         context_ptr->skip_flag_neighbor_array,
     215             :         &skip_flag,
     216             :         origin_x,
     217             :         origin_y,
     218             :         bwdith,
     219             :         bheight,
     220             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     221             : 
     222     1217150 :     if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
     223             :         // Update the Cb DC sign level context (only the DC value is written here, from quantized_dc[1][0])
     224             :         {
     225      734071 :             uint8_t dc_sign_level_coeff = (int32_t)context_ptr->cu_ptr->quantized_dc[1][0];
     226      734071 :             neighbor_array_unit_mode_write(
     227             :                 context_ptr->cb_dc_sign_level_coeff_neighbor_array,
     228             :                 (uint8_t*)&dc_sign_level_coeff,
     229             :                 cu_origin_x_uv,
     230             :                 cu_origin_y_uv,
     231             :                 bwdith_uv,
     232             :                 bwheight_uv,
     233             :                 NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     234             :         }
     235             : 
     236             :         // Update the Cr DC sign level context (only the DC value is written here, from quantized_dc[2][0])
     237             :         {
     238      734067 :             uint8_t dc_sign_level_coeff = (int32_t)context_ptr->cu_ptr->quantized_dc[2][0];
     239      734067 :             neighbor_array_unit_mode_write(
     240             :                 context_ptr->cr_dc_sign_level_coeff_neighbor_array,
     241             :                 (uint8_t*)&dc_sign_level_coeff,
     242             :                 cu_origin_x_uv,
     243             :                 cu_origin_y_uv,
     244             :                 bwdith_uv,
     245             :                 bwheight_uv,
     246             :                 NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     247             :         }
     248             :     }
     249             : #if ENHANCE_ATB
     250     1217140 :     uint8_t tx_size = tx_depth_to_tx_size[context_ptr->cu_ptr->tx_depth][context_ptr->blk_geom->bsize];
     251     1217140 :     uint8_t bw = tx_size_wide[tx_size];
     252     1217140 :     uint8_t bh = tx_size_high[tx_size];
     253             :     // Same array, two writes: the TOP mask receives the transform width, the LEFT mask the transform height
     254     1217140 :     neighbor_array_unit_mode_write(
     255             :         context_ptr->txfm_context_array,
     256             :         &bw,
     257             :         origin_x,
     258             :         origin_y,
     259             :         bwdith,
     260             :         bheight,
     261             :         NEIGHBOR_ARRAY_UNIT_TOP_MASK);
     262             : 
     263     1217170 :     neighbor_array_unit_mode_write(
     264             :         context_ptr->txfm_context_array,
     265             :         &bh,
     266             :         origin_x,
     267             :         origin_y,
     268             :         bwdith,
     269             :         bheight,
     270             :         NEIGHBOR_ARRAY_UNIT_LEFT_MASK);
     271             : #else
     272             :     neighbor_array_unit_mode_write(
     273             :         context_ptr->txfm_context_array,
     274             :         &context_ptr->cu_ptr->tx_depth,
     275             :         origin_x,
     276             :         origin_y,
     277             :         bwdith,
     278             :         bheight,
     279             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     280             : #endif
     281             : 
     282             :     // Update the Inter Pred Type Neighbor Array
     283             : 
     284     1217150 :     neighbor_array_unit_mode_write(
     285             :         context_ptr->inter_pred_dir_neighbor_array,
     286             :         &inter_pred_direction_index,
     287             :         origin_x,
     288             :         origin_y,
     289             :         bwdith,
     290             :         bheight,
     291             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     292             : 
     293             :     // Update the refFrame Type Neighbor Array
     294     1217140 :     neighbor_array_unit_mode_write(
     295             :         context_ptr->ref_frame_type_neighbor_array,
     296             :         &ref_frame_type,
     297             :         origin_x,
     298             :         origin_y,
     299             :         bwdith,
     300             :         bheight,
     301             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     302             :     // Recon edge samples: 8-bit arrays when hbd_mode_decision is 0, the *16bit arrays otherwise
     303     1217160 :     if (!context_ptr->hbd_mode_decision) {
     304     1217160 :         if (intraMdOpenLoop == EB_FALSE)
     305             :         {
     306     1217160 :             update_recon_neighbor_array(
     307             :                 context_ptr->luma_recon_neighbor_array,
     308     1217160 :                 context_ptr->cu_ptr->neigh_top_recon[0],
     309     1217160 :                 context_ptr->cu_ptr->neigh_left_recon[0],
     310             :                 origin_x,
     311             :                 origin_y,
     312     1217160 :                 context_ptr->blk_geom->bwidth,
     313     1217160 :                 context_ptr->blk_geom->bheight);
     314     1217200 :             if (picture_control_set_ptr->parent_pcs_ptr->atb_mode) {
     315      159544 :                 update_recon_neighbor_array(
     316             :                     picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
     317      159544 :                     context_ptr->cu_ptr->neigh_top_recon[0],
     318      159544 :                     context_ptr->cu_ptr->neigh_left_recon[0],
     319             :                     origin_x,
     320             :                     origin_y,
     321      159544 :                     context_ptr->blk_geom->bwidth,
     322      159544 :                     context_ptr->blk_geom->bheight);
     323             :             }
     324             :         }
     325             :         // Cb/Cr recon edges: same intraMdOpenLoop test again, plus has_uv and chroma_level <= CHROMA_MODE_1
     326     1217200 :         if (intraMdOpenLoop == EB_FALSE) {
     327     1217200 :             if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
     328      734074 :                 update_recon_neighbor_array(
     329             :                     context_ptr->cb_recon_neighbor_array,
     330      734074 :                     context_ptr->cu_ptr->neigh_top_recon[1],
     331      734074 :                     context_ptr->cu_ptr->neigh_left_recon[1],
     332             :                     cu_origin_x_uv,
     333             :                     cu_origin_y_uv,
     334             :                     bwdith_uv,
     335             :                     bwheight_uv);
     336      734067 :                 update_recon_neighbor_array(
     337             :                     context_ptr->cr_recon_neighbor_array,
     338      734067 :                     context_ptr->cu_ptr->neigh_top_recon[2],
     339      734067 :                     context_ptr->cu_ptr->neigh_left_recon[2],
     340             :                     cu_origin_x_uv,
     341             :                     cu_origin_y_uv,
     342             :                     bwdith_uv,
     343             :                     bwheight_uv);
     344             :             }
     345             :         }
     346             :     } else {
     347           0 :         if (intraMdOpenLoop == EB_FALSE)
     348           0 :             update_recon_neighbor_array16bit(
     349             :                 context_ptr->luma_recon_neighbor_array16bit,
     350           0 :                 context_ptr->cu_ptr->neigh_top_recon_16bit[0],
     351           0 :                 context_ptr->cu_ptr->neigh_left_recon_16bit[0],
     352             :                 origin_x,
     353             :                 origin_y,
     354           0 :                 context_ptr->blk_geom->bwidth,
     355           0 :                 context_ptr->blk_geom->bheight);
     356             :         // NOTE(review): the 8-bit branch does this atb_mode update only inside its intraMdOpenLoop == EB_FALSE block; here it is not gated - confirm intended (branch has 0 hits in this report)
     357           0 :         if (picture_control_set_ptr->parent_pcs_ptr->atb_mode) {
     358           0 :             update_recon_neighbor_array16bit(
     359             :                 picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array16bit[MD_NEIGHBOR_ARRAY_INDEX],
     360           0 :                 context_ptr->cu_ptr->neigh_top_recon_16bit[0],
     361           0 :                 context_ptr->cu_ptr->neigh_left_recon_16bit[0],
     362             :                 origin_x,
     363             :                 origin_y,
     364           0 :                 context_ptr->blk_geom->bwidth,
     365           0 :                 context_ptr->blk_geom->bheight);
     366             :         }
     367             : 
     368           0 :         if (intraMdOpenLoop == EB_FALSE &&
     369           0 :             context_ptr->blk_geom->has_uv &&
     370           0 :             context_ptr->chroma_level <= CHROMA_MODE_1)
     371             :         {
     372           0 :             update_recon_neighbor_array16bit(
     373             :                 context_ptr->cb_recon_neighbor_array16bit,
     374           0 :                 context_ptr->cu_ptr->neigh_top_recon_16bit[1],
     375           0 :                 context_ptr->cu_ptr->neigh_left_recon_16bit[1],
     376             :                 cu_origin_x_uv,
     377             :                 cu_origin_y_uv,
     378             :                 bwdith_uv,
     379             :                 bwheight_uv);
     380           0 :             update_recon_neighbor_array16bit(
     381             :                 context_ptr->cr_recon_neighbor_array16bit,
     382           0 :                 context_ptr->cu_ptr->neigh_top_recon_16bit[2],
     383           0 :                 context_ptr->cu_ptr->neigh_left_recon_16bit[2],
     384             :                 cu_origin_x_uv,
     385             :                 cu_origin_y_uv,
     386             :                 bwdith_uv,
     387             :                 bwheight_uv);
     388             :         }
     389             :     }
     390             : 
     391     1217200 :     return;
     392             : }
     393             : 
     394      629717 : void copy_neighbour_arrays(
     395             :     PictureControlSet                *picture_control_set_ptr,
     396             :     ModeDecisionContext               *context_ptr,
     397             :     uint32_t                            src_idx,
     398             :     uint32_t                            dst_idx,
     399             :     uint32_t                            blk_mds,
     400             :     uint32_t                            sb_org_x,
     401             :     uint32_t                            sb_org_y)
     402             : {
     403             :     (void)*context_ptr;
     404             : 
     405      629717 :     const BlockGeom * blk_geom = get_blk_geom_mds(blk_mds);
     406             : 
     407      629720 :     uint32_t                            blk_org_x = sb_org_x + blk_geom->origin_x;
     408      629720 :     uint32_t                            blk_org_y = sb_org_y + blk_geom->origin_y;
     409      629720 :     uint32_t                            blk_org_x_uv = (blk_org_x >> 3 << 3) >> 1;
     410      629720 :     uint32_t                            blk_org_y_uv = (blk_org_y >> 3 << 3) >> 1;
     411      629720 :     uint32_t                            bwidth_uv = blk_geom->bwidth_uv;
     412      629720 :     uint32_t                            bheight_uv = blk_geom->bheight_uv;
     413             : 
     414      629720 :     copy_neigh_arr(
     415             :         picture_control_set_ptr->md_intra_luma_mode_neighbor_array[src_idx],
     416             :         picture_control_set_ptr->md_intra_luma_mode_neighbor_array[dst_idx],
     417             :         blk_org_x,
     418             :         blk_org_y,
     419      629720 :         blk_geom->bwidth,
     420      629720 :         blk_geom->bheight,
     421             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     422             : 
     423             :     //neighbor_array_unit_reset(picture_control_set_ptr->md_intra_chroma_mode_neighbor_array[depth]);
     424      629742 :     copy_neigh_arr(
     425             :         picture_control_set_ptr->md_intra_chroma_mode_neighbor_array[src_idx],
     426             :         picture_control_set_ptr->md_intra_chroma_mode_neighbor_array[dst_idx],
     427             :         blk_org_x_uv,
     428             :         blk_org_y_uv,
     429             :         bwidth_uv,
     430             :         bheight_uv,
     431             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     432             : 
     433             :     //neighbor_array_unit_reset(picture_control_set_ptr->md_skip_flag_neighbor_array[depth]);
     434      629727 :     copy_neigh_arr(
     435             :         picture_control_set_ptr->md_skip_flag_neighbor_array[src_idx],
     436             :         picture_control_set_ptr->md_skip_flag_neighbor_array[dst_idx],
     437             :         blk_org_x,
     438             :         blk_org_y,
     439      629727 :         blk_geom->bwidth,
     440      629727 :         blk_geom->bheight,
     441             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     442             : 
     443             :     //neighbor_array_unit_reset(picture_control_set_ptr->md_mode_type_neighbor_array[depth]);
     444      629735 :     copy_neigh_arr(
     445             :         picture_control_set_ptr->md_mode_type_neighbor_array[src_idx],
     446             :         picture_control_set_ptr->md_mode_type_neighbor_array[dst_idx],
     447             :         blk_org_x,
     448             :         blk_org_y,
     449      629735 :         blk_geom->bwidth,
     450      629735 :         blk_geom->bheight,
     451             :         NEIGHBOR_ARRAY_UNIT_FULL_MASK);
     452             : 
     453             :     //neighbor_array_unit_reset(picture_control_set_ptr->md_leaf_depth_neighbor_array[depth]);
     454      629738 :     copy_neigh_arr(
     455             :         picture_control_set_ptr->md_leaf_depth_neighbor_array[src_idx],
     456             :         picture_control_set_ptr->md_leaf_depth_neighbor_array[dst_idx],
     457             :         blk_org_x,
     458             :         blk_org_y,
     459      629738 :         blk_geom->bwidth,
     460      629738 :         blk_geom->bheight,
     461             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     462      629732 :     copy_neigh_arr(
     463             :         picture_control_set_ptr->mdleaf_partition_neighbor_array[src_idx],
     464             :         picture_control_set_ptr->mdleaf_partition_neighbor_array[dst_idx],
     465             :         blk_org_x,
     466             :         blk_org_y,
     467      629732 :         blk_geom->bwidth,
     468      629732 :         blk_geom->bheight,
     469             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     470             : 
     471      629741 :     if (!context_ptr->hbd_mode_decision) {
     472      629741 :         copy_neigh_arr(
     473             :             picture_control_set_ptr->md_luma_recon_neighbor_array[src_idx],
     474             :             picture_control_set_ptr->md_luma_recon_neighbor_array[dst_idx],
     475             :             blk_org_x,
     476             :             blk_org_y,
     477      629741 :             blk_geom->bwidth,
     478      629741 :             blk_geom->bheight,
     479             :             NEIGHBOR_ARRAY_UNIT_FULL_MASK);
     480      629751 :         if (picture_control_set_ptr->parent_pcs_ptr->atb_mode) {
     481       87089 :             copy_neigh_arr(
     482             :                 picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array[src_idx],
     483             :                 picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array[dst_idx],
     484             :                 blk_org_x,
     485             :                 blk_org_y,
     486       87089 :                 blk_geom->bwidth,
     487       87089 :                 blk_geom->bheight,
     488             :                 NEIGHBOR_ARRAY_UNIT_FULL_MASK);
     489             :         }
     490      629751 :         if (blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
     491      629754 :             copy_neigh_arr(
     492             :                 picture_control_set_ptr->md_cb_recon_neighbor_array[src_idx],
     493             :                 picture_control_set_ptr->md_cb_recon_neighbor_array[dst_idx],
     494             :                 blk_org_x_uv,
     495             :                 blk_org_y_uv,
     496             :                 bwidth_uv,
     497             :                 bheight_uv,
     498             :                 NEIGHBOR_ARRAY_UNIT_FULL_MASK);
     499             : 
     500      629749 :             copy_neigh_arr(
     501             :                 picture_control_set_ptr->md_cr_recon_neighbor_array[src_idx],
     502             :                 picture_control_set_ptr->md_cr_recon_neighbor_array[dst_idx],
     503             :                 blk_org_x_uv,
     504             :                 blk_org_y_uv,
     505             :                 bwidth_uv,
     506             :                 bheight_uv,
     507             :                 NEIGHBOR_ARRAY_UNIT_FULL_MASK);
     508             :         }
     509             :     } else {
     510           0 :         copy_neigh_arr(
     511             :             picture_control_set_ptr->md_luma_recon_neighbor_array16bit[src_idx],
     512             :             picture_control_set_ptr->md_luma_recon_neighbor_array16bit[dst_idx],
     513             :             blk_org_x,
     514             :             blk_org_y,
     515           0 :             blk_geom->bwidth,
     516           0 :             blk_geom->bheight,
     517             :             NEIGHBOR_ARRAY_UNIT_FULL_MASK);
     518             : 
     519           0 :         if (picture_control_set_ptr->parent_pcs_ptr->atb_mode) {
     520           0 :             copy_neigh_arr(
     521             :                 picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array16bit[src_idx],
     522             :                 picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array16bit[dst_idx],
     523             :                 blk_org_x,
     524             :                 blk_org_y,
     525           0 :                 blk_geom->bwidth,
     526           0 :                 blk_geom->bheight,
     527             :                 NEIGHBOR_ARRAY_UNIT_FULL_MASK);
     528             :         }
     529             : 
     530           0 :         if (blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
     531           0 :             copy_neigh_arr(
     532             :                 picture_control_set_ptr->md_cb_recon_neighbor_array16bit[src_idx],
     533             :                 picture_control_set_ptr->md_cb_recon_neighbor_array16bit[dst_idx],
     534             :                 blk_org_x_uv,
     535             :                 blk_org_y_uv,
     536             :                 bwidth_uv,
     537             :                 bheight_uv,
     538             :                 NEIGHBOR_ARRAY_UNIT_FULL_MASK);
     539             : 
     540           0 :             copy_neigh_arr(
     541             :                 picture_control_set_ptr->md_cr_recon_neighbor_array16bit[src_idx],
     542             :                 picture_control_set_ptr->md_cr_recon_neighbor_array16bit[dst_idx],
     543             :                 blk_org_x_uv,
     544             :                 blk_org_y_uv,
     545             :                 bwidth_uv,
     546             :                 bheight_uv,
     547             :                 NEIGHBOR_ARRAY_UNIT_FULL_MASK);
     548             :         }
     549             :     }
     550             : 
     551             :     //neighbor_array_unit_reset(picture_control_set_ptr->md_skip_coeff_neighbor_array[depth]);
     552      629743 :     copy_neigh_arr(
     553             :         picture_control_set_ptr->md_skip_coeff_neighbor_array[src_idx],
     554             :         picture_control_set_ptr->md_skip_coeff_neighbor_array[dst_idx],
     555             :         blk_org_x,
     556             :         blk_org_y,
     557      629743 :         blk_geom->bwidth,
     558      629743 :         blk_geom->bheight,
     559             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     560             :     //neighbor_array_unit_reset(picture_control_set_ptr->md_luma_dc_sign_level_coeff_neighbor_array[depth]);
     561      629711 :     copy_neigh_arr(
     562             :         picture_control_set_ptr->md_luma_dc_sign_level_coeff_neighbor_array[src_idx],
     563             :         picture_control_set_ptr->md_luma_dc_sign_level_coeff_neighbor_array[dst_idx],
     564             :         blk_org_x,
     565             :         blk_org_y,
     566      629711 :         blk_geom->bwidth,
     567      629711 :         blk_geom->bheight,
     568             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     569             : 
     570      629738 :     copy_neigh_arr(
     571             :         picture_control_set_ptr->md_tx_depth_1_luma_dc_sign_level_coeff_neighbor_array[src_idx],
     572             :         picture_control_set_ptr->md_tx_depth_1_luma_dc_sign_level_coeff_neighbor_array[dst_idx],
     573             :         blk_org_x,
     574             :         blk_org_y,
     575      629738 :         blk_geom->bwidth,
     576      629738 :         blk_geom->bheight,
     577             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     578             : 
     579      629738 :     if (blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
     580      629742 :         copy_neigh_arr(
     581             :             picture_control_set_ptr->md_cb_dc_sign_level_coeff_neighbor_array[src_idx],
     582             :             picture_control_set_ptr->md_cb_dc_sign_level_coeff_neighbor_array[dst_idx],
     583             :             blk_org_x_uv,
     584             :             blk_org_y_uv,
     585             :             bwidth_uv,
     586             :             bheight_uv,
     587             :             NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     588             :         //neighbor_array_unit_reset(picture_control_set_ptr->md_cr_dc_sign_level_coeff_neighbor_array[depth]);
     589             : 
     590      629722 :         copy_neigh_arr(
     591             :             picture_control_set_ptr->md_cr_dc_sign_level_coeff_neighbor_array[src_idx],
     592             :             picture_control_set_ptr->md_cr_dc_sign_level_coeff_neighbor_array[dst_idx],
     593             :             blk_org_x_uv,
     594             :             blk_org_y_uv,
     595             :             bwidth_uv,
     596             :             bheight_uv,
     597             :             NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     598             :     }
     599             : 
     600             :     //neighbor_array_unit_reset(picture_control_set_ptr->md_txfm_context_array[depth]);
     601      629711 :     copy_neigh_arr(
     602             :         picture_control_set_ptr->md_txfm_context_array[src_idx],
     603             :         picture_control_set_ptr->md_txfm_context_array[dst_idx],
     604             :         blk_org_x,
     605             :         blk_org_y,
     606      629711 :         blk_geom->bwidth,
     607      629711 :         blk_geom->bheight,
     608             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     609             :     //neighbor_array_unit_reset(picture_control_set_ptr->md_inter_pred_dir_neighbor_array[depth]);
     610      629738 :     copy_neigh_arr(
     611             :         picture_control_set_ptr->md_inter_pred_dir_neighbor_array[src_idx],
     612             :         picture_control_set_ptr->md_inter_pred_dir_neighbor_array[dst_idx],
     613             :         blk_org_x,
     614             :         blk_org_y,
     615      629738 :         blk_geom->bwidth,
     616      629738 :         blk_geom->bheight,
     617             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     618             :     //neighbor_array_unit_reset(picture_control_set_ptr->md_ref_frame_type_neighbor_array[depth]);
     619      629733 :     copy_neigh_arr(
     620             :         picture_control_set_ptr->md_ref_frame_type_neighbor_array[src_idx],
     621             :         picture_control_set_ptr->md_ref_frame_type_neighbor_array[dst_idx],
     622             :         blk_org_x,
     623             :         blk_org_y,
     624      629733 :         blk_geom->bwidth,
     625      629733 :         blk_geom->bheight,
     626             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     627             : 
     628      629707 :     copy_neigh_arr_32(
     629             :         picture_control_set_ptr->md_interpolation_type_neighbor_array[src_idx],
     630             :         picture_control_set_ptr->md_interpolation_type_neighbor_array[dst_idx],
     631             :         blk_org_x,
     632             :         blk_org_y,
     633      629707 :         blk_geom->bwidth,
     634      629707 :         blk_geom->bheight,
     635             :         NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
     636      629740 : }
     637             : 
     638     1217140 : void md_update_all_neighbour_arrays(
     639             :     PictureControlSet                *picture_control_set_ptr,
     640             :     ModeDecisionContext               *context_ptr,
     641             :     uint32_t                             lastCuIndex_mds,
     642             :     uint32_t                            sb_origin_x,
     643             :     uint32_t                            sb_origin_y)
     644             : {
     645     1217140 :     context_ptr->blk_geom = get_blk_geom_mds(lastCuIndex_mds);
     646     1217130 :     context_ptr->cu_origin_x = sb_origin_x + context_ptr->blk_geom->origin_x;
     647     1217130 :     context_ptr->cu_origin_y = sb_origin_y + context_ptr->blk_geom->origin_y;
     648     1217130 :     context_ptr->round_origin_x = ((context_ptr->cu_origin_x >> 3) << 3);
     649     1217130 :     context_ptr->round_origin_y = ((context_ptr->cu_origin_y >> 3) << 3);
     650             : 
     651     1217130 :     context_ptr->cu_ptr = &context_ptr->md_cu_arr_nsq[lastCuIndex_mds];
     652             : 
     653     1217130 :     mode_decision_update_neighbor_arrays(
     654             :         picture_control_set_ptr,
     655             :         context_ptr,
     656             :         lastCuIndex_mds,
     657     1217130 :         picture_control_set_ptr->intra_md_open_loop_flag,
     658             :         EB_FALSE);
     659             : 
     660     1217190 :     update_mi_map(
     661             :         context_ptr,
     662             :         context_ptr->cu_ptr,
     663     1217190 :         context_ptr->cu_origin_x,
     664     1217190 :         context_ptr->cu_origin_y,
     665             :         context_ptr->blk_geom,
     666             :         0,
     667             :         picture_control_set_ptr);
     668     1217180 : }
     669             : 
     670      312992 : void md_update_all_neighbour_arrays_multiple(
     671             :     PictureControlSet                *picture_control_set_ptr,
     672             :     ModeDecisionContext               *context_ptr,
     673             :     uint32_t                            blk_mds,
     674             :     uint32_t                            sb_origin_x,
     675             :     uint32_t                            sb_origin_y){
     676      312992 :     context_ptr->blk_geom = get_blk_geom_mds(blk_mds);
     677             : 
     678             :     uint32_t blk_it;
     679      632866 :     for (blk_it = 0; blk_it < context_ptr->blk_geom->totns; blk_it++)
     680             :     {
     681      319813 :         md_update_all_neighbour_arrays(
     682             :             picture_control_set_ptr,
     683             :             context_ptr,
     684             :             blk_mds + blk_it,
     685             :             sb_origin_x,
     686             :             sb_origin_y);
     687             :     }
     688      313053 : }
     689             : 
     690             : #define TOTAL_SQ_BLOCK_COUNT 341
     691             : int sq_block_index[TOTAL_SQ_BLOCK_COUNT] = {
     692             :     0,
     693             :     25,
     694             :     50,
     695             :     75,
     696             :     80,
     697             :     81,
     698             :     82,
     699             :     83,
     700             :     84,
     701             :     89,
     702             :     90,
     703             :     91,
     704             :     92,
     705             :     93,
     706             :     98,
     707             :     99,
     708             :     100,
     709             :     101,
     710             :     102,
     711             :     107,
     712             :     108,
     713             :     109,
     714             :     110,
     715             :     111,
     716             :     136,
     717             :     141,
     718             :     142,
     719             :     143,
     720             :     144,
     721             :     145,
     722             :     150,
     723             :     151,
     724             :     152,
     725             :     153,
     726             :     154,
     727             :     159,
     728             :     160,
     729             :     161,
     730             :     162,
     731             :     163,
     732             :     168,
     733             :     169,
     734             :     170,
     735             :     171,
     736             :     172,
     737             :     197,
     738             :     202,
     739             :     203,
     740             :     204,
     741             :     205,
     742             :     206,
     743             :     211,
     744             :     212,
     745             :     213,
     746             :     214,
     747             :     215,
     748             :     220,
     749             :     221,
     750             :     222,
     751             :     223,
     752             :     224,
     753             :     229,
     754             :     230,
     755             :     231,
     756             :     232,
     757             :     233,
     758             :     258,
     759             :     263,
     760             :     264,
     761             :     265,
     762             :     266,
     763             :     267,
     764             :     272,
     765             :     273,
     766             :     274,
     767             :     275,
     768             :     276,
     769             :     281,
     770             :     282,
     771             :     283,
     772             :     284,
     773             :     285,
     774             :     290,
     775             :     291,
     776             :     292,
     777             :     293,
     778             :     294,
     779             :     319,
     780             :     344,
     781             :     349,
     782             :     350,
     783             :     351,
     784             :     352,
     785             :     353,
     786             :     358,
     787             :     359,
     788             :     360,
     789             :     361,
     790             :     362,
     791             :     367,
     792             :     368,
     793             :     369,
     794             :     370,
     795             :     371,
     796             :     376,
     797             :     377,
     798             :     378,
     799             :     379,
     800             :     380,
     801             :     405,
     802             :     410,
     803             :     411,
     804             :     412,
     805             :     413,
     806             :     414,
     807             :     419,
     808             :     420,
     809             :     421,
     810             :     422,
     811             :     423,
     812             :     428,
     813             :     429,
     814             :     430,
     815             :     431,
     816             :     432,
     817             :     437,
     818             :     438,
     819             :     439,
     820             :     440,
     821             :     441,
     822             :     466,
     823             :     471,
     824             :     472,
     825             :     473,
     826             :     474,
     827             :     475,
     828             :     480,
     829             :     481,
     830             :     482,
     831             :     483,
     832             :     484,
     833             :     489,
     834             :     490,
     835             :     491,
     836             :     492,
     837             :     493,
     838             :     498,
     839             :     499,
     840             :     500,
     841             :     501,
     842             :     502,
     843             :     527,
     844             :     532,
     845             :     533,
     846             :     534,
     847             :     535,
     848             :     536,
     849             :     541,
     850             :     542,
     851             :     543,
     852             :     544,
     853             :     545,
     854             :     550,
     855             :     551,
     856             :     552,
     857             :     553,
     858             :     554,
     859             :     559,
     860             :     560,
     861             :     561,
     862             :     562,
     863             :     563,
     864             :     588,
     865             :     613,
     866             :     618,
     867             :     619,
     868             :     620,
     869             :     621,
     870             :     622,
     871             :     627,
     872             :     628,
     873             :     629,
     874             :     630,
     875             :     631,
     876             :     636,
     877             :     637,
     878             :     638,
     879             :     639,
     880             :     640,
     881             :     645,
     882             :     646,
     883             :     647,
     884             :     648,
     885             :     649,
     886             :     674,
     887             :     679,
     888             :     680,
     889             :     681,
     890             :     682,
     891             :     683,
     892             :     688,
     893             :     689,
     894             :     690,
     895             :     691,
     896             :     692,
     897             :     697,
     898             :     698,
     899             :     699,
     900             :     700,
     901             :     701,
     902             :     706,
     903             :     707,
     904             :     708,
     905             :     709,
     906             :     710,
     907             :     735,
     908             :     740,
     909             :     741,
     910             :     742,
     911             :     743,
     912             :     744,
     913             :     749,
     914             :     750,
     915             :     751,
     916             :     752,
     917             :     753,
     918             :     758,
     919             :     759,
     920             :     760,
     921             :     761,
     922             :     762,
     923             :     767,
     924             :     768,
     925             :     769,
     926             :     770,
     927             :     771,
     928             :     796,
     929             :     801,
     930             :     802,
     931             :     803,
     932             :     804,
     933             :     805,
     934             :     810,
     935             :     811,
     936             :     812,
     937             :     813,
     938             :     814,
     939             :     819,
     940             :     820,
     941             :     821,
     942             :     822,
     943             :     823,
     944             :     828,
     945             :     829,
     946             :     830,
     947             :     831,
     948             :     832,
     949             :     857,
     950             :     882,
     951             :     887,
     952             :     888,
     953             :     889,
     954             :     890,
     955             :     891,
     956             :     896,
     957             :     897,
     958             :     898,
     959             :     899,
     960             :     900,
     961             :     905,
     962             :     906,
     963             :     907,
     964             :     908,
     965             :     909,
     966             :     914,
     967             :     915,
     968             :     916,
     969             :     917,
     970             :     918,
     971             :     943,
     972             :     948,
     973             :     949,
     974             :     950,
     975             :     951,
     976             :     952,
     977             :     957,
     978             :     958,
     979             :     959,
     980             :     960,
     981             :     961,
     982             :     966,
     983             :     967,
     984             :     968,
     985             :     969,
     986             :     970,
     987             :     975,
     988             :     976,
     989             :     977,
     990             :     978,
     991             :     979,
     992             :     1004,
     993             :     1009,
     994             :     1010,
     995             :     1011,
     996             :     1012,
     997             :     1013,
     998             :     1018,
     999             :     1019,
    1000             :     1020,
    1001             :     1021,
    1002             :     1022,
    1003             :     1027,
    1004             :     1028,
    1005             :     1029,
    1006             :     1030,
    1007             :     1031,
    1008             :     1036,
    1009             :     1037,
    1010             :     1038,
    1011             :     1039,
    1012             :     1040,
    1013             :     1065,
    1014             :     1070,
    1015             :     1071,
    1016             :     1072,
    1017             :     1073,
    1018             :     1074,
    1019             :     1079,
    1020             :     1080,
    1021             :     1081,
    1022             :     1082,
    1023             :     1083,
    1024             :     1088,
    1025             :     1089,
    1026             :     1090,
    1027             :     1091,
    1028             :     1092,
    1029             :     1097,
    1030             :     1098,
    1031             :     1099,
    1032             :     1100
    1033             : };
    1034        3598 : void init_sq_nsq_block(
    1035             :     SequenceControlSet    *sequence_control_set_ptr,
    1036             :     ModeDecisionContext   *context_ptr){
    1037        3598 :     uint32_t blk_idx = 0;
    1038             :     do {
    1039     3727280 :         const BlockGeom * blk_geom = get_blk_geom_mds(blk_idx);
    1040     3727270 :         context_ptr->md_local_cu_unit[blk_idx].avail_blk_flag = EB_FALSE;
    1041     3727270 :         if (blk_geom->shape == PART_N)
    1042             :         {
    1043     1186890 :             context_ptr->md_cu_arr_nsq[blk_idx].split_flag = EB_TRUE;
    1044     1186890 :             context_ptr->md_cu_arr_nsq[blk_idx].part = PARTITION_SPLIT;
    1045     1186890 :             context_ptr->md_local_cu_unit[blk_idx].tested_cu_flag = EB_FALSE;
    1046             :         }
    1047     3727270 :         ++blk_idx;
    1048     3727270 :     } while (blk_idx < sequence_control_set_ptr->max_block_cnt);
    1049        3590 : }
    1050        3599 : void init_sq_non4_block(
    1051             :     SequenceControlSet    *sequence_control_set_ptr,
    1052             :     ModeDecisionContext   *context_ptr){
    1053     1195380 :     for (uint32_t blk_idx = 0; blk_idx < TOTAL_SQ_BLOCK_COUNT; blk_idx++){
    1054     1191780 :         context_ptr->md_cu_arr_nsq[sq_block_index[blk_idx]].part = PARTITION_SPLIT;
    1055     1191780 :         context_ptr->md_local_cu_unit[sq_block_index[blk_idx]].tested_cu_flag = EB_FALSE;
    1056             :     }
    1057     3793120 :     for(uint32_t blk_idx = 0; blk_idx < sequence_control_set_ptr->max_block_cnt; ++blk_idx){
    1058     3789530 :         context_ptr->md_local_cu_unit[blk_idx].avail_blk_flag = EB_FALSE;
    1059             :     }
    1060        3599 : }
    1061           0 : static INLINE TranHigh check_range(TranHigh input, int32_t bd) {
    1062             :     // AV1 TX case
    1063             :     // - 8 bit: signed 16 bit integer
    1064             :     // - 10 bit: signed 18 bit integer
    1065             :     // - 12 bit: signed 20 bit integer
    1066             :     // - max quantization error = 1828 << (bd - 8)
    1067           0 :     const int32_t int_max = (1 << (7 + bd)) - 1 + (914 << (bd - 7));
    1068           0 :     const int32_t int_min = -int_max - 1;
    1069             : #if CONFIG_COEFFICIENT_RANGE_CHECKING
    1070             :     assert(int_min <= input);
    1071             :     assert(input <= int_max);
    1072             : #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    1073           0 :     return (TranHigh)clamp64(input, int_min, int_max);
    1074             : }
    1075             : 
    1076             : #define HIGHBD_WRAPLOW(x, bd) ((int32_t)check_range((x), bd))
    1077           0 : static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, TranHigh trans,
    1078             :     int32_t bd) {
    1079           0 :     trans = HIGHBD_WRAPLOW(trans, bd);
    1080           0 :     return clip_pixel_highbd(dest + (int32_t)trans, bd);
    1081             : }
    1082             : 
    1083             : /*********************************
    1084             : * Picture Single Channel Kernel
    1085             : *********************************/
    1086           0 : void picture_addition_kernel(
    1087             :     uint8_t  *pred_ptr,
    1088             :     uint32_t  pred_stride,
    1089             :     int32_t *residual_ptr,
    1090             :     uint32_t  residual_stride,
    1091             :     uint8_t  *recon_ptr,
    1092             :     uint32_t  recon_stride,
    1093             :     uint32_t  width,
    1094             :     uint32_t  height,
    1095             :     int32_t     bd)
    1096             : {
    1097             :     uint32_t          columnIndex;
    1098           0 :     uint32_t          row_index = 0;
    1099             :     //    const int32_t    maxValue = 0xFF;
    1100             : 
    1101             :         //printf("\n");
    1102             :         //printf("Reconstruction---------------------------------------------------\n");
    1103             : 
    1104           0 :     while (row_index < height) {
    1105           0 :         columnIndex = 0;
    1106           0 :         while (columnIndex < width) {
    1107             :             //recon_ptr[columnIndex] = (uint8_t)CLIP3(0, maxValue, ((int32_t)residual_ptr[columnIndex]) + ((int32_t)pred_ptr[columnIndex]));
    1108           0 :             uint16_t rec = (uint16_t)pred_ptr[columnIndex];
    1109           0 :             recon_ptr[columnIndex] = (uint8_t)highbd_clip_pixel_add(rec, (TranLow)residual_ptr[columnIndex], bd);
    1110             : 
    1111             :             //printf("%d\t", recon_ptr[columnIndex]);
    1112           0 :             ++columnIndex;
    1113             :         }
    1114             : 
    1115             :         //printf("\n");
    1116           0 :         residual_ptr += residual_stride;
    1117           0 :         pred_ptr += pred_stride;
    1118           0 :         recon_ptr += recon_stride;
    1119           0 :         ++row_index;
    1120             :     }
    1121             :     //printf("-----------------------------------------------------------------\n");
    1122             :     //printf("\n");
    1123             :     //printf("\n");
    1124           0 :     return;
    1125             : }
    1126             : 
    1127           0 : void picture_addition_kernel16_bit(
    1128             :     uint16_t  *pred_ptr,
    1129             :     uint32_t  pred_stride,
    1130             :     int32_t *residual_ptr,
    1131             :     uint32_t  residual_stride,
    1132             :     uint16_t  *recon_ptr,
    1133             :     uint32_t  recon_stride,
    1134             :     uint32_t  width,
    1135             :     uint32_t  height,
    1136             :     int32_t     bd)
    1137             : {
    1138             :     uint32_t          columnIndex;
    1139           0 :     uint32_t          row_index = 0;
    1140             :     //    const int32_t    maxValue = 0xFF;
    1141             : 
    1142             :         //printf("\n");
    1143             :         //printf("Reconstruction---------------------------------------------------\n");
    1144             : 
    1145           0 :     while (row_index < height) {
    1146           0 :         columnIndex = 0;
    1147           0 :         while (columnIndex < width) {
    1148             :             //recon_ptr[columnIndex] = (uint8_t)CLIP3(0, maxValue, ((int32_t)residual_ptr[columnIndex]) + ((int32_t)pred_ptr[columnIndex]));
    1149           0 :             uint16_t rec = (uint16_t)pred_ptr[columnIndex];
    1150           0 :             recon_ptr[columnIndex] = highbd_clip_pixel_add(rec, (TranLow)residual_ptr[columnIndex], bd);
    1151             : 
    1152             :             //printf("%d\t", recon_ptr[columnIndex]);
    1153           0 :             ++columnIndex;
    1154             :         }
    1155             : 
    1156             :         //printf("\n");
    1157           0 :         residual_ptr += residual_stride;
    1158           0 :         pred_ptr += pred_stride;
    1159           0 :         recon_ptr += recon_stride;
    1160           0 :         ++row_index;
    1161             :     }
    1162             :     //    printf("-----------------------------------------------------------------\n");
    1163             :     //    printf("\n");
    1164             :     //    printf("\n");
    1165           0 :     return;
    1166             : }
    1167             : 
    1168      519775 : void AV1PerformInverseTransformReconLuma(
    1169             :     PictureControlSet               *picture_control_set_ptr,
    1170             :     ModeDecisionContext             *context_ptr,
    1171             :     ModeDecisionCandidateBuffer     *candidate_buffer)
    1172             : {
    1173             :     uint32_t   tu_width;
    1174             :     uint32_t   tu_height;
    1175             :     uint32_t   txb_origin_x;
    1176             :     uint32_t   txb_origin_y;
    1177             :     uint32_t   tu_origin_index;
    1178             :     uint32_t   tuTotalCount;
    1179             :     uint32_t   txb_itr;
    1180             : 
    1181      519775 :     if (picture_control_set_ptr->intra_md_open_loop_flag == EB_FALSE) {
    1182      519775 :         uint8_t tx_depth = candidate_buffer->candidate_ptr->tx_depth;
    1183      519775 :         tuTotalCount = context_ptr->blk_geom->txb_count[tx_depth];
    1184      519775 :         txb_itr = 0;
    1185      519775 :         uint32_t txb_1d_offset = 0;
    1186             :         do {
    1187      869067 :             txb_origin_x = context_ptr->blk_geom->tx_org_x[tx_depth][txb_itr];
    1188      869067 :             txb_origin_y = context_ptr->blk_geom->tx_org_y[tx_depth][txb_itr];
    1189      869067 :             tu_width = context_ptr->blk_geom->tx_width[tx_depth][txb_itr];
    1190      869067 :             tu_height = context_ptr->blk_geom->tx_height[tx_depth][txb_itr];
    1191      869067 :             tu_origin_index = txb_origin_x + txb_origin_y * candidate_buffer->prediction_ptr->stride_y;
    1192      869067 :             uint32_t recLumaOffset = txb_origin_x + txb_origin_y * candidate_buffer->recon_ptr->stride_y;
    1193      869067 :             uint32_t y_has_coeff = (candidate_buffer->candidate_ptr->y_has_coeff & (1 << txb_itr)) > 0;
    1194             : 
    1195      869067 :             if (y_has_coeff)
    1196      672988 :                 inv_transform_recon_wrapper(
    1197      672988 :                     candidate_buffer->prediction_ptr->buffer_y,
    1198             :                     tu_origin_index,
    1199      672988 :                     candidate_buffer->prediction_ptr->stride_y,
    1200      672988 :                     context_ptr->hbd_mode_decision ? (uint8_t *)context_ptr->cfl_temp_luma_recon16bit : context_ptr->cfl_temp_luma_recon,
    1201             :                     recLumaOffset,
    1202      672988 :                     candidate_buffer->recon_ptr->stride_y,
    1203      672988 :                     (int32_t*) candidate_buffer->recon_coeff_ptr->buffer_y,
    1204             :                     txb_1d_offset,
    1205      672988 :                     context_ptr->hbd_mode_decision,
    1206      672988 :                     context_ptr->blk_geom->txsize[tx_depth][txb_itr],
    1207      672988 :                     candidate_buffer->candidate_ptr->transform_type[txb_itr],
    1208             :                     PLANE_TYPE_Y,
    1209      672988 :                     (uint32_t)candidate_buffer->candidate_ptr->eob[0][txb_itr]);
    1210             :             else {
    1211      196079 :                 if (context_ptr->hbd_mode_decision) {
    1212           0 :                     pic_copy_kernel_16bit(
    1213           0 :                         ((uint16_t *) candidate_buffer->prediction_ptr->buffer_y) + tu_origin_index,
    1214           0 :                         candidate_buffer->prediction_ptr->stride_y,
    1215           0 :                         context_ptr->cfl_temp_luma_recon16bit + recLumaOffset,
    1216           0 :                         candidate_buffer->recon_ptr->stride_y,
    1217             :                         tu_width,
    1218             :                         tu_height);
    1219             :                 } else {
    1220      196079 :                     pic_copy_kernel_8bit(
    1221      196079 :                         &(candidate_buffer->prediction_ptr->buffer_y[tu_origin_index]),
    1222      196079 :                         candidate_buffer->prediction_ptr->stride_y,
    1223      196079 :                         &(context_ptr->cfl_temp_luma_recon[recLumaOffset]),
    1224      196079 :                         candidate_buffer->recon_ptr->stride_y,
    1225             :                         tu_width,
    1226             :                         tu_height);
    1227             :                 }
    1228             :             }
    1229      869062 :             txb_1d_offset += context_ptr->blk_geom->tx_width[tx_depth][txb_itr] * context_ptr->blk_geom->tx_height[tx_depth][txb_itr];
    1230      869062 :             ++txb_itr;
    1231      869062 :         } while (txb_itr < tuTotalCount);
    1232             :     }
    1233      519770 : }
    1234      811320 : void AV1PerformInverseTransformRecon(
    1235             :     PictureControlSet               *picture_control_set_ptr,
    1236             :     ModeDecisionContext             *context_ptr,
    1237             :     ModeDecisionCandidateBuffer     *candidate_buffer,
    1238             :     CodingUnit                      *cu_ptr,
    1239             :     const BlockGeom                   *blk_geom)
    1240             : {
    1241             :     uint32_t                           tu_width;
    1242             :     uint32_t                           tu_height;
    1243             :     uint32_t                           txb_origin_x;
    1244             :     uint32_t                           txb_origin_y;
    1245             :     uint32_t                           tu_origin_index;
    1246             :     uint32_t                           tuTotalCount;
    1247             :     uint32_t                           tu_index;
    1248             :     uint32_t                           txb_itr;
    1249             :     TransformUnit                   *txb_ptr;
    1250             : 
    1251             :     UNUSED(blk_geom);
    1252             : 
    1253      811320 :     if (picture_control_set_ptr->intra_md_open_loop_flag == EB_FALSE) {
    1254      811360 :         uint8_t tx_depth = candidate_buffer->candidate_ptr->tx_depth;
    1255      811360 :         tuTotalCount = context_ptr->blk_geom->txb_count[tx_depth];
    1256      811360 :         tu_index = 0;
    1257      811360 :         txb_itr = 0;
    1258      811360 :         uint32_t txb_1d_offset = 0, txb_1d_offset_uv = 0;
    1259             :         uint32_t recLumaOffset, recCbOffset, recCrOffset;
    1260             : 
    1261             :         do {
    1262      877594 :             txb_origin_x = context_ptr->blk_geom->tx_org_x[tx_depth][txb_itr];
    1263      877594 :             txb_origin_y = context_ptr->blk_geom->tx_org_y[tx_depth][txb_itr];
    1264      877594 :             tu_width = context_ptr->blk_geom->tx_width[tx_depth][txb_itr];
    1265      877594 :             tu_height = context_ptr->blk_geom->tx_height[tx_depth][txb_itr];
    1266      877594 :             txb_ptr = &cu_ptr->transform_unit_array[tu_index];
    1267      877594 :             recLumaOffset = context_ptr->blk_geom->tx_org_x[tx_depth][txb_itr] + context_ptr->blk_geom->tx_org_y[tx_depth][txb_itr] * candidate_buffer->recon_ptr->stride_y;
    1268      877594 :             recCbOffset = ((((context_ptr->blk_geom->tx_org_x[tx_depth][txb_itr] >> 3) << 3) + ((context_ptr->blk_geom->tx_org_y[tx_depth][txb_itr] >> 3) << 3) * candidate_buffer->recon_ptr->stride_cb) >> 1);
    1269      877594 :             recCrOffset = ((((context_ptr->blk_geom->tx_org_x[tx_depth][txb_itr] >> 3) << 3) + ((context_ptr->blk_geom->tx_org_y[tx_depth][txb_itr] >> 3) << 3) * candidate_buffer->recon_ptr->stride_cr) >> 1);
    1270      877594 :             tu_origin_index = txb_origin_x + txb_origin_y * candidate_buffer->prediction_ptr->stride_y;
    1271      877594 :             if (txb_ptr->y_has_coeff)
    1272      156282 :                 inv_transform_recon_wrapper(
    1273      156282 :                     candidate_buffer->prediction_ptr->buffer_y,
    1274             :                     tu_origin_index,
    1275      156282 :                     candidate_buffer->prediction_ptr->stride_y,
    1276      156282 :                     candidate_buffer->recon_ptr->buffer_y,
    1277             :                     recLumaOffset,
    1278      156282 :                     candidate_buffer->recon_ptr->stride_y,
    1279      156282 :                     (int32_t*) candidate_buffer->recon_coeff_ptr->buffer_y,
    1280             :                     txb_1d_offset,
    1281      156282 :                     context_ptr->hbd_mode_decision,
    1282      156282 :                     context_ptr->blk_geom->txsize[tx_depth][txb_itr],
    1283      156282 :                     candidate_buffer->candidate_ptr->transform_type[txb_itr],
    1284             :                     PLANE_TYPE_Y,
    1285      156282 :                     (uint32_t)candidate_buffer->candidate_ptr->eob[0][txb_itr]);
    1286             :             else
    1287      721312 :                 picture_copy(
    1288             :                     candidate_buffer->prediction_ptr,
    1289             :                     tu_origin_index,
    1290             :                     0,//tu_chroma_origin_index,
    1291             :                     candidate_buffer->recon_ptr,
    1292             :                     recLumaOffset,
    1293             :                     0,//tu_chroma_origin_index,
    1294             :                     tu_width,
    1295             :                     tu_height,
    1296             :                     0,//chromaTuSize,
    1297             :                     0,//chromaTuSize,
    1298             :                     PICTURE_BUFFER_DESC_Y_FLAG,
    1299      721312 :                     context_ptr->hbd_mode_decision);
    1300             : 
    1301             :             //CHROMA
    1302      877562 :             uint8_t tx_depth = candidate_buffer->candidate_ptr->tx_depth;
    1303      877562 :             if (tx_depth == 0 || txb_itr == 0) {
    1304      811399 :             if (context_ptr->chroma_level <= CHROMA_MODE_1)
    1305             :             {
    1306      735760 :             uint32_t chroma_tu_width = tx_size_wide[context_ptr->blk_geom->txsize_uv[tx_depth][txb_itr]];
    1307      735760 :             uint32_t chroma_tu_height = tx_size_high[context_ptr->blk_geom->txsize_uv[tx_depth][txb_itr]];
    1308      735760 :             uint32_t cbTuChromaOriginIndex = ((((txb_origin_x >> 3) << 3) + ((txb_origin_y >> 3) << 3) * candidate_buffer->recon_coeff_ptr->stride_cb) >> 1);
    1309      735760 :             uint32_t crTuChromaOriginIndex = ((((txb_origin_x >> 3) << 3) + ((txb_origin_y >> 3) << 3) * candidate_buffer->recon_coeff_ptr->stride_cr) >> 1);
    1310             : 
    1311      735760 :             if (context_ptr->blk_geom->has_uv && txb_ptr->u_has_coeff)
    1312       23534 :                     inv_transform_recon_wrapper(
    1313       23534 :                         candidate_buffer->prediction_ptr->buffer_cb,
    1314             :                         cbTuChromaOriginIndex,
    1315       23534 :                         candidate_buffer->prediction_ptr->stride_cb,
    1316       23534 :                         candidate_buffer->recon_ptr->buffer_cb,
    1317             :                         recCbOffset,
    1318       23534 :                         candidate_buffer->recon_ptr->stride_cb,
    1319       23534 :                         (int32_t*) candidate_buffer->recon_coeff_ptr->buffer_cb,
    1320             :                         txb_1d_offset_uv,
    1321       23534 :                         context_ptr->hbd_mode_decision,
    1322       23534 :                         context_ptr->blk_geom->txsize_uv[tx_depth][txb_itr],
    1323       23534 :                         candidate_buffer->candidate_ptr->transform_type_uv,
    1324             :                         PLANE_TYPE_UV,
    1325       23534 :                         (uint32_t)candidate_buffer->candidate_ptr->eob[1][txb_itr]);
    1326             :                 else
    1327      712226 :                     picture_copy(
    1328             :                         candidate_buffer->prediction_ptr,
    1329             :                         0,
    1330             :                         cbTuChromaOriginIndex,
    1331             :                         candidate_buffer->recon_ptr,
    1332             :                         0,
    1333             :                         recCbOffset,
    1334             :                         0,
    1335             :                         0,
    1336             :                         chroma_tu_width,
    1337             :                         chroma_tu_height,
    1338             :                         PICTURE_BUFFER_DESC_Cb_FLAG,
    1339      712226 :                         context_ptr->hbd_mode_decision);
    1340             : 
    1341             : 
    1342      735757 :             if (context_ptr->blk_geom->has_uv && txb_ptr->v_has_coeff)
    1343       14134 :                     inv_transform_recon_wrapper(
    1344       14134 :                         candidate_buffer->prediction_ptr->buffer_cr,
    1345             :                         crTuChromaOriginIndex,
    1346       14134 :                         candidate_buffer->prediction_ptr->stride_cr,
    1347       14134 :                         candidate_buffer->recon_ptr->buffer_cr,
    1348             :                         recCrOffset,
    1349       14134 :                         candidate_buffer->recon_ptr->stride_cr,
    1350       14134 :                         (int32_t*) candidate_buffer->recon_coeff_ptr->buffer_cr,
    1351             :                         txb_1d_offset_uv,
    1352       14134 :                         context_ptr->hbd_mode_decision,
    1353       14134 :                         context_ptr->blk_geom->txsize_uv[tx_depth][txb_itr],
    1354       14134 :                         candidate_buffer->candidate_ptr->transform_type_uv,
    1355             :                         PLANE_TYPE_UV,
    1356       14134 :                         (uint32_t)candidate_buffer->candidate_ptr->eob[2][txb_itr]);
    1357             :                 else
    1358      721623 :                     picture_copy(
    1359             :                         candidate_buffer->prediction_ptr,
    1360             :                         0,
    1361             :                         crTuChromaOriginIndex,
    1362             :                         candidate_buffer->recon_ptr,
    1363             :                         0,
    1364             :                         recCrOffset,
    1365             :                         0,
    1366             :                         0,
    1367             :                         chroma_tu_width,
    1368             :                         chroma_tu_height,
    1369             :                         PICTURE_BUFFER_DESC_Cr_FLAG,
    1370      721623 :                         context_ptr->hbd_mode_decision);
    1371             : 
    1372      735756 :                 if (context_ptr->blk_geom->has_uv)
    1373      551032 :                     txb_1d_offset_uv += context_ptr->blk_geom->tx_width_uv[tx_depth][txb_itr] * context_ptr->blk_geom->tx_height_uv[tx_depth][txb_itr];
    1374             :             }
    1375             :             }
    1376      877558 :             txb_1d_offset += context_ptr->blk_geom->tx_width[tx_depth][txb_itr] * context_ptr->blk_geom->tx_height[tx_depth][txb_itr];
    1377      877558 :             ++tu_index;
    1378      877558 :             ++txb_itr;
    1379      877558 :         } while (txb_itr < tuTotalCount);
    1380             :     }
    1381      811284 : }
    1382             : 
    1383             : /*******************************************
    1384             : * Coding Loop - Fast Loop Initialization
    1385             : *******************************************/
    1386      811363 : void ProductCodingLoopInitFastLoop(
    1387             :     ModeDecisionContext      *context_ptr,
    1388             :     NeighborArrayUnit        *skip_coeff_neighbor_array,
    1389             :     NeighborArrayUnit        *inter_pred_dir_neighbor_array,
    1390             :     NeighborArrayUnit        *ref_frame_type_neighbor_array,
    1391             :     NeighborArrayUnit        *intra_luma_mode_neighbor_array,
    1392             :     NeighborArrayUnit        *skip_flag_neighbor_array,
    1393             :     NeighborArrayUnit        *mode_type_neighbor_array,
    1394             :     NeighborArrayUnit        *leaf_depth_neighbor_array,
    1395             :     NeighborArrayUnit        *leaf_partition_neighbor_array
    1396             : )
    1397             : {
    1398      811363 :     context_ptr->tx_depth = context_ptr->cu_ptr->tx_depth = 0;
    1399             :     // Generate Split, Skip and intra mode contexts for the rate estimation
    1400      811363 :     coding_loop_context_generation(
    1401             :         context_ptr,
    1402             :         context_ptr->cu_ptr,
    1403      811363 :         context_ptr->cu_origin_x,
    1404      811363 :         context_ptr->cu_origin_y,
    1405             :         BLOCK_SIZE_64,
    1406             :         skip_coeff_neighbor_array,
    1407             :         inter_pred_dir_neighbor_array,
    1408             :         ref_frame_type_neighbor_array,
    1409             :         intra_luma_mode_neighbor_array,
    1410             :         skip_flag_neighbor_array,
    1411             :         mode_type_neighbor_array,
    1412             :         leaf_depth_neighbor_array,
    1413             :         leaf_partition_neighbor_array);
    1414   109474000 :     for (uint32_t index = 0; index < MAX_NFL_BUFF; ++index)
    1415   108662000 :         context_ptr->fast_cost_array[index] = MAX_CU_COST;
    1416      811344 :     return;
    1417             : }
    1418             : 
    1419   111293000 : void fast_loop_core(
    1420             :     ModeDecisionCandidateBuffer *candidate_buffer,
    1421             :     PictureControlSet           *picture_control_set_ptr,
    1422             :     ModeDecisionContext         *context_ptr,
    1423             :     EbPictureBufferDesc         *input_picture_ptr,
    1424             :     uint32_t                     input_origin_index,
    1425             :     uint32_t                     input_cb_origin_index,
    1426             :     uint32_t                     input_cr_origin_index,
    1427             :     CodingUnit                  *cu_ptr,
    1428             :     uint32_t                     cu_origin_index,
    1429             :     uint32_t                     cu_chroma_origin_index,
    1430             :     EbBool                       use_ssd)
    1431             : {
    1432             :     uint64_t lumaFastDistortion;
    1433             :     uint64_t chromaFastDistortion;
    1434             : 
    1435   111293000 :     ModeDecisionCandidate       *candidate_ptr = candidate_buffer->candidate_ptr;
    1436   111293000 :     EbPictureBufferDesc         *prediction_ptr = candidate_buffer->prediction_ptr;
    1437   111293000 :     context_ptr->pu_itr = 0;
    1438             :     // Prediction
    1439             :     // Set default interp_filters
    1440   111293000 :     candidate_buffer->candidate_ptr->interp_filters = (context_ptr->md_staging_use_bilinear) ? av1_make_interp_filters(BILINEAR, BILINEAR) : 0;
    1441   111256000 :     ProductPredictionFunTable[candidate_buffer->candidate_ptr->use_intrabc ? INTER_MODE : candidate_ptr->type](
    1442             :         context_ptr,
    1443             :         picture_control_set_ptr,
    1444             :         candidate_buffer);
    1445             : 
    1446             :     // Distortion
    1447             :     // Y
    1448   111308000 :     if (use_ssd) {
    1449           0 :         EbSpatialFullDistType spatial_full_dist_type_fun = context_ptr->hbd_mode_decision ?
    1450           0 :             full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    1451             : 
    1452           0 :         candidate_buffer->candidate_ptr->luma_fast_distortion = (uint32_t)(lumaFastDistortion = spatial_full_dist_type_fun(
    1453             :             input_picture_ptr->buffer_y,
    1454             :             input_origin_index,
    1455           0 :             input_picture_ptr->stride_y,
    1456             :             prediction_ptr->buffer_y,
    1457             :             cu_origin_index,
    1458           0 :             prediction_ptr->stride_y,
    1459           0 :             context_ptr->blk_geom->bwidth,
    1460           0 :             context_ptr->blk_geom->bheight));
    1461             :     }
    1462             :     else {
    1463   111308000 :         assert((context_ptr->blk_geom->bwidth >> 3) < 17);
    1464   111308000 :         if (!context_ptr->hbd_mode_decision) {
    1465   111411000 :             candidate_buffer->candidate_ptr->luma_fast_distortion = (uint32_t)(lumaFastDistortion = nxm_sad_kernel_sub_sampled(
    1466   111344000 :                 input_picture_ptr->buffer_y + input_origin_index,
    1467   111344000 :                 input_picture_ptr->stride_y,
    1468   111344000 :                 prediction_ptr->buffer_y + cu_origin_index,
    1469   111344000 :                 prediction_ptr->stride_y,
    1470   111344000 :                 context_ptr->blk_geom->bheight,
    1471   111344000 :                 context_ptr->blk_geom->bwidth));
    1472             :         }
    1473             :         else {
    1474           0 :             candidate_buffer->candidate_ptr->luma_fast_distortion = (uint32_t)(lumaFastDistortion = sad_16b_kernel(
    1475           0 :                 ((uint16_t *)input_picture_ptr->buffer_y) + input_origin_index,
    1476           0 :                 input_picture_ptr->stride_y,
    1477           0 :                 ((uint16_t *)prediction_ptr->buffer_y) + cu_origin_index,
    1478           0 :                 prediction_ptr->stride_y,
    1479           0 :                 context_ptr->blk_geom->bheight,
    1480           0 :                 context_ptr->blk_geom->bwidth));
    1481             :         }
    1482             :     }
    1483             : 
    1484   111411000 :     if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1 && context_ptr->md_staging_skip_inter_chroma_pred == EB_FALSE) {
    1485     6904200 :         if (use_ssd) {
    1486           0 :             EbSpatialFullDistType spatial_full_dist_type_fun = context_ptr->hbd_mode_decision ?
    1487           0 :                 full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    1488             : 
    1489           0 :             chromaFastDistortion = spatial_full_dist_type_fun(
    1490             :                 input_picture_ptr->buffer_cb,
    1491             :                 input_cb_origin_index,
    1492           0 :                 input_picture_ptr->stride_cb,
    1493           0 :                 candidate_buffer->prediction_ptr->buffer_cb,
    1494             :                 cu_chroma_origin_index,
    1495           0 :                 prediction_ptr->stride_cb,
    1496           0 :                 context_ptr->blk_geom->bwidth_uv,
    1497           0 :                 context_ptr->blk_geom->bheight_uv);
    1498             : 
    1499           0 :             chromaFastDistortion += spatial_full_dist_type_fun(
    1500             :                 input_picture_ptr->buffer_cr,
    1501             :                 input_cr_origin_index,
    1502           0 :                 input_picture_ptr->stride_cb,
    1503           0 :                 candidate_buffer->prediction_ptr->buffer_cr,
    1504             :                 cu_chroma_origin_index,
    1505           0 :                 prediction_ptr->stride_cr,
    1506           0 :                 context_ptr->blk_geom->bwidth_uv,
    1507           0 :                 context_ptr->blk_geom->bheight_uv);
    1508             :         }
    1509             :         else {
    1510     6904200 :             assert((context_ptr->blk_geom->bwidth_uv >> 3) < 17);
    1511             : 
    1512     6904200 :             if (!context_ptr->hbd_mode_decision) {
    1513    13808400 :                 chromaFastDistortion = nxm_sad_kernel_sub_sampled(
    1514     6904230 :                     input_picture_ptr->buffer_cb + input_cb_origin_index,
    1515     6904230 :                     input_picture_ptr->stride_cb,
    1516     6904230 :                     candidate_buffer->prediction_ptr->buffer_cb + cu_chroma_origin_index,
    1517     6904230 :                     prediction_ptr->stride_cb,
    1518     6904230 :                     context_ptr->blk_geom->bheight_uv,
    1519     6904230 :                     context_ptr->blk_geom->bwidth_uv);
    1520             : 
    1521     6904070 :                 chromaFastDistortion += nxm_sad_kernel_sub_sampled(
    1522     6904190 :                     input_picture_ptr->buffer_cr + input_cr_origin_index,
    1523     6904190 :                     input_picture_ptr->stride_cr,
    1524     6904190 :                     candidate_buffer->prediction_ptr->buffer_cr + cu_chroma_origin_index,
    1525     6904190 :                     prediction_ptr->stride_cr,
    1526     6904190 :                     context_ptr->blk_geom->bheight_uv,
    1527     6904190 :                     context_ptr->blk_geom->bwidth_uv);
    1528             :             }
    1529             :             else {
    1530           0 :                 chromaFastDistortion = sad_16b_kernel(
    1531           0 :                     ((uint16_t *)input_picture_ptr->buffer_cb) + input_cb_origin_index,
    1532           0 :                     input_picture_ptr->stride_cb,
    1533           0 :                     ((uint16_t *)candidate_buffer->prediction_ptr->buffer_cb) + cu_chroma_origin_index,
    1534           0 :                     prediction_ptr->stride_cb,
    1535           0 :                     context_ptr->blk_geom->bheight_uv,
    1536           0 :                     context_ptr->blk_geom->bwidth_uv);
    1537             : 
    1538          15 :                 chromaFastDistortion += sad_16b_kernel(
    1539           0 :                     ((uint16_t *)input_picture_ptr->buffer_cr) + input_cr_origin_index,
    1540           0 :                     input_picture_ptr->stride_cr,
    1541           0 :                     ((uint16_t *)candidate_buffer->prediction_ptr->buffer_cr) + cu_chroma_origin_index,
    1542           0 :                     prediction_ptr->stride_cr,
    1543           0 :                     context_ptr->blk_geom->bheight_uv,
    1544           0 :                     context_ptr->blk_geom->bwidth_uv);
    1545             :             }
    1546             :         }
    1547             :     }
    1548             :     else
    1549   104507000 :         chromaFastDistortion = 0;
    1550             :     // Fast Cost
    1551   445299000 :     *(candidate_buffer->fast_cost_ptr) = Av1ProductFastCostFuncTable[candidate_ptr->type](
    1552             :         cu_ptr,
    1553   111411000 :         candidate_buffer->candidate_ptr,
    1554   111411000 :         cu_ptr->qp,
    1555             :         lumaFastDistortion,
    1556             :         chromaFastDistortion,
    1557   111411000 :         use_ssd ? context_ptr->full_lambda : context_ptr->fast_lambda,
    1558             :         use_ssd,
    1559             :         picture_control_set_ptr,
    1560   111411000 :         &(context_ptr->md_local_cu_unit[context_ptr->blk_geom->blkidx_mds].ed_ref_mv_stack[candidate_ptr->ref_frame_type][0]),
    1561             :         context_ptr->blk_geom,
    1562   111411000 :         context_ptr->cu_origin_y >> MI_SIZE_LOG2,
    1563   111411000 :         context_ptr->cu_origin_x >> MI_SIZE_LOG2,
    1564             :         1,
    1565   111411000 :         context_ptr->intra_luma_left_mode,
    1566   111411000 :         context_ptr->intra_luma_top_mode);
    1567   111066000 : }
    1568             : #if REMOVE_MD_STAGE_1
    1569      811174 : void set_md_stage_counts(
    1570             :     PictureControlSet       *picture_control_set_ptr,
    1571             :     ModeDecisionContext     *context_ptr,
    1572             :     uint32_t                 fastCandidateTotalCount)
    1573             : {
    1574      811174 :     SequenceControlSet* scs = (SequenceControlSet*)(picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr);
    1575             : 
    1576             :     // Step 1: derive bypass_stage1 flags
    1577      811174 :     if (context_ptr->md_staging_mode == MD_STAGING_MODE_1)
    1578      735751 :         memset(context_ptr->bypass_md_stage_1, EB_FALSE, CAND_CLASS_TOTAL);
    1579             :     else
    1580       75423 :         memset(context_ptr->bypass_md_stage_1, EB_TRUE, CAND_CLASS_TOTAL);
    1581             : 
    1582             :     // Step 2: set md_stage count
    1583      811174 :     context_ptr->md_stage_1_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? fastCandidateTotalCount : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? INTRA_NFL : (INTRA_NFL >> 1);
    1584      811174 :     context_ptr->md_stage_1_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? INTER_NEW_NFL : (INTER_NEW_NFL >> 1);
    1585      811174 :     context_ptr->md_stage_1_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? INTER_PRED_NFL : (INTER_PRED_NFL >> 1);
    1586      811174 :     context_ptr->md_stage_1_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? INTER_PRED_NFL : (INTER_PRED_NFL >> 1);
    1587             : #if II_COMP_FLAG
    1588      811174 :     context_ptr->md_stage_1_count[CAND_CLASS_4] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 14 : 6;
    1589             : #endif
    1590             : #if OBMC_FLAG
    1591      811174 :     context_ptr->md_stage_1_count[CAND_CLASS_5] = 16;
    1592             : #endif
    1593             : #if FILTER_INTRA_FLAG
    1594      811174 :     context_ptr->md_stage_1_count[CAND_CLASS_6] = (picture_control_set_ptr->temporal_layer_index == 0) ? 10 : 5;
    1595             : #endif
    1596             : #if PAL_CLASS
    1597      811174 :     context_ptr->md_stage_1_count[CAND_CLASS_7] = 12;
    1598             : #endif
    1599      811174 :     context_ptr->md_stage_1_count[CAND_CLASS_8] = (picture_control_set_ptr->temporal_layer_index == 0) ? 5 : 4;
    1600      811174 :     if (context_ptr->combine_class12) {
    1601       75636 :         context_ptr->md_stage_1_count[CAND_CLASS_1] = context_ptr->md_stage_1_count[CAND_CLASS_1] * 2;
    1602             :     }
    1603      811174 :     if (picture_control_set_ptr->enc_mode >= ENC_M2) {
    1604       75640 :         context_ptr->md_stage_1_count[CAND_CLASS_1] = context_ptr->md_stage_1_count[CAND_CLASS_1] / 2;
    1605       75640 :         context_ptr->md_stage_1_count[CAND_CLASS_2] = context_ptr->md_stage_1_count[CAND_CLASS_2] / 2;
    1606       75640 :         context_ptr->md_stage_1_count[CAND_CLASS_3] = context_ptr->md_stage_1_count[CAND_CLASS_3] / 2;
    1607             :     }
    1608             : 
    1609             : 
    1610      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? 10 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? ((scs->input_resolution >= INPUT_SIZE_1080i_RANGE) ? 7 : 10) : 4;
    1611             : #if FILTER_INTRA_FLAG
    1612      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_6] = (picture_control_set_ptr->temporal_layer_index == 0) ? 5 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 2;
    1613             : #endif
    1614             : #if PAL_CLASS
    1615      811174 :     if (picture_control_set_ptr->parent_pcs_ptr->palette_mode == 1)
    1616           0 :         context_ptr->md_stage_2_count[CAND_CLASS_7] =
    1617           0 :         (picture_control_set_ptr->temporal_layer_index == 0) ? 7 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 4 : 4;
    1618      811174 :     else if (picture_control_set_ptr->parent_pcs_ptr->palette_mode == 2 || picture_control_set_ptr->parent_pcs_ptr->palette_mode == 3)
    1619           0 :         context_ptr->md_stage_2_count[CAND_CLASS_7] =
    1620           0 :         (picture_control_set_ptr->temporal_layer_index == 0) ? 7 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 2;
    1621      811411 :     else if (picture_control_set_ptr->parent_pcs_ptr->palette_mode == 4 || picture_control_set_ptr->parent_pcs_ptr->palette_mode == 5)
    1622           4 :         context_ptr->md_stage_2_count[CAND_CLASS_7] =
    1623           4 :         (picture_control_set_ptr->temporal_layer_index == 0) ? 4 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1624             :     else
    1625      811407 :         context_ptr->md_stage_2_count[CAND_CLASS_7] =
    1626      811407 :         (picture_control_set_ptr->temporal_layer_index == 0) ? 2 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 1 : 1;
    1627             : #endif
    1628      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_8] = (picture_control_set_ptr->temporal_layer_index == 0) ? 4 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 2;
    1629             : #if REMOVE_MD_STAGE_1
    1630      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 6 : 3;
    1631      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 6 : 3;
    1632      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 6 : 3;
    1633             : #else
    1634             :     context_ptr->md_stage_2_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 1;
    1635             :     context_ptr->md_stage_2_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 1;
    1636             :     context_ptr->md_stage_2_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 1;
    1637             : #endif
    1638             : #if II_COMP_FLAG
    1639      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_4] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 12 : 4;// 14 : 4;
    1640             : #endif
    1641             : #if OBMC_FLAG
    1642      811174 :     if (picture_control_set_ptr->parent_pcs_ptr->pic_obmc_mode == 1)
    1643           0 :         context_ptr->md_stage_2_count[CAND_CLASS_5] = 14;
    1644      811174 :     else if (picture_control_set_ptr->parent_pcs_ptr->pic_obmc_mode <= 3)
    1645      811411 :         context_ptr->md_stage_2_count[CAND_CLASS_5] = (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 12 : 4;
    1646             :     else
    1647           0 :         context_ptr->md_stage_2_count[CAND_CLASS_5] = (picture_control_set_ptr->temporal_layer_index == 0) ? 12 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 8 : 4;
    1648             : #endif
    1649             : 
    1650      811174 :     if (context_ptr->combine_class12) {
    1651       75645 :         context_ptr->md_stage_2_count[CAND_CLASS_1] = context_ptr->md_stage_2_count[CAND_CLASS_1] * 2;
    1652             :     }
    1653             : 
    1654      811174 :     if (!context_ptr->combine_class12 && picture_control_set_ptr->parent_pcs_ptr->sc_content_detected && picture_control_set_ptr->enc_mode == ENC_M0) {
    1655           0 :         context_ptr->md_stage_2_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? 10 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 8 : 4;
    1656             : #if REMOVE_MD_STAGE_1
    1657           0 :         context_ptr->md_stage_2_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 12 : 6;
    1658           0 :         context_ptr->md_stage_2_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 12 : 6;
    1659           0 :         context_ptr->md_stage_2_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 12 : 6;
    1660             : #else
    1661             :         context_ptr->md_stage_2_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 8 : 4;
    1662             :         context_ptr->md_stage_2_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 8 : 4;
    1663             :         context_ptr->md_stage_2_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 8 : 4;
    1664             : #endif
    1665             :     }
    1666             : 
    1667      811174 :     if (picture_control_set_ptr->enc_mode >= ENC_M1)
    1668       75644 :         context_ptr->md_stage_2_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? 10 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 4 : 1;
    1669             : 
    1670      811174 :     if (picture_control_set_ptr->enc_mode >= ENC_M2 && picture_control_set_ptr->enc_mode <= ENC_M4) {
    1671           0 :         context_ptr->md_stage_2_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 4 : 2;
    1672           0 :         context_ptr->md_stage_2_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1673           0 :         if (!context_ptr->combine_class12) {
    1674           0 :             context_ptr->md_stage_2_count[CAND_CLASS_1] = context_ptr->md_stage_2_count[CAND_CLASS_1] / 2;
    1675           0 :             context_ptr->md_stage_2_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1676             :         }
    1677             :     }
    1678      811174 :     else if (picture_control_set_ptr->enc_mode >= ENC_M5) {
    1679       75643 :         if (picture_control_set_ptr->enc_mode <= ENC_M6) {
    1680           0 :             context_ptr->md_stage_1_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? 8 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 1;
    1681             : 
    1682           0 :             if (context_ptr->combine_class12) {
    1683           0 :                 context_ptr->md_stage_1_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 5 : 3;
    1684           0 :                 context_ptr->md_stage_1_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 1 : 1;
    1685             : 
    1686             :             }
    1687             :             else {
    1688             : 
    1689           0 :                 context_ptr->md_stage_1_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1690           0 :                 context_ptr->md_stage_1_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1691           0 :                 context_ptr->md_stage_1_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 1 : 1;
    1692             :             }
    1693             : 
    1694           0 :             context_ptr->md_stage_2_count[CAND_CLASS_0] = context_ptr->md_stage_1_count[CAND_CLASS_0];
    1695           0 :             context_ptr->md_stage_2_count[CAND_CLASS_1] = context_ptr->md_stage_1_count[CAND_CLASS_1];
    1696           0 :             context_ptr->md_stage_2_count[CAND_CLASS_2] = context_ptr->md_stage_1_count[CAND_CLASS_2];
    1697           0 :             if (!context_ptr->combine_class12)
    1698           0 :                 context_ptr->md_stage_2_count[CAND_CLASS_3] = context_ptr->md_stage_1_count[CAND_CLASS_3];
    1699             :         }
    1700             :         else {
    1701       75643 :             context_ptr->md_stage_1_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? 6 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1702       75643 :             if (context_ptr->combine_class12) {
    1703       75634 :                 context_ptr->md_stage_1_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 4 : 2;
    1704       75634 :                 context_ptr->md_stage_1_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 1 : 1;
    1705             : 
    1706             :             }
    1707             :             else {
    1708             : 
    1709           9 :                 context_ptr->md_stage_1_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1710           9 :                 context_ptr->md_stage_1_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1711           9 :                 context_ptr->md_stage_1_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 1 : 1;
    1712             :             }
    1713       75643 :             context_ptr->md_stage_2_count[CAND_CLASS_0] = context_ptr->md_stage_1_count[CAND_CLASS_0];
    1714       75643 :             context_ptr->md_stage_2_count[CAND_CLASS_1] = context_ptr->md_stage_1_count[CAND_CLASS_1];
    1715       75643 :             context_ptr->md_stage_2_count[CAND_CLASS_2] = context_ptr->md_stage_1_count[CAND_CLASS_2];
    1716       75643 :             if (!context_ptr->combine_class12)
    1717           0 :                 context_ptr->md_stage_2_count[CAND_CLASS_3] = context_ptr->md_stage_1_count[CAND_CLASS_3];
    1718             :         }
    1719             :     }
    1720             : 
    1721             :     // Step 3: update count for md_stage_1 and md_stage_2 if bypassed (no NIC setting should be done beyond this point)
    1722      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_0] = context_ptr->bypass_md_stage_1[CAND_CLASS_0] ? context_ptr->md_stage_1_count[CAND_CLASS_0] : context_ptr->md_stage_2_count[CAND_CLASS_0];
    1723      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_1] = context_ptr->bypass_md_stage_1[CAND_CLASS_1] ? context_ptr->md_stage_1_count[CAND_CLASS_1] : context_ptr->md_stage_2_count[CAND_CLASS_1];
    1724      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_2] = context_ptr->bypass_md_stage_1[CAND_CLASS_2] ? context_ptr->md_stage_1_count[CAND_CLASS_2] : context_ptr->md_stage_2_count[CAND_CLASS_2];
    1725      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_3] = context_ptr->bypass_md_stage_1[CAND_CLASS_3] ? context_ptr->md_stage_1_count[CAND_CLASS_3] : context_ptr->md_stage_2_count[CAND_CLASS_3];
    1726      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_4] = context_ptr->bypass_md_stage_1[CAND_CLASS_4] ? context_ptr->md_stage_1_count[CAND_CLASS_4] : context_ptr->md_stage_2_count[CAND_CLASS_4];
    1727      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_5] = context_ptr->bypass_md_stage_1[CAND_CLASS_5] ? context_ptr->md_stage_1_count[CAND_CLASS_5] : context_ptr->md_stage_2_count[CAND_CLASS_5];
    1728      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_6] = context_ptr->bypass_md_stage_1[CAND_CLASS_6] ? context_ptr->md_stage_1_count[CAND_CLASS_6] : context_ptr->md_stage_2_count[CAND_CLASS_6];
    1729      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_8] = context_ptr->bypass_md_stage_1[CAND_CLASS_8] ? context_ptr->md_stage_1_count[CAND_CLASS_8] : context_ptr->md_stage_2_count[CAND_CLASS_8];
    1730             : 
    1731             : 
    1732             : #if PAL_CLASS
    1733             :     //TODO: use actual number of stages on the setting section and update using the following logic.
    1734             :     // stage1_cand_count[CAND_CLASS_i] = bypass_stage1 ? stage2_cand_count[CAND_CLASS_i] : stage1_cand_count[CAND_CLASS_i];
    1735      811174 :     context_ptr->md_stage_2_count[CAND_CLASS_7] = context_ptr->bypass_md_stage_1[CAND_CLASS_7] ? context_ptr->md_stage_1_count[CAND_CLASS_7] : context_ptr->md_stage_2_count[CAND_CLASS_7];
    1736             : #endif
    1737             : 
    1738             :     // Step 4: zero-out count for CAND_CLASS_3 if CAND_CLASS_1 and CAND_CLASS_2 are merged (i.e. shift to the left)
    1739      811174 :     if (context_ptr->combine_class12)
    1740       75646 :         context_ptr->md_stage_1_count[CAND_CLASS_3] = context_ptr->md_stage_2_count[CAND_CLASS_3] = 0;
    1741      811174 : }
    1742             : #else
    1743             : void set_md_stage_counts(
    1744             :     PictureControlSet       *picture_control_set_ptr,
    1745             :     ModeDecisionContext     *context_ptr,
    1746             :     uint32_t                 fastCandidateTotalCount)
    1747             : {
    1748             :     SequenceControlSet* scs = (SequenceControlSet*)(picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr);
    1749             :     // Step 0: derive bypass_stage1 flags
    1750             :     if (context_ptr->md_staging_mode) {
    1751             :         context_ptr->bypass_stage1[CAND_CLASS_0] = EB_TRUE;
    1752             : #if FILTER_INTRA_FLAG
    1753             :         context_ptr->bypass_stage1[CAND_CLASS_6] = EB_TRUE;
    1754             : #endif
    1755             : #if PAL_CLASS
    1756             :         context_ptr->bypass_stage1[CAND_CLASS_7] = EB_TRUE;
    1757             : #endif
    1758             :         context_ptr->bypass_stage1[CAND_CLASS_1] = EB_FALSE;
    1759             :         context_ptr->bypass_stage1[CAND_CLASS_2] = EB_FALSE;
    1760             :         context_ptr->bypass_stage1[CAND_CLASS_3] = context_ptr->combine_class12 ? EB_TRUE : EB_FALSE;
    1761             : #if II_COMP_FLAG
    1762             :         context_ptr->bypass_stage1[CAND_CLASS_4] = EB_FALSE;
    1763             : #endif
    1764             : #if OBMC_FLAG
    1765             :         context_ptr->bypass_stage1[CAND_CLASS_5] = EB_FALSE;
    1766             : #endif
    1767             :         context_ptr->bypass_stage1[CAND_CLASS_8] = EB_FALSE;
    1768             :     }
    1769             :     else
    1770             :         memset(context_ptr->bypass_stage1, EB_TRUE, CAND_CLASS_TOTAL);
    1771             :     // Step 1: derive bypass_stage2 flags
    1772             :     if (context_ptr->md_staging_mode)
    1773             :     {
    1774             :         context_ptr->bypass_stage2[CAND_CLASS_0] = EB_FALSE;
    1775             : #if FILTER_INTRA_FLAG
    1776             :         context_ptr->bypass_stage2[CAND_CLASS_6] = EB_FALSE;
    1777             : #endif
    1778             : #if PAL_CLASS
    1779             :         context_ptr->bypass_stage2[CAND_CLASS_7] = EB_FALSE;
    1780             : #endif
    1781             :         if (context_ptr->md_staging_mode == MD_STAGING_MODE_2 || context_ptr->md_staging_mode == MD_STAGING_MODE_3) {
    1782             :             context_ptr->bypass_stage2[CAND_CLASS_1] = EB_FALSE;
    1783             :             context_ptr->bypass_stage2[CAND_CLASS_2] = EB_FALSE;
    1784             :             context_ptr->bypass_stage2[CAND_CLASS_3] = context_ptr->combine_class12 ? EB_TRUE : EB_FALSE;
    1785             :         }
    1786             :         else {
    1787             :             context_ptr->bypass_stage2[CAND_CLASS_1] = EB_TRUE;
    1788             :             context_ptr->bypass_stage2[CAND_CLASS_2] = EB_TRUE;
    1789             :             context_ptr->bypass_stage2[CAND_CLASS_3] = EB_TRUE;
    1790             :         }
    1791             : #if II_COMP_FLAG
    1792             :             context_ptr->bypass_stage2[CAND_CLASS_4] = EB_TRUE;
    1793             : #endif
    1794             : #if OBMC_FLAG
    1795             :             context_ptr->bypass_stage2[CAND_CLASS_5] = EB_TRUE;
    1796             : #endif
    1797             :         context_ptr->bypass_stage2[CAND_CLASS_8] = EB_TRUE;
    1798             :     }
    1799             :     else
    1800             :         memset(context_ptr->bypass_stage2, EB_TRUE, CAND_CLASS_TOTAL);
    1801             :     // Step 2: set md_stage count
    1802             :     context_ptr->md_stage_1_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? fastCandidateTotalCount : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? INTRA_NFL : (INTRA_NFL >> 1);
    1803             : #if FILTER_INTRA_FLAG
    1804             :     context_ptr->md_stage_1_count[CAND_CLASS_6] = 5;
    1805             : #endif
    1806             : #if PAL_CLASS
    1807             :     context_ptr->md_stage_1_count[CAND_CLASS_7] = 14;
    1808             : #endif
    1809             :     context_ptr->md_stage_1_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? INTER_NEW_NFL : (INTER_NEW_NFL >> 1);
    1810             :     context_ptr->md_stage_1_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? INTER_PRED_NFL : (INTER_PRED_NFL >> 1);
    1811             : 
    1812             :     if (context_ptr->combine_class12) {
    1813             :         context_ptr->md_stage_1_count[CAND_CLASS_1] = context_ptr->md_stage_1_count[CAND_CLASS_1] * 2;
    1814             :     }
    1815             :     else {
    1816             :         context_ptr->md_stage_1_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? INTER_PRED_NFL : (INTER_PRED_NFL >> 1);
    1817             :     }
    1818             : 
    1819             : #if II_COMP_FLAG
    1820             :         context_ptr->md_stage_1_count[CAND_CLASS_4] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 14 :6;// INTER_PRED_NFL: (INTER_PRED_NFL >> 1);
    1821             : #endif
    1822             : #if OBMC_FLAG
    1823             :     if (picture_control_set_ptr->parent_pcs_ptr->pic_obmc_mode == 1)
    1824             :         context_ptr->md_stage_1_count[CAND_CLASS_5] = 16 ;
    1825             :     else if (picture_control_set_ptr->parent_pcs_ptr->pic_obmc_mode <= 3)
    1826             :         context_ptr->md_stage_1_count[CAND_CLASS_5] =  (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 12 : 4;
    1827             :     else
    1828             :         context_ptr->md_stage_1_count[CAND_CLASS_5] =   (picture_control_set_ptr->temporal_layer_index == 0 ) ? 12 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 8: 4;
    1829             : #endif
    1830             :     context_ptr->md_stage_1_count[CAND_CLASS_8] = (picture_control_set_ptr->temporal_layer_index == 0) ? 5 : 4;
    1831             :     if (picture_control_set_ptr->enc_mode >= ENC_M2) {
    1832             :         context_ptr->md_stage_1_count[CAND_CLASS_1] = context_ptr->md_stage_1_count[CAND_CLASS_1] / 2;
    1833             :         context_ptr->md_stage_1_count[CAND_CLASS_2] = context_ptr->md_stage_1_count[CAND_CLASS_2] / 2;
    1834             : 
    1835             :         if (!context_ptr->combine_class12)
    1836             :             context_ptr->md_stage_1_count[CAND_CLASS_3] = context_ptr->md_stage_1_count[CAND_CLASS_3] / 2;
    1837             :     }
    1838             : 
    1839             :     context_ptr->md_stage_2_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? fastCandidateTotalCount : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? INTRA_NFL : (INTRA_NFL >> 1);
    1840             : #if FILTER_INTRA_FLAG
    1841             :     context_ptr->md_stage_2_count[CAND_CLASS_6] =  5;
    1842             : #endif
    1843             : #if PAL_CLASS
    1844             :     context_ptr->md_stage_2_count[CAND_CLASS_7] = 14;// context_ptr->bypass_stage1[CAND_CLASS_7] ? context_ptr->md_stage_1_count[CAND_CLASS_7] : 14;
    1845             : #endif
    1846             :     context_ptr->md_stage_2_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 14 : 4;
    1847             :     context_ptr->md_stage_2_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 14 : 4;
    1848             : 
    1849             :     if (context_ptr->combine_class12) {
    1850             :         context_ptr->md_stage_2_count[CAND_CLASS_1] = context_ptr->md_stage_2_count[CAND_CLASS_1] * 2;
    1851             :     }
    1852             :     else {
    1853             :         context_ptr->md_stage_2_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 14 : 4;
    1854             :     }
    1855             : 
    1856             : #if II_COMP_FLAG
    1857             :     context_ptr->md_stage_2_count[CAND_CLASS_4] =  (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 12: 4;// 14 : 4;
    1858             : #endif
    1859             : #if OBMC_FLAG
    1860             :     context_ptr->md_stage_2_count[CAND_CLASS_5] =  (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : 16;
    1861             : #endif
    1862             :     context_ptr->md_stage_2_count[CAND_CLASS_8] = (picture_control_set_ptr->temporal_layer_index == 0) ? 5 : 4;
    1863             : 
    1864             : 
    1865             :     if (picture_control_set_ptr->enc_mode >= ENC_M2) {
    1866             :         context_ptr->md_stage_2_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 12 : 3;
    1867             :         context_ptr->md_stage_2_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 4 : 2;
    1868             :         if (!context_ptr->combine_class12) {
    1869             :             context_ptr->md_stage_2_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 6 : 2;
    1870             :             context_ptr->md_stage_2_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 6 : 2;
    1871             :             context_ptr->md_stage_2_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 4 : 2;
    1872             :         }
    1873             :     }
    1874             : 
    1875             :     if (picture_control_set_ptr->enc_mode >= ENC_M1)
    1876             :         context_ptr->md_stage_3_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? 10 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 4 : 1;
    1877             :     else
    1878             :         context_ptr->md_stage_3_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? 10 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? ((scs->input_resolution >= INPUT_SIZE_1080i_RANGE) ? 7 : 10) : 4;
    1879             : #if FILTER_INTRA_FLAG
    1880             :     context_ptr->md_stage_3_count[CAND_CLASS_6] = (picture_control_set_ptr->temporal_layer_index == 0) ? 5 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 2;
    1881             :     context_ptr->md_stage_3_count[CAND_CLASS_6] = (picture_control_set_ptr->temporal_layer_index == 0) ? 5 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 2;
    1882             : #endif
    1883             : #if PAL_CLASS
    1884             :     if (picture_control_set_ptr->parent_pcs_ptr->palette_mode == 1)
    1885             :         context_ptr->md_stage_3_count[CAND_CLASS_7] =
    1886             :         (picture_control_set_ptr->temporal_layer_index == 0) ? 7 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 4 : 4;
    1887             :     else if (picture_control_set_ptr->parent_pcs_ptr->palette_mode == 2 || picture_control_set_ptr->parent_pcs_ptr->palette_mode == 3)
    1888             :         context_ptr->md_stage_3_count[CAND_CLASS_7] =
    1889             :         (picture_control_set_ptr->temporal_layer_index == 0) ? 7 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 2;
    1890             :     else if (picture_control_set_ptr->parent_pcs_ptr->palette_mode == 4 || picture_control_set_ptr->parent_pcs_ptr->palette_mode == 5)
    1891             :         context_ptr->md_stage_3_count[CAND_CLASS_7] =
    1892             :         (picture_control_set_ptr->temporal_layer_index == 0) ? 4 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1893             :     else
    1894             :         context_ptr->md_stage_3_count[CAND_CLASS_7] =
    1895             :         (picture_control_set_ptr->temporal_layer_index == 0) ? 2 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 1 : 1;
    1896             : #endif
    1897             : 
    1898             :     context_ptr->md_stage_3_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 1;
    1899             :     context_ptr->md_stage_3_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 1;
    1900             : 
    1901             :     if (context_ptr->combine_class12) {
    1902             :         context_ptr->md_stage_3_count[CAND_CLASS_1] = context_ptr->md_stage_3_count[CAND_CLASS_1] * 2;
    1903             :     }
    1904             :     else {
    1905             :         context_ptr->md_stage_3_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 1;
    1906             :     }
    1907             : 
    1908             : #if II_COMP_FLAG
    1909             :     context_ptr->md_stage_3_count[CAND_CLASS_4] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 12 : 4;// 14 : 4;
    1910             : #endif
    1911             : #if OBMC_FLAG
    1912             :     if (picture_control_set_ptr->parent_pcs_ptr->pic_obmc_mode == 1)
    1913             :         context_ptr->md_stage_3_count[CAND_CLASS_5] = 16 ;
    1914             :     else if (picture_control_set_ptr->parent_pcs_ptr->pic_obmc_mode <= 3)
    1915             :         context_ptr->md_stage_3_count[CAND_CLASS_5] =  (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 12 : 4;
    1916             :     else
    1917             :         context_ptr->md_stage_3_count[CAND_CLASS_5] =   (picture_control_set_ptr->temporal_layer_index == 0 ) ? 12 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 8: 4;
    1918             : #endif
    1919             :     context_ptr->md_stage_3_count[CAND_CLASS_8] = (picture_control_set_ptr->temporal_layer_index == 0) ? 4 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 2;
    1920             : 
    1921             :     if (!context_ptr->combine_class12 && picture_control_set_ptr->parent_pcs_ptr->sc_content_detected && picture_control_set_ptr->enc_mode == ENC_M0) {
    1922             : 
    1923             :         context_ptr->md_stage_2_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ?  0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 16 : 8;
    1924             :         context_ptr->md_stage_2_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ?  0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 16 : 8;
    1925             :         context_ptr->md_stage_2_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ?  0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 16 : 8;
    1926             : 
    1927             :         context_ptr->md_stage_3_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? 10 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ?  8 : 4;
    1928             :         context_ptr->md_stage_3_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ?  0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ?  8 : 4;
    1929             :         context_ptr->md_stage_3_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ?  0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ?  8 : 4;
    1930             :         context_ptr->md_stage_3_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ?  0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ?  8 : 4;
    1931             :     }
    1932             : 
    1933             :     if (picture_control_set_ptr->enc_mode >= ENC_M2 && picture_control_set_ptr->enc_mode <= ENC_M4) {
    1934             :         context_ptr->md_stage_3_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 4 : 2;
    1935             :         context_ptr->md_stage_3_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1936             : 
    1937             :         if (!context_ptr->combine_class12) {
    1938             :             context_ptr->md_stage_3_count[CAND_CLASS_1] = context_ptr->md_stage_3_count[CAND_CLASS_1] / 2;
    1939             :             context_ptr->md_stage_3_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1940             :         }
    1941             :     }
    1942             :     else if (picture_control_set_ptr->enc_mode >= ENC_M5) {
    1943             :         if (context_ptr->md_staging_mode == MD_STAGING_MODE_0 && picture_control_set_ptr->enc_mode <= ENC_M6) {
    1944             :             context_ptr->md_stage_1_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? 8 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 3 : 1;
    1945             : 
    1946             :             if (context_ptr->combine_class12) {
    1947             :                 context_ptr->md_stage_1_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 5 : 3;
    1948             :                 context_ptr->md_stage_1_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 1 : 1;
    1949             : 
    1950             :             }
    1951             :             else {
    1952             :                 context_ptr->md_stage_1_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1953             :                 context_ptr->md_stage_1_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1954             :                 context_ptr->md_stage_1_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 1 : 1;
    1955             :             }
    1956             : 
    1957             :             context_ptr->md_stage_2_count[CAND_CLASS_0] = context_ptr->md_stage_1_count[CAND_CLASS_0];
    1958             :             context_ptr->md_stage_2_count[CAND_CLASS_1] = context_ptr->md_stage_1_count[CAND_CLASS_1];
    1959             :             context_ptr->md_stage_2_count[CAND_CLASS_2] = context_ptr->md_stage_1_count[CAND_CLASS_2];
    1960             : 
    1961             :             if (!context_ptr->combine_class12)
    1962             :                 context_ptr->md_stage_2_count[CAND_CLASS_3] = context_ptr->md_stage_1_count[CAND_CLASS_3];
    1963             : 
    1964             :             context_ptr->md_stage_3_count[CAND_CLASS_0] = context_ptr->md_stage_2_count[CAND_CLASS_0];
    1965             :             context_ptr->md_stage_3_count[CAND_CLASS_1] = context_ptr->md_stage_2_count[CAND_CLASS_1];
    1966             :             context_ptr->md_stage_3_count[CAND_CLASS_2] = context_ptr->md_stage_2_count[CAND_CLASS_2];
    1967             :             if (!context_ptr->combine_class12)
    1968             :                 context_ptr->md_stage_3_count[CAND_CLASS_3] = context_ptr->md_stage_2_count[CAND_CLASS_3];
    1969             :         }
    1970             :         else {
    1971             :             context_ptr->md_stage_1_count[CAND_CLASS_0] = (picture_control_set_ptr->slice_type == I_SLICE) ? 6 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1972             : 
    1973             :             if (context_ptr->combine_class12) {
    1974             :                 context_ptr->md_stage_1_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 4 : 2;
    1975             :                 context_ptr->md_stage_1_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 1 : 1;
    1976             : 
    1977             :             }
    1978             :             else {
    1979             :                 context_ptr->md_stage_1_count[CAND_CLASS_1] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1980             :                 context_ptr->md_stage_1_count[CAND_CLASS_2] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 2 : 1;
    1981             :                 context_ptr->md_stage_1_count[CAND_CLASS_3] = (picture_control_set_ptr->slice_type == I_SLICE) ? 0 : (picture_control_set_ptr->parent_pcs_ptr->is_used_as_reference_flag) ? 1 : 1;
    1982             : 
    1983             :             }
    1984             : 
    1985             :             context_ptr->md_stage_2_count[CAND_CLASS_0] = context_ptr->md_stage_1_count[CAND_CLASS_0];
    1986             :             context_ptr->md_stage_2_count[CAND_CLASS_1] = context_ptr->md_stage_1_count[CAND_CLASS_1];
    1987             :             context_ptr->md_stage_2_count[CAND_CLASS_2] = context_ptr->md_stage_1_count[CAND_CLASS_2];
    1988             : 
    1989             :             if (!context_ptr->combine_class12)
    1990             :                 context_ptr->md_stage_2_count[CAND_CLASS_3] = context_ptr->md_stage_1_count[CAND_CLASS_3];
    1991             : 
    1992             :             context_ptr->md_stage_3_count[CAND_CLASS_0] = context_ptr->md_stage_2_count[CAND_CLASS_0];
    1993             :             context_ptr->md_stage_3_count[CAND_CLASS_1] = context_ptr->md_stage_2_count[CAND_CLASS_1];
    1994             :             context_ptr->md_stage_3_count[CAND_CLASS_2] = context_ptr->md_stage_2_count[CAND_CLASS_2];
    1995             : 
    1996             :             if (!context_ptr->combine_class12)
    1997             :                 context_ptr->md_stage_3_count[CAND_CLASS_3] = context_ptr->md_stage_2_count[CAND_CLASS_3];
    1998             :         }
    1999             :     }
    2000             : 
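    2001             :     // Step 3: propagate counts through bypassed stages: md_stage_2 takes the md_stage_1 count when stage 1 is bypassed, and md_stage_3 takes the md_stage_2 count when stage 2 is bypassed (no NIC setting should be done beyond this point)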
    2002             :     context_ptr->md_stage_2_count[CAND_CLASS_0] = context_ptr->bypass_stage1[CAND_CLASS_0] ? context_ptr->md_stage_1_count[CAND_CLASS_0] : context_ptr->md_stage_2_count[CAND_CLASS_0];
    2003             :     context_ptr->md_stage_2_count[CAND_CLASS_1] = context_ptr->bypass_stage1[CAND_CLASS_1] ? context_ptr->md_stage_1_count[CAND_CLASS_1] : context_ptr->md_stage_2_count[CAND_CLASS_1];
    2004             :     context_ptr->md_stage_2_count[CAND_CLASS_2] = context_ptr->bypass_stage1[CAND_CLASS_2] ? context_ptr->md_stage_1_count[CAND_CLASS_2] : context_ptr->md_stage_2_count[CAND_CLASS_2];
    2005             :     context_ptr->md_stage_2_count[CAND_CLASS_3] = context_ptr->bypass_stage1[CAND_CLASS_3] ? context_ptr->md_stage_1_count[CAND_CLASS_3] : context_ptr->md_stage_2_count[CAND_CLASS_3];
    2006             : 
    2007             :     context_ptr->md_stage_3_count[CAND_CLASS_0] = context_ptr->bypass_stage2[CAND_CLASS_0] ? context_ptr->md_stage_2_count[CAND_CLASS_0] : context_ptr->md_stage_3_count[CAND_CLASS_0];
    2008             :     context_ptr->md_stage_3_count[CAND_CLASS_1] = context_ptr->bypass_stage2[CAND_CLASS_1] ? context_ptr->md_stage_2_count[CAND_CLASS_1] : context_ptr->md_stage_3_count[CAND_CLASS_1];
    2009             :     context_ptr->md_stage_3_count[CAND_CLASS_2] = context_ptr->bypass_stage2[CAND_CLASS_2] ? context_ptr->md_stage_2_count[CAND_CLASS_2] : context_ptr->md_stage_3_count[CAND_CLASS_2];
    2010             :     context_ptr->md_stage_3_count[CAND_CLASS_3] = context_ptr->bypass_stage2[CAND_CLASS_3] ? context_ptr->md_stage_2_count[CAND_CLASS_3] : context_ptr->md_stage_3_count[CAND_CLASS_3];
    2011             :    //TODO: use actual number of stages on the setting section and update using the following logic.
    2012             :    // stage2_cand_count[CAND_CLASS_i] = bypass_stage2 ? stage3_cand_count[CAND_CLASS_i] : stage2_cand_count[CAND_CLASS_i];
    2013             :    // stage1_cand_count[CAND_CLASS_i] = bypass_stage1 ? stage2_cand_count[CAND_CLASS_i] : stage1_cand_count[CAND_CLASS_i];
    2014             : 
    2015             : 
    2016             : #if PAL_CLASS  //THIS SHOULD BE REMOVED AFTER REBASE
    2017             :     context_ptr->md_stage_2_count[CAND_CLASS_7] = context_ptr->bypass_stage1[CAND_CLASS_7] ? context_ptr->md_stage_1_count[CAND_CLASS_7] : context_ptr->md_stage_2_count[CAND_CLASS_7];
    2018             :     context_ptr->md_stage_3_count[CAND_CLASS_7] = context_ptr->bypass_stage2[CAND_CLASS_7] ? context_ptr->md_stage_2_count[CAND_CLASS_7] : context_ptr->md_stage_3_count[CAND_CLASS_7];
    2019             : #endif
    2020             : 
    2021             :     // Step 4: zero-out count for CAND_CLASS_3 if CAND_CLASS_1 and CAND_CLASS_2 are merged (i.e. shift to the left)
    2022             :     if (context_ptr->combine_class12)
    2023             :         context_ptr->md_stage_1_count[CAND_CLASS_3] = context_ptr->md_stage_2_count[CAND_CLASS_3] = context_ptr->md_stage_3_count[CAND_CLASS_3] = 0;
    2024             : }
    2025             : #endif
    2026     3305680 : void sort_stage0_fast_candidates(
    2027             :     struct ModeDecisionContext   *context_ptr,
    2028             :     uint32_t                      input_buffer_start_idx,
    2029             :     uint32_t                      input_buffer_count,  // number of candidate buffers to sort; one of the buffers can have the max cost.
    2030             :     uint32_t                     *cand_buff_indices
    2031             : )
    2032             : {
    2033     3305680 :     ModeDecisionCandidateBuffer **buffer_ptr_array = context_ptr->candidate_buffer_ptr_array;
    2034             : #if !SPEED_OPT
    2035             :     // Fill cand_buff_indices with the surviving buffer indices; move the scratch candidates (MAX_CU_COST), if any, to the last spots
    2036             :     uint32_t ordered_start_idx = 0;
    2037             :     uint32_t ordered_end_idx = input_buffer_count - 1;
    2038             : #endif
    2039             : 
    2040     3305680 :     uint32_t input_buffer_end_idx = input_buffer_start_idx + input_buffer_count - 1;
    2041             : #if SPEED_OPT
    2042             :     uint32_t buffer_index, i, j;
    2043     3305680 :     uint32_t k = 0;
    2044    39270100 :     for (buffer_index = input_buffer_start_idx; buffer_index <= input_buffer_end_idx; buffer_index++, k++) {
    2045    35964400 :         cand_buff_indices[k] = buffer_index;
    2046             :     }
    2047    35940500 :     for (i = 0; i < input_buffer_count - 1; ++i) {
    2048   297162000 :         for (j = i + 1; j < input_buffer_count; ++j) {
    2049   264527000 :             if (*(buffer_ptr_array[cand_buff_indices[j]]->fast_cost_ptr) < *(buffer_ptr_array[cand_buff_indices[i]]->fast_cost_ptr)) {
    2050   136108000 :                 buffer_index = cand_buff_indices[i];
    2051   136108000 :                 cand_buff_indices[i] = (uint32_t)cand_buff_indices[j];
    2052   136108000 :                 cand_buff_indices[j] = (uint32_t)buffer_index;
    2053             : 
    2054             :             }
    2055             :         }
    2056             :     }
    2057             : #else
    2058             :     for (uint32_t buffer_index = input_buffer_start_idx; buffer_index <= input_buffer_end_idx; buffer_index++) {
    2059             :         if (*(buffer_ptr_array[buffer_index]->fast_cost_ptr) == MAX_CU_COST)
    2060             :             cand_buff_indices[ordered_end_idx--] = buffer_index;
    2061             :         else
    2062             :             cand_buff_indices[ordered_start_idx++] = buffer_index;
    2063             :     }
    2064             : #endif
    2065     3305680 : }
    2066             : 
    2067   130202000 : static INLINE void heap_sort_stage_max_node_fast_cost_ptr(
    2068             :     ModeDecisionCandidateBuffer **buffer_ptr,
    2069             :     uint32_t* sort_index, uint32_t i, uint32_t num)
    2070             : {
    2071             :     uint32_t left, right, max;
    2072             : 
    2073             :     /* Loop for removing recursion. */
    2074   102009000 :     while (1) {
    2075   130202000 :         left = 2 * i;
    2076   130202000 :         right = 2 * i + 1;
    2077   130202000 :         max = i;
    2078             : 
    2079   130202000 :         if (left <= num && *(buffer_ptr[sort_index[left]]->fast_cost_ptr) >
    2080   108461000 :             *(buffer_ptr[sort_index[i]]->fast_cost_ptr)) {
    2081    79996400 :             max = left;
    2082             :         }
    2083             : 
    2084   130202000 :         if (right <= num && *(buffer_ptr[sort_index[right]]->fast_cost_ptr) >
    2085   106788000 :             *(buffer_ptr[sort_index[max]]->fast_cost_ptr)) {
    2086    58705100 :             max = right;
    2087             :         }
    2088             : 
    2089   130202000 :         if (max == i) {
    2090    28192800 :             break;
    2091             :         }
    2092             : 
    2093   102009000 :         uint32_t swap = sort_index[i];
    2094   102009000 :         sort_index[i] = sort_index[max];
    2095   102009000 :         sort_index[max] = swap;
    2096   102009000 :         i = max;
    2097             :     }
    2098    28192800 : }
    2099             : 
    2100    18736100 : static void qsort_stage_max_node_fast_cost_ptr(
    2101             :     ModeDecisionCandidateBuffer **buffer_ptr_array, uint32_t *dst,
    2102             :     uint32_t *a, uint32_t *b, int num)
    2103             : {
    2104    18736100 :     if (num < 4) {
    2105    10058500 :         if (num < 2) {
    2106     6666630 :             if (num) {
    2107             :                 //num = 1
    2108     1643240 :                 dst[0] = a[0];
    2109             :             }
    2110     6666630 :             return;
    2111             :         }
    2112     3391900 :         if (num > 2) {
    2113             :             //num = 3
    2114     2041850 :             uint32_t tmp_a = a[0];
    2115     2041850 :             uint32_t tmp_b = a[1];
    2116     2041850 :             uint32_t tmp_c = a[2];
    2117     2041850 :             uint64_t val_a = *(buffer_ptr_array[tmp_a]->fast_cost_ptr);
    2118     2041850 :             uint64_t val_b = *(buffer_ptr_array[tmp_b]->fast_cost_ptr);
    2119     2041850 :             uint64_t val_c = *(buffer_ptr_array[tmp_c]->fast_cost_ptr);
    2120             : 
    2121     2041850 :             if (val_a < val_b) {
    2122      629721 :                 if (val_b < val_c) {
    2123             :                     //Sorted abc
    2124      347518 :                     dst[0] = tmp_a;
    2125      347518 :                     dst[1] = tmp_b;
    2126      347518 :                     dst[2] = tmp_c;
    2127             :                 }
    2128             :                 else {
    2129             :                     //xcx
    2130      282203 :                     if (val_a < val_c) {
    2131             :                         //Sorted 132
    2132      128389 :                         dst[0] = tmp_a;
    2133      128389 :                         dst[1] = tmp_c;
    2134      128389 :                         dst[2] = tmp_b;
    2135             :                     }
    2136             :                     else {
    2137             :                         //Sorted 231
    2138      153814 :                         dst[0] = tmp_c;
    2139      153814 :                         dst[1] = tmp_a;
    2140      153814 :                         dst[2] = tmp_b;
    2141             :                     }
    2142             :                 }
    2143             :             }
    2144             :             else {
    2145             :                 //a>b
    2146     1412130 :                 if (val_b > val_c) {
    2147             :                     //Sorted cba
    2148     1041830 :                     dst[0] = tmp_c;
    2149     1041830 :                     dst[1] = tmp_b;
    2150     1041830 :                     dst[2] = tmp_a;
    2151             :                 }
    2152             :                 else {
    2153             :                     //bxx
    2154      370301 :                     if (val_a < val_c) {
    2155             :                         //Sorted bac
    2156      148765 :                         dst[0] = tmp_b;
    2157      148765 :                         dst[1] = tmp_a;
    2158      148765 :                         dst[2] = tmp_c;
    2159             :                     }
    2160             :                     else {
    2161             :                         //Sorted bca
    2162      221536 :                         dst[0] = tmp_b;
    2163      221536 :                         dst[1] = tmp_c;
    2164      221536 :                         dst[2] = tmp_a;
    2165             :                     }
    2166             :                 }
    2167             :             }
    2168     2041850 :             return;
    2169             :         }
    2170             : 
    2171             :         /* Because a and dst can point to the same array, copy the values into temporaries first. */
    2172     1350050 :         uint32_t tmp_a = a[0];
    2173     1350050 :         uint32_t tmp_b = a[1];
    2174     1350050 :         if (*(buffer_ptr_array[tmp_a]->fast_cost_ptr) < *(buffer_ptr_array[tmp_b]->fast_cost_ptr)) {
    2175      822796 :             dst[0] = tmp_a;
    2176      822796 :             dst[1] = tmp_b;
    2177             :         }
    2178             :         else {
    2179      527256 :             dst[0] = tmp_b;
    2180      527256 :             dst[1] = tmp_a;
    2181             :         }
    2182     1350050 :         return;
    2183             :     }
    2184             : 
    2185     8677560 :     int sorted_down = 0;
    2186     8677560 :     int sorted_up = num - 1;
    2187             : 
    2188     8677560 :     uint64_t pivot_val = *(buffer_ptr_array[a[0]]->fast_cost_ptr);
    2189   100090000 :     for (int i = 1; i < num; ++i) {
    2190    91412700 :         if (pivot_val < *(buffer_ptr_array[a[i]]->fast_cost_ptr)) {
    2191    33337100 :             b[sorted_up] = a[i];
    2192    33337100 :             sorted_up--;
    2193             :         }
    2194             :         else {
    2195    58075700 :             b[sorted_down] = a[i];
    2196    58075700 :             sorted_down++;
    2197             :         }
    2198             :     }
    2199             : 
    2200     8677560 :     dst[sorted_down] = a[0];
    2201             : 
    2202     8677560 :     qsort_stage_max_node_fast_cost_ptr(buffer_ptr_array, dst,
    2203             :         b, a, sorted_down);
    2204             : 
    2205     8681790 :     qsort_stage_max_node_fast_cost_ptr(buffer_ptr_array, dst + (sorted_down + 1),
    2206     8681790 :         b + (sorted_down + 1), a + (sorted_down + 1), num - (sorted_down)-1);
    2207             : }
    2208             : 
    2209     1622750 : static INLINE void sort_array_index_fast_cost_ptr(
    2210             :     ModeDecisionCandidateBuffer** buffer_ptr,
    2211             :     uint32_t* sort_index, uint32_t num)
    2212             : {
    2213     1622750 :     if (num <= 60) {
    2214             :         //For small arrays use 'quick sort': it works much faster on small arrays,
    2215             :         //but requires temporary memory.
    2216             :         uint32_t  sorted_tmp[60];
    2217     1377300 :         qsort_stage_max_node_fast_cost_ptr(buffer_ptr, sort_index, sort_index, sorted_tmp, num);
    2218     1377220 :         return;
    2219             :     }
    2220             : 
    2221             :     //For big arrays use 'heap sort', which does not need to allocate memory.
    2222             :     //For small arrays of fewer than 40 elements, heap sort works slower than 'insertion sort'.
    2223             :     uint32_t i;
    2224     9522190 :     for (i = (num - 1) / 2; i > 0; i--)
    2225             :     {
    2226     9276800 :         heap_sort_stage_max_node_fast_cost_ptr(
    2227             :             buffer_ptr, sort_index, i, num - 1);
    2228             :     }
    2229             : 
    2230      245390 :     heap_sort_stage_max_node_fast_cost_ptr(
    2231             :         buffer_ptr, sort_index, 0, num - 1);
    2232             : 
    2233    18929500 :     for (i = num - 1; i > 0; i--)
    2234             :     {
    2235    18684300 :         uint32_t swap = sort_index[i];
    2236    18684300 :         sort_index[i] = sort_index[0];
    2237    18684300 :         sort_index[0] = swap;
    2238    18684300 :         heap_sort_stage_max_node_fast_cost_ptr(
    2239             :             buffer_ptr, sort_index, 0, i - 1);
    2240             :     }
    2241             : }
    2242             : 
    2243             : #if FIX_SORTING_METHOD
    2244           0 : void sort_stage1_fast_candidates(
    2245             :     struct ModeDecisionContext   *context_ptr,
    2246             :     uint32_t                      num_of_cand_to_sort,
    2247             :     uint32_t                     *cand_buff_indices)
    2248             : {
    2249             :     uint32_t i, j, index;
    2250           0 :     ModeDecisionCandidateBuffer **buffer_ptr_array = context_ptr->candidate_buffer_ptr_array;
    2251             : 
    2252           0 :     for (i = 0; i < num_of_cand_to_sort - 1; ++i) {
    2253           0 :         for (j = i + 1; j < num_of_cand_to_sort; ++j) {
    2254           0 :             if (*(buffer_ptr_array[cand_buff_indices[j]]->fast_cost_ptr) < *(buffer_ptr_array[cand_buff_indices[i]]->fast_cost_ptr)) {
    2255           0 :                 index = cand_buff_indices[i];
    2256           0 :                 cand_buff_indices[i] = (uint32_t)cand_buff_indices[j];
    2257           0 :                 cand_buff_indices[j] = (uint32_t)index;
    2258             : 
    2259             :             }
    2260             :         }
    2261             :     }
    2262           0 : }
    2263             : #else
    2264             : void sort_stage1_fast_candidates(
    2265             :     struct ModeDecisionContext   *context_ptr,
    2266             :     uint32_t                      num_of_cand_to_sort,
    2267             :     uint32_t                     *cand_buff_indices)
    2268             : {
    2269             :     ModeDecisionCandidateBuffer **buffer_ptr_array = context_ptr->candidate_buffer_ptr_array;
    2270             : 
    2271             :     //sorted best: *(buffer_ptr_array[sorted_candidate_index_array[?]]->fast_cost_ptr)
    2272             :     sort_array_index_fast_cost_ptr(buffer_ptr_array,
    2273             :         cand_buff_indices, num_of_cand_to_sort);
    2274             : }
    2275             : #endif
    2276             : #if REMOVE_MD_STAGE_1
    2277     3160710 : void sort_stage1_candidates(
    2278             : #else
    2279             : void sort_stage2_candidates(
    2280             : #endif
    2281             :     struct ModeDecisionContext   *context_ptr,
    2282             :     uint32_t                      num_of_cand_to_sort,
    2283             :     uint32_t                     *cand_buff_indices)
    2284             : {
    2285             :     uint32_t i, j, index;
    2286     3160710 :     ModeDecisionCandidateBuffer **buffer_ptr_array = context_ptr->candidate_buffer_ptr_array;
    2287    33624900 :     for (i = 0; i < num_of_cand_to_sort - 1; ++i) {
    2288   272014000 :         for (j = i + 1; j < num_of_cand_to_sort; ++j) {
    2289   241550000 :             if (*(buffer_ptr_array[cand_buff_indices[j]]->full_cost_ptr) < *(buffer_ptr_array[cand_buff_indices[i]]->full_cost_ptr)) {
    2290    48515900 :                 index = cand_buff_indices[i];
    2291    48515900 :                 cand_buff_indices[i] = (uint32_t)cand_buff_indices[j];
    2292    48515900 :                 cand_buff_indices[j] = (uint32_t)index;
    2293             :             }
    2294             :         }
    2295             :     }
    2296     3160710 : }
    2297             : #if REMOVE_MD_STAGE_1
    2298      811398 : void construct_best_sorted_arrays_md_stage_1(
    2299             : #else
    2300             : void construct_best_sorted_arrays_md_stage_2(
    2301             : #endif
    2302             :     struct ModeDecisionContext   *context_ptr,
    2303             :     ModeDecisionCandidateBuffer **buffer_ptr_array,
    2304             :     uint32_t                      *best_candidate_index_array,
    2305             :     uint32_t                      *sorted_candidate_index_array,
    2306             :     uint64_t                       *ref_fast_cost
    2307             : )
    2308             : {
    2309             :     //best = union from all classes
    2310      811398 :     uint32_t best_candi = 0;
    2311     8113370 :     for (CAND_CLASS class_i = CAND_CLASS_0; class_i < CAND_CLASS_TOTAL; class_i++)
    2312             : #if REMOVE_MD_STAGE_1
    2313    41282500 :         for (uint32_t candi = 0; candi < context_ptr->md_stage_1_count[class_i]; candi++)
    2314             : #else
    2315             :         for (uint32_t candi = 0; candi < context_ptr->md_stage_2_count[class_i]; candi++)
    2316             : #endif
    2317    33980500 :             sorted_candidate_index_array[best_candi++] = context_ptr->cand_buff_indices[class_i][candi];
    2318             : 
    2319             : #if REMOVE_MD_STAGE_1
    2320      811398 :     assert(best_candi == context_ptr->md_stage_1_total_count);
    2321      811398 :     uint32_t fullReconCandidateCount = context_ptr->md_stage_1_total_count;
    2322             : #else
    2323             :     assert(best_candi == context_ptr->md_stage_2_total_count);
    2324             :     uint32_t fullReconCandidateCount = context_ptr->md_stage_2_total_count;
    2325             : #endif
    2326             : 
    2327             :     //sort best: inter, then intra
    2328             :     uint32_t i, id;
    2329      811398 :     uint32_t id_inter = 0;
    2330      811398 :     uint32_t id_intra = fullReconCandidateCount - 1;
    2331    34790200 :     for (i = 0; i < fullReconCandidateCount; ++i) {
    2332    33978800 :         id = sorted_candidate_index_array[i];
    2333    33978800 :         if (buffer_ptr_array[id]->candidate_ptr->type == INTER_MODE) {
    2334    28832400 :             best_candidate_index_array[id_inter++] = id;
    2335             :         }
    2336             :         else {
    2337     5146400 :             assert(buffer_ptr_array[id]->candidate_ptr->type == INTRA_MODE);
    2338     5146400 :             best_candidate_index_array[id_intra--] = id;
    2339             :         }
    2340             :     }
    2341             : 
    2342             :     //sorted best: *(buffer_ptr_array[sorted_candidate_index_array[?]]->fast_cost_ptr)
    2343      811398 :     sort_array_index_fast_cost_ptr(buffer_ptr_array,
    2344             :         sorted_candidate_index_array, fullReconCandidateCount);
    2345             : 
    2346             :     // tx search
    2347      811395 :     *ref_fast_cost = *(buffer_ptr_array[sorted_candidate_index_array[0]]->fast_cost_ptr);
    2348      811395 : }
    2349             : 
    2350             : #if REMOVE_MD_STAGE_1
    2351      811359 : void construct_best_sorted_arrays_md_stage_2(
    2352             : #else
    2353             : void construct_best_sorted_arrays_md_stage_3(
    2354             : #endif
    2355             :     struct ModeDecisionContext   *context_ptr,
    2356             :     ModeDecisionCandidateBuffer **buffer_ptr_array,
    2357             :     uint32_t                      *best_candidate_index_array,
    2358             :     uint32_t                      *sorted_candidate_index_array)
    2359             : {
    2360             : 
    2361             :     //best = union from all classes
    2362      811359 :     uint32_t best_candi = 0;
    2363     8113510 :     for (CAND_CLASS class_i = CAND_CLASS_0; class_i < CAND_CLASS_TOTAL; class_i++)
    2364             : #if REMOVE_MD_STAGE_1
    2365    11417000 :         for (uint32_t candi = 0; candi < context_ptr->md_stage_2_count[class_i]; candi++)
    2366             : #else
    2367             :         for (uint32_t candi = 0; candi < context_ptr->md_stage_3_count[class_i]; candi++)
    2368             : #endif
    2369     4114860 :             sorted_candidate_index_array[best_candi++] = context_ptr->cand_buff_indices[class_i][candi];
    2370             : 
    2371             : #if REMOVE_MD_STAGE_1
    2372      811359 :     assert(best_candi == context_ptr->md_stage_2_total_count);
    2373      811359 :     uint32_t fullReconCandidateCount = context_ptr->md_stage_2_total_count;
    2374             : #else
    2375             :     assert(best_candi == context_ptr->md_stage_3_total_count);
    2376             :     uint32_t fullReconCandidateCount = context_ptr->md_stage_3_total_count;
    2377             : #endif
    2378             :     //sort best: inter, then intra
    2379             :     uint32_t i, id;
    2380      811359 :     uint32_t id_inter = 0;
    2381      811359 :     uint32_t id_intra = fullReconCandidateCount - 1;
    2382     4926180 :     for (i = 0; i < fullReconCandidateCount; ++i) {
    2383     4114820 :         id = sorted_candidate_index_array[i];
    2384     4114820 :         if (buffer_ptr_array[id]->candidate_ptr->type == INTER_MODE) {
    2385     3242480 :             best_candidate_index_array[id_inter++] = id;
    2386             :         }
    2387             :         else {
    2388      872341 :             assert(buffer_ptr_array[id]->candidate_ptr->type == INTRA_MODE);
    2389      872341 :             best_candidate_index_array[id_intra--] = id;
    2390             :         }
    2391             :     }
    2392             : 
    2393             :     //sorted best: *(buffer_ptr_array[sorted_candidate_index_array[?]]->fast_cost_ptr)
    2394      811359 :     sort_array_index_fast_cost_ptr(buffer_ptr_array,
    2395             :         sorted_candidate_index_array, fullReconCandidateCount);
    2396      811415 : }
    2397             : 
    2398     3305790 : void md_stage_0(
    2399             : 
    2400             :     PictureControlSet            *picture_control_set_ptr,
    2401             :     ModeDecisionContext          *context_ptr,
    2402             :     ModeDecisionCandidateBuffer **candidate_buffer_ptr_array_base,
    2403             :     ModeDecisionCandidate        *fast_candidate_array,
    2404             :     int32_t                       fast_candidate_start_index,
    2405             :     int32_t                       fast_candidate_end_index,
    2406             :     EbPictureBufferDesc        *input_picture_ptr,
    2407             :     uint32_t                      inputOriginIndex,
    2408             :     uint32_t                      inputCbOriginIndex,
    2409             :     uint32_t                      inputCrOriginIndex,
    2410             :     CodingUnit                 *cu_ptr,
    2411             :     uint32_t                      cuOriginIndex,
    2412             :     uint32_t                      cuChromaOriginIndex,
    2413             :     uint32_t                      candidate_buffer_start_index,
    2414             :     uint32_t                      maxBuffers,
    2415             :     EbBool                        scratch_buffer_pesent_flag,
    2416             :     EbBool                        use_ssd)
    2417             : {
                           // md_stage_0: fast-cost pass over every entry of
                           // fast_candidate_array[fast_candidate_start_index .. fast_candidate_end_index]
                           // (both ends inclusive, visited from the end index downwards) whose
                           // cand_class equals context_ptr->target_class.
                           //
                           //  1. Writes the three md_staging_* switches on context_ptr
                           //     (skip_interpolation_search, skip_inter_chroma_pred, use_bilinear);
                           //     which expression is compiled depends on REMOVE_MD_STAGE_1,
                           //     FILTER_INTRA_FLAG and PAL_CLASS.
                           //  2. 1st loop: for candidates with distortion_ready set, derives a fast
                           //     cost from me_distortion and remembers the index of the cheapest one;
                           //     the buffer's fast cost is reset to MAX_CU_COST afterwards.
                           //  3. 2nd loop: binds each candidate to the buffer at highestCostIndex,
                           //     calls fast_loop_core() when the distortion is not ready or the
                           //     candidate is the winner of the 1st loop, then rescans
                           //     context_ptr->fast_cost_array over
                           //     [candidate_buffer_start_index, candidate_buffer_start_index + maxBuffers)
                           //     to pick the buffer the next candidate will overwrite.
                           //  4. When scratch_buffer_pesent_flag is set, the fast cost of the buffer
                           //     left at highestCostIndex is forced to MAX_CU_COST.
                           //
                           // The input/cu origin indices, input_picture_ptr, cu_ptr and use_ssd are
                           // only forwarded to fast_loop_core() (cu_ptr also to the fast-cost function).
    2418             :     int32_t  fastLoopCandidateIndex;
    2419             :     uint64_t lumaFastDistortion;
    2420             :     uint32_t highestCostIndex;
    2421             :     uint64_t highestCost;
    2422     3305790 :     uint64_t bestFirstFastCostSearchCandidateCost = MAX_CU_COST;
    2423     3305790 :     int32_t  bestFirstFastCostSearchCandidateIndex = INVALID_FAST_CANDIDATE_INDEX;
    2424             : 
    2425             : 
    2426             :     // Set MD Staging fast_loop_core settings
    2427             : #if REMOVE_MD_STAGE_1
    2428     3305790 :     context_ptr->md_staging_skip_interpolation_search = (context_ptr->md_staging_mode == MD_STAGING_MODE_1) ? EB_TRUE : picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level >= IT_SEARCH_FAST_LOOP_UV_BLIND ? EB_FALSE : EB_TRUE;
    2429             : #else
    2430             :     context_ptr->md_staging_skip_interpolation_search = (context_ptr->md_staging_mode) ? EB_TRUE : picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level >= IT_SEARCH_FAST_LOOP_UV_BLIND ? EB_FALSE : EB_TRUE;
    2431             : #endif
    2432             : #if FILTER_INTRA_FLAG
    2433             : #if REMOVE_MD_STAGE_1
    2434             : #if PAL_CLASS
    2435     9772310 :     context_ptr->md_staging_skip_inter_chroma_pred = (context_ptr->md_staging_mode == MD_STAGING_MODE_1 &&
    2436     3305790 :         context_ptr->target_class != CAND_CLASS_0 && context_ptr->target_class != CAND_CLASS_6 && context_ptr->target_class != CAND_CLASS_7) ? EB_TRUE : EB_FALSE;
    2437             : #else
    2438             :     context_ptr->md_staging_skip_inter_chroma_pred = (context_ptr->md_staging_mode == MD_STAGING_MODE_1 && context_ptr->target_class != CAND_CLASS_0 && context_ptr->target_class != CAND_CLASS_6) ? EB_TRUE : EB_FALSE;
    2439             : #endif
    2440             : #else
    2441             :     context_ptr->md_staging_skip_inter_chroma_pred = (context_ptr->md_staging_mode && context_ptr->md_stage == MD_STAGE_0 && context_ptr->target_class != CAND_CLASS_0 && context_ptr->target_class != CAND_CLASS_6) ? EB_TRUE : EB_FALSE;
    2442             : #endif
    2443             : #else
    2444             :     context_ptr->md_staging_skip_inter_chroma_pred = (context_ptr->md_staging_mode && context_ptr->md_stage == MD_STAGE_0 && context_ptr->target_class != CAND_CLASS_0) ? EB_TRUE : EB_FALSE;
    2445             : #endif
    2446             : #if REMOVE_MD_STAGE_1
    2447     3305790 :     context_ptr->md_staging_use_bilinear = (context_ptr->md_staging_mode == MD_STAGING_MODE_1) ? EB_TRUE : EB_FALSE;
    2448             : #else
    2449             :     context_ptr->md_staging_use_bilinear = (context_ptr->md_staging_mode) ? EB_TRUE : EB_FALSE;
    2450             : #endif
    2451             :     // 1st fast loop: src-to-src
                           // Every matching candidate is evaluated in the same buffer
                           // (candidate_buffer_start_index); only the best index/cost is kept.
    2452     3305790 :     fastLoopCandidateIndex = fast_candidate_end_index;
    2453   655418000 :     while (fastLoopCandidateIndex >= fast_candidate_start_index)
    2454             :     {
    2455   652112000 :         if (fast_candidate_array[fastLoopCandidateIndex].cand_class == context_ptr->target_class) {
    2456             :             // Set the Candidate Buffer
    2457   111535000 :             ModeDecisionCandidateBuffer   *candidate_buffer = candidate_buffer_ptr_array_base[candidate_buffer_start_index];
    2458   111535000 :             ModeDecisionCandidate         *candidate_ptr = candidate_buffer->candidate_ptr = &fast_candidate_array[fastLoopCandidateIndex];
    2459             :             // Initialize tx_depth
    2460   111535000 :             candidate_buffer->candidate_ptr->tx_depth = 0;
    2461             :             // Only check (src - src) candidates (Tier0 candidates)
    2462   111535000 :             if (candidate_ptr->distortion_ready) {
    2463             :                 // Distortion
    2464           0 :                 lumaFastDistortion = candidate_ptr->me_distortion;
    2465             : 
    2466             :                 // Fast Cost
    2467           0 :                 *(candidate_buffer->fast_cost_ptr) = Av1ProductFastCostFuncTable[candidate_ptr->type](
    2468             :                     cu_ptr,
    2469           0 :                     candidate_buffer->candidate_ptr,
    2470           0 :                     cu_ptr->qp,
    2471             :                     lumaFastDistortion,
    2472             :                     0,
    2473           0 :                     context_ptr->fast_lambda,
    2474             :                     0,
    2475             :                     picture_control_set_ptr,
    2476           0 :                     &(context_ptr->md_local_cu_unit[context_ptr->blk_geom->blkidx_mds].ed_ref_mv_stack[candidate_ptr->ref_frame_type][0]),
    2477             :                     context_ptr->blk_geom,
    2478           0 :                     context_ptr->cu_origin_y >> MI_SIZE_LOG2,
    2479           0 :                     context_ptr->cu_origin_x >> MI_SIZE_LOG2,
    2480             :                     1,
    2481           0 :                     context_ptr->intra_luma_left_mode,
    2482           0 :                     context_ptr->intra_luma_top_mode);
    2483             : 
    2484             :                 // Keep track of the candidate index of the best (src - src) candidate
                                       // ("<=" means that on a tie the candidate visited later, i.e. the lower index, wins)
    2485           0 :                 if (*(candidate_buffer->fast_cost_ptr) <= bestFirstFastCostSearchCandidateCost) {
    2486           0 :                     bestFirstFastCostSearchCandidateIndex = fastLoopCandidateIndex;
    2487           0 :                     bestFirstFastCostSearchCandidateCost = *(candidate_buffer->fast_cost_ptr);
    2488             :                 }
    2489             : 
    2490             :                 // Re-initialize Fast Cost so that it does not interact with the second Fast-Cost Search
    2491           0 :                 *(candidate_buffer->fast_cost_ptr) = MAX_CU_COST;
    2492             :             }
    2493             :         }
    2494   652112000 :         --fastLoopCandidateIndex;
    2495             :     }
    2496             : 
    2497             :     // 2nd fast loop: src-to-recon
                           // highestCostIndex is the buffer slot the current candidate is written to.
    2498     3305790 :     highestCostIndex = candidate_buffer_start_index;
    2499     3305790 :     fastLoopCandidateIndex = fast_candidate_end_index;
    2500   655014000 :     while (fastLoopCandidateIndex >= fast_candidate_start_index)
    2501             :     {
    2502   651790000 :         if (fast_candidate_array[fastLoopCandidateIndex].cand_class == context_ptr->target_class) {
    2503   111222000 :             ModeDecisionCandidateBuffer *candidate_buffer = candidate_buffer_ptr_array_base[highestCostIndex];
    2504   111222000 :             ModeDecisionCandidate       *candidate_ptr = candidate_buffer->candidate_ptr = &fast_candidate_array[fastLoopCandidateIndex];
    2505             :             // Initialize tx_depth
    2506   111222000 :             candidate_buffer->candidate_ptr->tx_depth = 0;
                                   // Run the full fast loop unless the candidate was already costed in the
                                   // 1st loop and lost there.
    2507   111222000 :             if (!candidate_ptr->distortion_ready || fastLoopCandidateIndex == bestFirstFastCostSearchCandidateIndex) {
    2508             : 
    2509             :                 // Prediction
    2510   111222000 :                 fast_loop_core(
    2511             :                     candidate_buffer,
    2512             :                     picture_control_set_ptr,
    2513             :                     context_ptr,
    2514             :                     input_picture_ptr,
    2515             :                     inputOriginIndex,
    2516             :                     inputCbOriginIndex,
    2517             :                     inputCrOriginIndex,
    2518             :                     cu_ptr,
    2519             :                     cuOriginIndex,
    2520             :                     cuChromaOriginIndex,
    2521             :                     use_ssd);
    2522             : 
    2523             :             }
    2524             : 
    2525             :             // Find the buffer with the highest cost
                                   // (skipped only for candidate index 0 when no scratch buffer is present)
    2526   111140000 :             if (fastLoopCandidateIndex || scratch_buffer_pesent_flag)
    2527             :             {
    2528             :                 // maxCost is volatile to prevent the compiler from loading 0xFFFFFFFFFFFFFF
    2529             :                 //   as a const at the early-out. Loading a large constant on intel x64 processors
    2530             :                 //   clogs the i-cache/instruction decode. This still reloads the variable from
    2531             :                 //   the stack each pass, so a better solution would be to register the variable,
    2532             :                 //   but this might require asm.
    2533   110676000 :                 volatile uint64_t maxCost = MAX_CU_COST;
    2534   110676000 :                 const uint64_t *fast_cost_array = context_ptr->fast_cost_array;
    2535   110676000 :                 const uint32_t bufferIndexStart = candidate_buffer_start_index;
    2536   110676000 :                 const uint32_t bufferIndexEnd = bufferIndexStart + maxBuffers;
    2537             :                 uint32_t bufferIndex;
    2538             : 
    2539   110676000 :                 highestCostIndex = bufferIndexStart;
    2540   110676000 :                 bufferIndex = bufferIndexStart + 1;
    2541             : 
                                       // The scan stops as soon as the current pick holds MAX_CU_COST
                                       // (presumably a slot that has not received a real cost yet - confirm).
                                       // NOTE(review): fast_cost_array[bufferIndex] is read before the
                                       // "< bufferIndexEnd" test, so with maxBuffers == 1 the first iteration
                                       // can read one element past the window; whether callers ever pass 1
                                       // is not visible here.
    2542             :                 do {
    2543  1325930000 :                     highestCost = fast_cost_array[highestCostIndex];
    2544  1325930000 :                     if (highestCost == maxCost)
    2545    29419900 :                         break;
    2546             : 
    2547  1296510000 :                     if (fast_cost_array[bufferIndex] > highestCost)
    2548   247814000 :                         highestCostIndex = bufferIndex;
    2549  1296510000 :                 } while (++bufferIndex < bufferIndexEnd);
    2550             :             }
    2551             :         }
    2552   651708000 :         --fastLoopCandidateIndex;
    2553             :     }
    2554             : 
    2555             :     // Set the cost of the scratch candidate to max to get discarded @ the sorting phase
                           // (without a scratch buffer the value is written back unchanged)
    2556     3223960 :     *(candidate_buffer_ptr_array_base[highestCostIndex]->fast_cost_ptr) = (scratch_buffer_pesent_flag) ?
    2557     3223960 :         MAX_CU_COST :
    2558     1320610 :         *(candidate_buffer_ptr_array_base[highestCostIndex]->fast_cost_ptr);
    2559     3223960 : }
    2560             : #if !REMOVE_MD_STAGE_1
    2561             : void md_stage_1(
    2562             :     PictureControlSet            *picture_control_set_ptr,
    2563             :     ModeDecisionContext          *context_ptr,
    2564             :     ModeDecisionCandidateBuffer **candidate_buffer_ptr_array_base,
    2565             :     uint32_t                      num_of_candidates,
    2566             :     EbPictureBufferDesc          *input_picture_ptr,
    2567             :     uint32_t                      inputOriginIndex,
    2568             :     uint32_t                      inputCbOriginIndex,
    2569             :     uint32_t                      inputCrOriginIndex,
    2570             :     CodingUnit                   *cu_ptr,
    2571             :     uint32_t                      cuOriginIndex,
    2572             :     uint32_t                      cuChromaOriginIndex,
    2573             :     EbBool                        use_ssd)
    2574             : {
                           // md_stage_1 (compiled only when REMOVE_MD_STAGE_1 is 0): re-runs
                           // fast_loop_core() on the first num_of_candidates buffers listed in
                           // context_ptr->cand_buff_indices[context_ptr->target_class].
                           // Unlike md_stage_0 it always enables inter chroma prediction and never
                           // uses bilinear filtering; interpolation search is skipped only for
                           // MD_STAGING_MODE_3 or when interpolation_search_level is below
                           // IT_SEARCH_FAST_LOOP_UV_BLIND.
                           // Candidates whose distortion_ready flag is set are left untouched apart
                           // from the tx_depth reset.
    2575             : 
    2576             :     // Set MD Staging fast_loop_core settings
    2577             :     context_ptr->md_staging_skip_interpolation_search = (context_ptr->md_staging_mode == MD_STAGING_MODE_3) ? EB_TRUE : picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level >= IT_SEARCH_FAST_LOOP_UV_BLIND ? EB_FALSE : EB_TRUE;
    2578             :     context_ptr->md_staging_skip_inter_chroma_pred = EB_FALSE;
    2579             :     context_ptr->md_staging_use_bilinear = EB_FALSE;
    2580             : 
    2581             :     for (uint32_t cand_idx = 0; cand_idx < num_of_candidates; ++cand_idx)
    2582             :     {
    2583             : 
                               // Map the per-class rank to the actual candidate buffer slot
    2584             :         uint32_t                        candidateIndex = context_ptr->cand_buff_indices[context_ptr->target_class][cand_idx];
    2585             :         ModeDecisionCandidateBuffer    *candidate_buffer = candidate_buffer_ptr_array_base[candidateIndex];
    2586             :         ModeDecisionCandidate          *candidate_ptr = candidate_buffer->candidate_ptr;
    2587             : 
    2588             :         // Initialize tx_depth
    2589             :         candidate_buffer->candidate_ptr->tx_depth = 0;
    2590             : 
    2591             :         if (!candidate_ptr->distortion_ready) {
    2592             : 
    2593             :             fast_loop_core(
    2594             :                 candidate_buffer,
    2595             :                 picture_control_set_ptr,
    2596             :                 context_ptr,
    2597             :                 input_picture_ptr,
    2598             :                 inputOriginIndex,
    2599             :                 inputCbOriginIndex,
    2600             :                 inputCrOriginIndex,
    2601             :                 cu_ptr,
    2602             :                 cuOriginIndex,
    2603             :                 cuChromaOriginIndex,
    2604             :                 use_ssd);
    2605             :         }
    2606             :     }
    2607             : }
    2608             : #endif
    2609     3078240 : void predictive_me_full_pel_search(
    2610             :     PictureControlSet        *picture_control_set_ptr,
    2611             :     ModeDecisionContext      *context_ptr,
    2612             :     EbPictureBufferDesc      *input_picture_ptr,
    2613             :     uint32_t                  inputOriginIndex,
    2614             :     EbBool                    use_ssd,
    2615             :     uint8_t                   list_idx,
    2616             :     int8_t                    ref_idx,
    2617             :     int16_t                   mvx,
    2618             :     int16_t                   mvy,
    2619             :     int16_t                   search_position_start_x,
    2620             :     int16_t                   search_position_end_x,
    2621             :     int16_t                   search_position_start_y,
    2622             :     int16_t                   search_position_end_y,
    2623             :     int16_t                   search_step,
    2624             :     int16_t                  *best_mvx,
    2625             :     int16_t                  *best_mvy,
    2626             :     uint32_t                 *best_distortion)
    2627             : {
                           // predictive_me_full_pel_search: exhaustive search around (mvx, mvy) in the
                           // luma plane of reference picture [list_idx][ref_idx].
                           // Every offset (x, y) with x in [search_position_start_x, search_position_end_x]
                           // and y in [search_position_start_y, search_position_end_y] (inclusive) is
                           // compared against the input block, using an SSD kernel when use_ssd is
                           // set and a SAD kernel otherwise.
                           //
                           // best_mvx / best_mvy / best_distortion are in/out: they are overwritten
                           // only when a position is strictly better than the incoming
                           // *best_distortion, so the caller must initialise *best_distortion.
                           // The reported MV is mvx + x * search_step (resp. mvy + y * search_step),
                           // while the sample offset uses (mv >> 3) + x directly - presumably MVs are
                           // in 1/8-pel units and search_step is 8; confirm against the caller.
                           //
                           // Side effect: candidate buffer [0][0] of context_ptr is re-pointed at
                           // fast_candidate_array[0].
    2628             :     uint32_t  distortion;
    2629     3078240 :     ModeDecisionCandidateBuffer  *candidate_buffer = &(context_ptr->candidate_buffer_ptr_array[0][0]);
    2630     3078240 :     candidate_buffer->candidate_ptr = &(context_ptr->fast_candidate_array[0]);
    2631             : 
                           // Pick the 16-bit or 8-bit reference picture to match hbd_mode_decision
    2632     3078240 :     EbReferenceObject *refObj = picture_control_set_ptr->ref_pic_ptr_array[list_idx][ref_idx]->object_ptr;
    2633     6156480 :     EbPictureBufferDesc *ref_pic = context_ptr->hbd_mode_decision ?
    2634     3078240 :         refObj->reference_picture16bit : refObj->reference_picture;
    2635    24600300 :     for (int32_t refinement_pos_x = search_position_start_x; refinement_pos_x <= search_position_end_x; ++refinement_pos_x) {
    2636   128933000 :         for (int32_t refinement_pos_y = search_position_start_y; refinement_pos_y <= search_position_end_y; ++refinement_pos_y) {
    2637             : 
                                   // Sample index of the displaced block inside the reference luma plane
    2638   107411000 :             uint32_t ref_origin_index = ref_pic->origin_x + (context_ptr->cu_origin_x + (mvx >> 3) + refinement_pos_x) + (context_ptr->cu_origin_y + (mvy >> 3) + ref_pic->origin_y + refinement_pos_y) * ref_pic->stride_y;
    2639   107411000 :             if (use_ssd) {
    2640   214823000 :                 EbSpatialFullDistType spatial_full_dist_type_fun = context_ptr->hbd_mode_decision ?
    2641   107411000 :                     full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    2642             : 
    2643   107394000 :                 distortion = (uint32_t) spatial_full_dist_type_fun(
    2644             :                     input_picture_ptr->buffer_y,
    2645             :                     inputOriginIndex,
    2646   107411000 :                     input_picture_ptr->stride_y,
    2647             :                     ref_pic->buffer_y,
    2648             :                     ref_origin_index,
    2649   107411000 :                     ref_pic->stride_y,
    2650   107411000 :                     context_ptr->blk_geom->bwidth,
    2651   107411000 :                     context_ptr->blk_geom->bheight);
    2652             :             }
    2653             :             else {
    2654           0 :                 assert((context_ptr->blk_geom->bwidth >> 3) < 17);
    2655             : 
    2656           0 :                 if (context_ptr->hbd_mode_decision) {
    2657           0 :                     distortion = sad_16b_kernel(
    2658           0 :                         ((uint16_t *)input_picture_ptr->buffer_y) + inputOriginIndex,
    2659           0 :                         input_picture_ptr->stride_y,
    2660           0 :                         ((uint16_t *)ref_pic->buffer_y) + ref_origin_index,
    2661           0 :                         ref_pic->stride_y,
    2662           0 :                         context_ptr->blk_geom->bheight,
    2663           0 :                         context_ptr->blk_geom->bwidth);
    2664             :                 } else {
    2665           0 :                     distortion = nxm_sad_kernel_sub_sampled(
    2666           0 :                         input_picture_ptr->buffer_y + inputOriginIndex,
    2667           0 :                         input_picture_ptr->stride_y,
    2668           0 :                         ref_pic->buffer_y + ref_origin_index,
    2669           0 :                         ref_pic->stride_y,
    2670           0 :                         context_ptr->blk_geom->bheight,
    2671           0 :                         context_ptr->blk_geom->bwidth);
    2672             :                 }
    2673             :             }
    2674             : 
                                   // Strict "<": on equal distortion the earlier position is kept
    2675   107398000 :             if (distortion < *best_distortion) {
    2676    19822500 :                 *best_mvx = mvx + (refinement_pos_x * search_step);
    2677    19822500 :                 *best_mvy = mvy + (refinement_pos_y * search_step);
    2678    19822500 :                 *best_distortion = distortion;
    2679             :             }
    2680             :         }
    2681             :     }
    2682     3064960 : }
    2683             : 
    2684     8170360 : void predictive_me_sub_pel_search(
    2685             :     PictureControlSet        *picture_control_set_ptr,
    2686             :     ModeDecisionContext      *context_ptr,
    2687             :     EbPictureBufferDesc      *input_picture_ptr,
    2688             :     uint32_t                  inputOriginIndex,
    2689             :     uint32_t                  cuOriginIndex,
    2690             :     EbBool                    use_ssd,
    2691             :     uint8_t                   list_idx,
    2692             :     int8_t                    ref_idx,
    2693             :     int16_t                   mvx,
    2694             :     int16_t                   mvy,
    2695             :     int16_t                   search_position_start_x,
    2696             :     int16_t                   search_position_end_x,
    2697             :     int16_t                   search_position_start_y,
    2698             :     int16_t                   search_position_end_y,
    2699             :     int16_t                   search_step,
    2700             :     int16_t                  *best_mvx,
    2701             :     int16_t                  *best_mvy,
    2702             :     uint32_t                 *best_distortion,
    2703             :     uint8_t                   search_pattern)
    2704             : {
                           // predictive_me_sub_pel_search: refines (mvx, mvy) by building an actual
                           // inter prediction for each tested MV and measuring its luma distortion
                           // against the input block.
                           // Tested MVs are (mvx + x * search_step, mvy + y * search_step) for x in
                           // [search_position_start_x, search_position_end_x] and y in
                           // [search_position_start_y, search_position_end_y] (inclusive); the
                           // centre (0, 0) is never tested.
                           // search_pattern restricts the positions further:
                           //   1 - positions with both x != 0 and y != 0 are skipped,
                           //   2 - only positions with y == 0 are tested,
                           //   3 - only positions with x == 0 are tested,
                           //   any other value - no extra restriction.
                           //
                           // best_mvx / best_mvy / best_distortion are in/out and updated only on a
                           // strictly smaller distortion; the caller must initialise *best_distortion.
                           //
                           // Side effects: candidate buffer [0][0] and fast_candidate_array[0] of
                           // context_ptr are used as scratch, and
                           // context_ptr->md_staging_skip_interpolation_search /
                           // md_staging_skip_inter_chroma_pred are set to EB_TRUE and not restored
                           // within this function.
    2705             :     uint32_t  distortion;
    2706     8170360 :     ModeDecisionCandidateBuffer  *candidate_buffer = &(context_ptr->candidate_buffer_ptr_array[0][0]);
    2707     8170360 :     candidate_buffer->candidate_ptr = &(context_ptr->fast_candidate_array[0]);
    2708             : 
    2709    32680700 :     for (int32_t refinement_pos_x = search_position_start_x; refinement_pos_x <= search_position_end_x; ++refinement_pos_x) {
    2710    97895600 :         for (int32_t refinement_pos_y = search_position_start_y; refinement_pos_y <= search_position_end_y; ++refinement_pos_y) {
    2711             : 
                                   // The centre position is never re-evaluated here
    2712    73385300 :             if (refinement_pos_x == 0 && refinement_pos_y == 0)
    2713     8169810 :                 continue;
    2714             : 
                                   // Pattern 1: drop the diagonal positions
    2715    65215500 :             if (search_pattern == 1 && refinement_pos_x != 0 && refinement_pos_y != 0)
    2716           0 :                 continue;
    2717             : 
                                   // Pattern 2: keep only the y == 0 positions
    2718    65215500 :             if (search_pattern == 2 && refinement_pos_y != 0)
    2719           0 :                 continue;
    2720             : 
                                   // Pattern 3: keep only the x == 0 positions
    2721    65215500 :             if (search_pattern == 3 && refinement_pos_x != 0)
    2722           0 :                 continue;
    2723             : 
    2724    65215500 :             ModeDecisionCandidate *candidate_ptr = candidate_buffer->candidate_ptr;
    2725    65215500 :             EbPictureBufferDesc   *prediction_ptr = candidate_buffer->prediction_ptr;
    2726             : 
                                   // Describe the scratch candidate as a single-reference NEWMV inter
                                   // candidate on list list_idx carrying the MV under test
    2727    65215500 :             candidate_ptr->type = INTER_MODE;
    2728    65215500 :             candidate_ptr->distortion_ready = 0;
    2729    65215500 :             candidate_ptr->use_intrabc = 0;
    2730    65215500 :             candidate_ptr->merge_flag = EB_FALSE;
    2731    65215500 :             candidate_ptr->prediction_direction[0] = (EbPredDirection)list_idx;
    2732    65215500 :             candidate_ptr->inter_mode = NEWMV;
    2733    65215500 :             candidate_ptr->pred_mode = NEWMV;
    2734    65215500 :             candidate_ptr->motion_mode = SIMPLE_TRANSLATION;
    2735             : #if II_COMP_FLAG
    2736    65215500 :             candidate_ptr->is_interintra_used = 0;
    2737             : #endif
    2738    65215500 :             candidate_ptr->is_compound = 0;
    2739    65215500 :             candidate_ptr->is_new_mv = 1;
    2740    65215500 :             candidate_ptr->is_zero_mv = 0;
    2741    65215500 :             candidate_ptr->drl_index = 0;
    2742    65215500 :             candidate_ptr->ref_mv_index = 0;
    2743    65215500 :             candidate_ptr->pred_mv_weight = 0;
    2744    65215500 :             candidate_ptr->ref_frame_type = svt_get_ref_frame_type(list_idx, ref_idx);
    2745    65260600 :             candidate_ptr->transform_type[PLANE_TYPE_Y] = DCT_DCT;
    2746    65260600 :             candidate_ptr->transform_type[PLANE_TYPE_UV] = DCT_DCT;
                                   // Only the fields of the searched list get the MV / ref index;
                                   // the other list is filled with 0 / -1
    2747    65260600 :             candidate_ptr->motion_vector_xl0 = list_idx == 0 ? mvx + (refinement_pos_x * search_step) : 0;
    2748    65260600 :             candidate_ptr->motion_vector_yl0 = list_idx == 0 ? mvy + (refinement_pos_y * search_step) : 0;
    2749    65260600 :             candidate_ptr->motion_vector_xl1 = list_idx == 1 ? mvx + (refinement_pos_x * search_step) : 0;
    2750    65260600 :             candidate_ptr->motion_vector_yl1 = list_idx == 1 ? mvy + (refinement_pos_y * search_step) : 0;
    2751    65260600 :             candidate_ptr->ref_frame_index_l0 = list_idx == 0 ? ref_idx : -1;
    2752    65260600 :             candidate_ptr->ref_frame_index_l1 = list_idx == 1 ? ref_idx : -1;
    2753    65260600 :             candidate_ptr->interp_filters = 0;
    2754             : 
    2755             :             // Prediction
                                   // (interpolation search and inter chroma prediction are switched off first)
    2756    65260600 :             context_ptr->md_staging_skip_interpolation_search = EB_TRUE;
    2757    65260600 :             context_ptr->md_staging_skip_inter_chroma_pred = EB_TRUE;
    2758    65260600 :             ProductPredictionFunTable[INTER_MODE](
    2759             :                 context_ptr,
    2760             :                 picture_control_set_ptr,
    2761             :                 candidate_buffer);
    2762             : 
    2763             :             // Distortion
                                   // (input luma at inputOriginIndex vs. prediction luma at cuOriginIndex)
    2764    65218400 :             if (use_ssd) {
    2765   130448000 :                 EbSpatialFullDistType spatial_full_dist_type_fun = context_ptr->hbd_mode_decision ?
    2766    65224000 :                     full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    2767             : 
    2768    65248000 :                 distortion = (uint32_t) spatial_full_dist_type_fun(
    2769             :                     input_picture_ptr->buffer_y,
    2770             :                     inputOriginIndex,
    2771    65224000 :                     input_picture_ptr->stride_y,
    2772             :                     prediction_ptr->buffer_y,
    2773             :                     cuOriginIndex,
    2774    65224000 :                     prediction_ptr->stride_y,
    2775    65224000 :                     context_ptr->blk_geom->bwidth,
    2776    65224000 :                     context_ptr->blk_geom->bheight);
    2777             :             }
    2778             :             else {
    2779           0 :                 assert((context_ptr->blk_geom->bwidth >> 3) < 17);
    2780             : 
    2781           0 :                 if (context_ptr->hbd_mode_decision) {
    2782           0 :                     distortion = sad_16b_kernel(
    2783           0 :                         ((uint16_t *)input_picture_ptr->buffer_y) + inputOriginIndex,
    2784           0 :                         input_picture_ptr->stride_y,
    2785           0 :                         ((uint16_t *)prediction_ptr->buffer_y) + cuOriginIndex,
    2786           0 :                         prediction_ptr->stride_y,
    2787           0 :                         context_ptr->blk_geom->bheight,
    2788           0 :                         context_ptr->blk_geom->bwidth);
    2789             :                 } else {
    2790           0 :                     distortion = nxm_sad_kernel_sub_sampled(
    2791           0 :                         input_picture_ptr->buffer_y + inputOriginIndex,
    2792           0 :                         input_picture_ptr->stride_y,
    2793           0 :                         prediction_ptr->buffer_y + cuOriginIndex,
    2794           0 :                         prediction_ptr->stride_y,
    2795           0 :                         context_ptr->blk_geom->bheight,
    2796           0 :                         context_ptr->blk_geom->bwidth);
    2797             :                 }
    2798             :             }
                                   // Strict "<": on equal distortion the earlier position is kept
    2799    65228800 :             if (distortion < *best_distortion) {
    2800     8199010 :                 *best_mvx = mvx + (refinement_pos_x * search_step);
    2801     8199010 :                 *best_mvy = mvy + (refinement_pos_y * search_step);
    2802     8199010 :                 *best_distortion = distortion;
    2803             :             }
    2804             :         }
    2805             :     }
    2806     8183660 : }
    2807             : 
    2808             : void av1_set_ref_frame(MvReferenceFrame *rf, int8_t ref_frame_type);
    2809             : uint8_t GetMaxDrlIndex(uint8_t  refmvCnt, PredictionMode   mode);
    2810             : 
    2811      675242 : void predictive_me_search(
    2812             :     PictureControlSet            *picture_control_set_ptr,
    2813             :     ModeDecisionContext          *context_ptr,
    2814             :     EbPictureBufferDesc          *input_picture_ptr,
    2815             :     uint32_t                      inputOriginIndex,
    2816             :     uint32_t                      cuOriginIndex) {
    2817             : 
    2818      675242 :     const SequenceControlSet *sequence_control_set_ptr = (SequenceControlSet*)picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr;
    2819             : 
    2820      675242 :     EbBool use_ssd = EB_TRUE;
    2821             : 
    2822             :     // Reset valid_refined_mv
    2823      675242 :     memset(context_ptr->valid_refined_mv, 0, 8); // [2][4]
    2824             : 
    2825     8316710 :     for (uint32_t refIt = 0; refIt < picture_control_set_ptr->parent_pcs_ptr->tot_ref_frame_types; ++refIt) {
    2826     7641380 :         MvReferenceFrame ref_pair = picture_control_set_ptr->parent_pcs_ptr->ref_frame_type_arr[refIt];
    2827             : 
    2828     7641380 :         MacroBlockD  *xd = context_ptr->cu_ptr->av1xd;
    2829             :         uint8_t drli, maxDrlIndex;
    2830             :         IntMv    nearestmv[2], nearmv[2], ref_mv[2];
    2831             : 
    2832             :         MvReferenceFrame rf[2];
    2833     7641380 :         av1_set_ref_frame(rf, ref_pair);
    2834             : 
    2835             :         // Reset search variable(s)
    2836     7641310 :         uint32_t best_mvp_distortion = (int32_t)~0;
    2837             :         uint32_t mvp_distortion;
    2838             : 
    2839     7641310 :         int16_t  best_search_mvx = (int16_t)~0;
    2840     7641310 :         int16_t  best_search_mvy = (int16_t)~0;
    2841     7641310 :         uint32_t best_search_distortion = (int32_t)~0;
    2842             : 
    2843             :         // Step 0: derive the MVP list; 1 nearest and up to 3 near
    2844             :         int16_t mvp_x_array[PREDICTIVE_ME_MAX_MVP_CANIDATES];
    2845             :         int16_t mvp_y_array[PREDICTIVE_ME_MAX_MVP_CANIDATES];
    2846     7641310 :         int8_t mvp_count = 0;
    2847     7641310 :         if (rf[1] == NONE_FRAME)
    2848             :         {
    2849     3078130 :             MvReferenceFrame frame_type = rf[0];
    2850     3078130 :             uint8_t list_idx = get_list_idx(rf[0]);
    2851     3078140 :             uint8_t ref_idx = get_ref_frame_idx(rf[0]);
    2852             :             // Get the ME MV
    2853     3078320 :             const MeLcuResults *me_results = picture_control_set_ptr->parent_pcs_ptr->me_results[context_ptr->me_sb_addr];
    2854             :             int16_t me_mv_x;
    2855             :             int16_t me_mv_y;
    2856     3078320 :             if (list_idx == 0) {
    2857     1884620 :                 me_mv_x = (me_results->me_mv_array[context_ptr->me_block_offset][ref_idx].x_mv) << 1;
    2858     1884620 :                 me_mv_y = (me_results->me_mv_array[context_ptr->me_block_offset][ref_idx].y_mv) << 1;
    2859             :             }
    2860             :             else {
    2861     1193700 :                 me_mv_x = (me_results->me_mv_array[context_ptr->me_block_offset][((sequence_control_set_ptr->mrp_mode == 0) ? 4 : 2) + ref_idx].x_mv) << 1;
    2862     1193700 :                 me_mv_y = (me_results->me_mv_array[context_ptr->me_block_offset][((sequence_control_set_ptr->mrp_mode == 0) ? 4 : 2) + ref_idx].y_mv) << 1;
    2863             :             }
    2864             :             // Round-up to the closest integer the ME MV
    2865     3078320 :             me_mv_x = (me_mv_x + 4)&~0x07;
    2866     3078320 :             me_mv_y = (me_mv_y + 4)&~0x07;
    2867             : 
    2868             :             uint32_t pa_me_distortion;
    2869     3078320 :             EbReferenceObject *refObj = picture_control_set_ptr->ref_pic_ptr_array[list_idx][ref_idx]->object_ptr;
    2870     6156640 :             EbPictureBufferDesc *ref_pic = context_ptr->hbd_mode_decision ?
    2871     3078320 :                 refObj->reference_picture16bit : refObj->reference_picture;
    2872             : 
    2873     3078320 :             uint32_t ref_origin_index = ref_pic->origin_x + (context_ptr->cu_origin_x + (me_mv_x >> 3)) + (context_ptr->cu_origin_y + (me_mv_y >> 3) + ref_pic->origin_y) * ref_pic->stride_y;
    2874     3078320 :             if (use_ssd) {
    2875     6156470 :                 EbSpatialFullDistType spatial_full_dist_type_fun = context_ptr->hbd_mode_decision ?
    2876     3078240 :                     full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    2877             : 
    2878     3078150 :                 pa_me_distortion = (uint32_t) spatial_full_dist_type_fun(
    2879             :                     input_picture_ptr->buffer_y,
    2880             :                     inputOriginIndex,
    2881     3078240 :                     input_picture_ptr->stride_y,
    2882             :                     ref_pic->buffer_y,
    2883             :                     ref_origin_index,
    2884     3078240 :                     ref_pic->stride_y,
    2885     3078240 :                     context_ptr->blk_geom->bwidth,
    2886     3078240 :                     context_ptr->blk_geom->bheight);
    2887             :             }
    2888             :             else {
    2889          86 :                 assert((context_ptr->blk_geom->bwidth >> 3) < 17);
    2890             : 
    2891          86 :                if (context_ptr->hbd_mode_decision) {
    2892           0 :                     pa_me_distortion = sad_16b_kernel(
    2893           0 :                         ((uint16_t *)input_picture_ptr->buffer_y) + inputOriginIndex,
    2894           0 :                         input_picture_ptr->stride_y,
    2895           0 :                         ((uint16_t *)ref_pic->buffer_y) + ref_origin_index,
    2896           0 :                         ref_pic->stride_y,
    2897           0 :                         context_ptr->blk_geom->bheight,
    2898           0 :                         context_ptr->blk_geom->bwidth);
    2899             :                 } else {
    2900          86 :                     pa_me_distortion = nxm_sad_kernel_sub_sampled(
    2901          86 :                         input_picture_ptr->buffer_y + inputOriginIndex,
    2902          86 :                         input_picture_ptr->stride_y,
    2903          86 :                         ref_pic->buffer_y + ref_origin_index,
    2904          86 :                         ref_pic->stride_y,
    2905          86 :                         context_ptr->blk_geom->bheight,
    2906          86 :                         context_ptr->blk_geom->bwidth);
    2907             :                 }
    2908             :             }
    2909             : 
    2910     3078150 :             if (pa_me_distortion != 0 || context_ptr->predictive_me_level >= 5) {
    2911             : 
    2912             :                 //NEAREST
    2913     3078100 :                 mvp_x_array[mvp_count] = (context_ptr->cu_ptr->ref_mvs[frame_type][0].as_mv.col + 4)&~0x07;
    2914     3078100 :                 mvp_y_array[mvp_count] = (context_ptr->cu_ptr->ref_mvs[frame_type][0].as_mv.row + 4)&~0x07;
    2915             : 
    2916     3078100 :                 mvp_count++;
    2917             : 
    2918             :                 //NEAR
    2919     3078100 :                 maxDrlIndex = GetMaxDrlIndex(xd->ref_mv_count[frame_type], NEARMV);
    2920             : 
    2921     8220700 :                 for (drli = 0; drli < maxDrlIndex; drli++) {
    2922     5142520 :                     get_av1_mv_pred_drl(
    2923             :                         context_ptr,
    2924             :                         context_ptr->cu_ptr,
    2925             :                         frame_type,
    2926             :                         0,
    2927             :                         NEARMV,
    2928             :                         drli,
    2929             :                         nearestmv,
    2930             :                         nearmv,
    2931             :                         ref_mv);
    2932             : 
    2933     5142600 :                     if (((nearmv[0].as_mv.col + 4)&~0x07) != mvp_x_array[0] && ((nearmv[0].as_mv.row + 4)&~0x07) != mvp_y_array[0]) {
    2934     1942380 :                         mvp_x_array[mvp_count] = (nearmv[0].as_mv.col + 4)&~0x07;
    2935     1942380 :                         mvp_y_array[mvp_count] = (nearmv[0].as_mv.row + 4)&~0x07;
    2936     1942380 :                         mvp_count++;
    2937             :                     }
    2938             : 
    2939             :                 }
    2940             :                 // Step 1: derive the best MVP in terms of distortion
    2941     3078180 :                 int16_t best_mvp_x = 0;
    2942     3078180 :                 int16_t best_mvp_y = 0;
    2943             : 
    2944     8098340 :                 for (int8_t mvp_index = 0; mvp_index < mvp_count; mvp_index++) {
    2945             : 
    2946             :                     // MVP Distortion
    2947     5020070 :                     EbReferenceObject *refObj = picture_control_set_ptr->ref_pic_ptr_array[list_idx][ref_idx]->object_ptr;
    2948    10040100 :                     EbPictureBufferDesc *ref_pic = context_ptr->hbd_mode_decision ?
    2949     5020070 :                         refObj->reference_picture16bit : refObj->reference_picture;
    2950             : 
    2951     5020070 :                    uint32_t ref_origin_index = ref_pic->origin_x + (context_ptr->cu_origin_x + (mvp_x_array[mvp_index] >> 3)) + (context_ptr->cu_origin_y + (mvp_y_array[mvp_index] >> 3) + ref_pic->origin_y) * ref_pic->stride_y;
    2952     5020070 :                     if (use_ssd) {
    2953    10040300 :                         EbSpatialFullDistType spatial_full_dist_type_fun = context_ptr->hbd_mode_decision ?
    2954     5020160 :                             full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    2955             : 
    2956     5020180 :                         mvp_distortion = (uint32_t) spatial_full_dist_type_fun(
    2957             :                             input_picture_ptr->buffer_y,
    2958             :                             inputOriginIndex,
    2959     5020160 :                             input_picture_ptr->stride_y,
    2960             :                             ref_pic->buffer_y,
    2961             :                             ref_origin_index,
    2962     5020160 :                             ref_pic->stride_y,
    2963     5020160 :                             context_ptr->blk_geom->bwidth,
    2964     5020160 :                             context_ptr->blk_geom->bheight);
    2965             :                     }
    2966             :                     else {
    2967           0 :                         assert((context_ptr->blk_geom->bwidth >> 3) < 17);
    2968             : 
    2969           0 :                        if (context_ptr->hbd_mode_decision) {
    2970           0 :                             mvp_distortion = sad_16b_kernel(
    2971           0 :                                 ((uint16_t *)input_picture_ptr->buffer_y) + inputOriginIndex,
    2972           0 :                                 input_picture_ptr->stride_y,
    2973           0 :                                 ((uint16_t *)ref_pic->buffer_y) + ref_origin_index,
    2974           0 :                                 ref_pic->stride_y,
    2975           0 :                                 context_ptr->blk_geom->bheight,
    2976           0 :                                 context_ptr->blk_geom->bwidth);
    2977             :                         } else {
    2978           0 :                             mvp_distortion = nxm_sad_kernel_sub_sampled(
    2979           0 :                                 input_picture_ptr->buffer_y + inputOriginIndex,
    2980           0 :                                 input_picture_ptr->stride_y,
    2981           0 :                                 ref_pic->buffer_y + ref_origin_index,
    2982           0 :                                 ref_pic->stride_y,
    2983           0 :                                 context_ptr->blk_geom->bheight,
    2984           0 :                                 context_ptr->blk_geom->bwidth);
    2985             :                         }
    2986             :                     }
    2987             : 
    2988     5020170 :                     if (mvp_distortion < best_mvp_distortion) {
    2989     3498400 :                         best_mvp_distortion = mvp_distortion;
    2990     3498400 :                         best_mvp_x = mvp_x_array[mvp_index];
    2991     3498400 :                         best_mvp_y = mvp_y_array[mvp_index];
    2992             :                     }
    2993             :                 }
    2994             : 
    2995             :                 // Step 2: perform full pel search around the best MVP
    2996     3078270 :                 best_mvp_x = (best_mvp_x + 4)&~0x07;
    2997     3078270 :                 best_mvp_y = (best_mvp_y + 4)&~0x07;
    2998             : 
    2999     3078270 :                 predictive_me_full_pel_search(
    3000             :                     picture_control_set_ptr,
    3001             :                     context_ptr,
    3002             :                     input_picture_ptr,
    3003             :                     inputOriginIndex,
    3004             :                     use_ssd,
    3005             :                     list_idx,
    3006             :                     ref_idx,
    3007             :                     best_mvp_x,
    3008             :                     best_mvp_y,
    3009             :                     -(FULL_PEL_REF_WINDOW_WIDTH >> 1),
    3010             :                     +(FULL_PEL_REF_WINDOW_WIDTH >> 1),
    3011             :                     -(FULL_PEL_REF_WINDOW_HEIGHT >> 1),
    3012             :                     +(FULL_PEL_REF_WINDOW_HEIGHT >> 1),
    3013             :                     8,
    3014             :                     &best_search_mvx,
    3015             :                     &best_search_mvy,
    3016             :                     &best_search_distortion);
    3017             : 
    3018             :                 EbBool exit_predictive_me_sub_pel;
    3019             : 
    3020     3078180 :                 if (pa_me_distortion == 0)
    3021           0 :                     exit_predictive_me_sub_pel = EB_TRUE;
    3022     3078180 :                 else if (best_search_distortion <= pa_me_distortion)
    3023     2608370 :                     exit_predictive_me_sub_pel = EB_FALSE;
    3024             :                 else {
    3025      469811 :                     exit_predictive_me_sub_pel = ((((best_search_distortion - pa_me_distortion) * 100) / pa_me_distortion) < PREDICTIVE_ME_DEVIATION_TH) ?
    3026      469811 :                         EB_FALSE :
    3027             :                         EB_TRUE;
    3028             :                 }
    3029             : 
    3030     3078180 :                 if (exit_predictive_me_sub_pel == EB_FALSE || context_ptr->predictive_me_level >= 5) {
    3031             : 
    3032     2723640 :                     if (context_ptr->predictive_me_level >= 2) {
    3033             : 
    3034             :                         uint8_t search_pattern;
    3035             :                         // 0: all possible position(s): horizontal, vertical, diagonal
    3036             :                         // 1: horizontal, vertical
    3037             :                         // 2: horizontal only
    3038             :                         // 3: vertical only
    3039             : 
    3040             :                         // Step 3: perform half pel search around the best full pel position
    3041     2723760 :                         search_pattern = (context_ptr->predictive_me_level >= 4) ? 0 : 1;
    3042             : 
    3043     2723760 :                         predictive_me_sub_pel_search(
    3044             :                             picture_control_set_ptr,
    3045             :                             context_ptr,
    3046             :                             input_picture_ptr,
    3047             :                             inputOriginIndex,
    3048             :                             cuOriginIndex,
    3049             :                             use_ssd,
    3050             :                             list_idx,
    3051             :                             ref_idx,
    3052             :                             best_search_mvx,
    3053             :                             best_search_mvy,
    3054             :                             -(HALF_PEL_REF_WINDOW >> 1),
    3055             :                             +(HALF_PEL_REF_WINDOW >> 1),
    3056             :                             -(HALF_PEL_REF_WINDOW >> 1),
    3057             :                             +(HALF_PEL_REF_WINDOW >> 1),
    3058             :                             4,
    3059             :                             &best_search_mvx,
    3060             :                             &best_search_mvy,
    3061             :                             &best_search_distortion,
    3062             :                             search_pattern);
    3063             : 
    3064     2723650 :                         if (context_ptr->predictive_me_level == 3) {
    3065           0 :                             if ((best_search_mvx & 0x07) != 0 || (best_search_mvy & 0x07) != 0) {
    3066             : 
    3067           0 :                                 if ((best_search_mvx & 0x07) == 0)
    3068           0 :                                     search_pattern = 2;
    3069             :                                 else // if(best_search_mvy & 0x07 == 0)
    3070           0 :                                     search_pattern = 3;
    3071             : 
    3072           0 :                                 predictive_me_sub_pel_search(
    3073             :                                     picture_control_set_ptr,
    3074             :                                     context_ptr,
    3075             :                                     input_picture_ptr,
    3076             :                                     inputOriginIndex,
    3077             :                                     cuOriginIndex,
    3078             :                                     use_ssd,
    3079             :                                     list_idx,
    3080             :                                     ref_idx,
    3081             :                                     best_search_mvx,
    3082             :                                     best_search_mvy,
    3083             :                                     -(HALF_PEL_REF_WINDOW >> 1),
    3084             :                                     +(HALF_PEL_REF_WINDOW >> 1),
    3085             :                                     -(HALF_PEL_REF_WINDOW >> 1),
    3086             :                                     +(HALF_PEL_REF_WINDOW >> 1),
    3087             :                                     4,
    3088             :                                     &best_search_mvx,
    3089             :                                     &best_search_mvy,
    3090             :                                     &best_search_distortion,
    3091             :                                     search_pattern);
    3092             :                             }
    3093             :                         }
    3094             : 
    3095             :                         // Step 4: perform quarter pel search around the best half pel position
    3096     2723650 :                         search_pattern = (context_ptr->predictive_me_level >= 4) ? 0 : 1;
    3097     2723650 :                         predictive_me_sub_pel_search(
    3098             :                             picture_control_set_ptr,
    3099             :                             context_ptr,
    3100             :                             input_picture_ptr,
    3101             :                             inputOriginIndex,
    3102             :                             cuOriginIndex,
    3103             :                             use_ssd,
    3104             :                             list_idx,
    3105             :                             ref_idx,
    3106             :                             best_search_mvx,
    3107             :                             best_search_mvy,
    3108             :                             -(QUARTER_PEL_REF_WINDOW >> 1),
    3109             :                             +(QUARTER_PEL_REF_WINDOW >> 1),
    3110             :                             -(QUARTER_PEL_REF_WINDOW >> 1),
    3111             :                             +(QUARTER_PEL_REF_WINDOW >> 1),
    3112             :                             2,
    3113             :                             &best_search_mvx,
    3114             :                             &best_search_mvy,
    3115             :                             &best_search_distortion,
    3116             :                             search_pattern);
    3117             : 
    3118     2723690 :                         if (context_ptr->predictive_me_level == 3) {
    3119           0 :                             if ((best_search_mvx & 0x03) != 0 || (best_search_mvy & 0x03) != 0) {
    3120             : 
    3121           0 :                                 if ((best_search_mvx & 0x03) == 0)
    3122           0 :                                     search_pattern = 2;
    3123             :                                 else // if(best_search_mvy & 0x03 == 0)
    3124           0 :                                     search_pattern = 3;
    3125             : 
    3126           0 :                                 predictive_me_sub_pel_search(
    3127             :                                     picture_control_set_ptr,
    3128             :                                     context_ptr,
    3129             :                                     input_picture_ptr,
    3130             :                                     inputOriginIndex,
    3131             :                                     cuOriginIndex,
    3132             :                                     use_ssd,
    3133             :                                     list_idx,
    3134             :                                     ref_idx,
    3135             :                                     best_search_mvx,
    3136             :                                     best_search_mvy,
    3137             :                                     -(QUARTER_PEL_REF_WINDOW >> 1),
    3138             :                                     +(QUARTER_PEL_REF_WINDOW >> 1),
    3139             :                                     -(QUARTER_PEL_REF_WINDOW >> 1),
    3140             :                                     +(QUARTER_PEL_REF_WINDOW >> 1),
    3141             :                                     2,
    3142             :                                     &best_search_mvx,
    3143             :                                     &best_search_mvy,
    3144             :                                     &best_search_distortion,
    3145             :                                     search_pattern);
    3146             :                             }
    3147             :                         }
    3148             :                     }
    3149             : #if EIGHT_PEL_PREDICTIVE_ME
    3150             :                     // Step 5: perform eighth pel search around the best quarter pel position
    3151     2723570 :                     if (picture_control_set_ptr->parent_pcs_ptr->frm_hdr.allow_high_precision_mv) {
    3152     2723710 :                         uint8_t search_pattern = 0;
    3153     2723710 :                         predictive_me_sub_pel_search(
    3154             :                             picture_control_set_ptr,
    3155             :                             context_ptr,
    3156             :                             input_picture_ptr,
    3157             :                             inputOriginIndex,
    3158             :                             cuOriginIndex,
    3159             :                             use_ssd,
    3160             :                             list_idx,
    3161             :                             ref_idx,
    3162             :                             best_search_mvx,
    3163             :                             best_search_mvy,
    3164             : #if MDC_ADAPTIVE_LEVEL
    3165             :                             -(EIGHT_PEL_REF_WINDOW >> 1),
    3166             :                             +(EIGHT_PEL_REF_WINDOW >> 1),
    3167             :                             -(EIGHT_PEL_REF_WINDOW >> 1),
    3168             :                             +(EIGHT_PEL_REF_WINDOW >> 1),
    3169             : #else
    3170             :                             -(QUARTER_PEL_REF_WINDOW >> 1),
    3171             :                             +(QUARTER_PEL_REF_WINDOW >> 1),
    3172             :                             -(QUARTER_PEL_REF_WINDOW >> 1),
    3173             :                             +(QUARTER_PEL_REF_WINDOW >> 1),
    3174             : #endif
    3175             :                             1,
    3176             :                             &best_search_mvx,
    3177             :                             &best_search_mvy,
    3178             :                             &best_search_distortion,
    3179             :                             search_pattern);
    3180             :                     }
    3181             : #endif
    3182     2723700 :                     context_ptr->best_spatial_pred_mv[list_idx][ref_idx][0] = best_search_mvx;
    3183     2723700 :                     context_ptr->best_spatial_pred_mv[list_idx][ref_idx][1] = best_search_mvy;
    3184     2723700 :                     context_ptr->valid_refined_mv[list_idx][ref_idx] = 1;
    3185             :                 }
    3186             :             }
    3187             :         }
    3188             :     }
    3189      675325 : }
    3190     9266730 : void AV1CostCalcCfl(
    3191             :     PictureControlSet                *picture_control_set_ptr,
    3192             :     ModeDecisionCandidateBuffer      *candidate_buffer,
    3193             :     LargestCodingUnit                *sb_ptr,
    3194             :     ModeDecisionContext              *context_ptr,
    3195             :     uint32_t                            component_mask,
    3196             :     EbPictureBufferDesc              *input_picture_ptr,
    3197             :     uint32_t                            inputCbOriginIndex,
    3198             :     uint32_t                            cuChromaOriginIndex,
    3199             :     uint64_t                            full_distortion[DIST_CALC_TOTAL],
    3200             :     uint64_t                           *coeffBits,
    3201             :     EbBool                              check_dc)
    3202             : {
    3203     9266730 :     ModeDecisionCandidate            *candidate_ptr = candidate_buffer->candidate_ptr;
    3204             :     uint32_t                            count_non_zero_coeffs[3][MAX_NUM_OF_TU_PER_CU];
    3205             :     uint64_t                            cbFullDistortion[DIST_CALC_TOTAL];
    3206             :     uint64_t                            crFullDistortion[DIST_CALC_TOTAL];
    3207     9266730 :     uint64_t                            cb_coeff_bits = 0;
    3208     9266730 :     uint64_t                            cr_coeff_bits = 0;
    3209     9266730 :     uint32_t                            chroma_width = context_ptr->blk_geom->bwidth_uv;
    3210     9266730 :     uint32_t                            chroma_height = context_ptr->blk_geom->bheight_uv;
    3211             :     // FullLoop and TU search
    3212             :     int32_t                             alpha_q3;
    3213     9266730 :     uint16_t                             cb_qp = context_ptr->qp;
    3214     9266730 :     uint16_t                             cr_qp = context_ptr->qp;
    3215             : 
    3216     9266730 :     full_distortion[DIST_CALC_RESIDUAL] = 0;
    3217     9266730 :     full_distortion[DIST_CALC_PREDICTION] = 0;
    3218     9266730 :     *coeffBits = 0;
    3219             : 
    3220             :     // Loop over alphas and find the best
    3221     9266730 :     if (component_mask == COMPONENT_CHROMA_CB || component_mask == COMPONENT_CHROMA || component_mask == COMPONENT_ALL) {
    3222     5078560 :         cbFullDistortion[DIST_CALC_RESIDUAL] = 0;
    3223     5078560 :         crFullDistortion[DIST_CALC_RESIDUAL] = 0;
    3224     5078560 :         cbFullDistortion[DIST_CALC_PREDICTION] = 0;
    3225     5078560 :         crFullDistortion[DIST_CALC_PREDICTION] = 0;
    3226     5078560 :         cb_coeff_bits = 0;
    3227     5078560 :         cr_coeff_bits = 0;
    3228     5078560 :         alpha_q3 = (check_dc) ? 0:
    3229     4556090 :             cfl_idx_to_alpha(candidate_ptr->cfl_alpha_idx, candidate_ptr->cfl_alpha_signs, CFL_PRED_U); // once for U, once for V
    3230     5077980 :         assert(chroma_width * CFL_BUF_LINE + chroma_height <=
    3231             :             CFL_BUF_SQUARE);
    3232             : 
    3233     5077980 :         if (!context_ptr->hbd_mode_decision) {
    3234     5078030 :             eb_cfl_predict_lbd(
    3235     5078030 :                 context_ptr->pred_buf_q3,
    3236     5078030 :                 &(candidate_buffer->prediction_ptr->buffer_cb[cuChromaOriginIndex]),
    3237     5078030 :                 candidate_buffer->prediction_ptr->stride_cb,
    3238     5078030 :                 &(candidate_buffer->cfl_temp_prediction_ptr->buffer_cb[cuChromaOriginIndex]),
    3239     5078030 :                 candidate_buffer->cfl_temp_prediction_ptr->stride_cb,
    3240             :                 alpha_q3,
    3241             :                 8,
    3242             :                 chroma_width,
    3243             :                 chroma_height);
    3244             :         } else {
    3245           0 :             eb_cfl_predict_hbd(
    3246           0 :                 context_ptr->pred_buf_q3,
    3247           0 :                 ((uint16_t*)candidate_buffer->prediction_ptr->buffer_cb) + cuChromaOriginIndex,
    3248           0 :                 candidate_buffer->prediction_ptr->stride_cb,
    3249           0 :                 ((uint16_t*)candidate_buffer->cfl_temp_prediction_ptr->buffer_cb) + cuChromaOriginIndex,
    3250           0 :                 candidate_buffer->cfl_temp_prediction_ptr->stride_cb,
    3251             :                 alpha_q3,
    3252             :                 10,
    3253             :                 chroma_width,
    3254             :                 chroma_height);
    3255             :         }
    3256             : 
    3257             :         // Cb Residual
    3258     5077570 :         residual_kernel(
    3259             :             input_picture_ptr->buffer_cb,
    3260             :             inputCbOriginIndex,
    3261     5077570 :             input_picture_ptr->stride_cb,
    3262     5077570 :             candidate_buffer->cfl_temp_prediction_ptr->buffer_cb,
    3263             :             cuChromaOriginIndex,
    3264     5077570 :             candidate_buffer->cfl_temp_prediction_ptr->stride_cb,
    3265     5077570 :             (int16_t*)candidate_buffer->residual_ptr->buffer_cb,
    3266             :             cuChromaOriginIndex,
    3267     5077570 :             candidate_buffer->residual_ptr->stride_cb,
    3268     5077570 :             context_ptr->hbd_mode_decision,
    3269             :             chroma_width,
    3270             :             chroma_height);
    3271             : 
    3272     5077400 :         full_loop_r(
    3273             :             sb_ptr,
    3274             :             candidate_buffer,
    3275             :             context_ptr,
    3276             :             input_picture_ptr,
    3277             :             picture_control_set_ptr,
    3278             :             PICTURE_BUFFER_DESC_Cb_FLAG,
    3279             :             cb_qp,
    3280             :             cr_qp,
    3281             :             &(*count_non_zero_coeffs[1]),
    3282             :             &(*count_non_zero_coeffs[2]));
    3283             : 
    3284             :         // Create new function
    3285     5077880 :         cu_full_distortion_fast_tu_mode_r(
    3286             :             sb_ptr,
    3287             :             candidate_buffer,
    3288             :             context_ptr,
    3289             :             candidate_ptr,
    3290             :             picture_control_set_ptr,
    3291             :             input_picture_ptr,
    3292             :             cbFullDistortion,
    3293             :             crFullDistortion,
    3294             :             count_non_zero_coeffs,
    3295             :             COMPONENT_CHROMA_CB,
    3296             :             &cb_coeff_bits,
    3297             :             &cr_coeff_bits,
    3298             :             0);
    3299             : 
    3300     5077660 :         full_distortion[DIST_CALC_RESIDUAL] += cbFullDistortion[DIST_CALC_RESIDUAL];
    3301     5077660 :         full_distortion[DIST_CALC_PREDICTION] += cbFullDistortion[DIST_CALC_PREDICTION];
    3302     5077660 :         *coeffBits += cb_coeff_bits;
    3303             :     }
    3304     9265820 :     if (component_mask == COMPONENT_CHROMA_CR || component_mask == COMPONENT_CHROMA || component_mask == COMPONENT_ALL) {
    3305     4714410 :         cbFullDistortion[DIST_CALC_RESIDUAL] = 0;
    3306     4714410 :         crFullDistortion[DIST_CALC_RESIDUAL] = 0;
    3307     4714410 :         cbFullDistortion[DIST_CALC_PREDICTION] = 0;
    3308     4714410 :         crFullDistortion[DIST_CALC_PREDICTION] = 0;
    3309             : 
    3310     4714410 :         cb_coeff_bits = 0;
    3311     4714410 :         cr_coeff_bits = 0;
    3312     4714410 :         alpha_q3 = (check_dc) ? 0 :
    3313     4191950 :             cfl_idx_to_alpha(candidate_ptr->cfl_alpha_idx, candidate_ptr->cfl_alpha_signs, CFL_PRED_V); // once for U, once for V
    3314     4713810 :         assert(chroma_width * CFL_BUF_LINE + chroma_height <=
    3315             :             CFL_BUF_SQUARE);
    3316             : 
    3317     4713810 :         if (!context_ptr->hbd_mode_decision) {
    3318     4713850 :             eb_cfl_predict_lbd(
    3319     4713850 :                 context_ptr->pred_buf_q3,
    3320     4713850 :                 &(candidate_buffer->prediction_ptr->buffer_cr[cuChromaOriginIndex]),
    3321     4713850 :                 candidate_buffer->prediction_ptr->stride_cr,
    3322     4713850 :                 &(candidate_buffer->cfl_temp_prediction_ptr->buffer_cr[cuChromaOriginIndex]),
    3323     4713850 :                 candidate_buffer->cfl_temp_prediction_ptr->stride_cr,
    3324             :                 alpha_q3,
    3325             :                 8,
    3326             :                 chroma_width,
    3327             :                 chroma_height);
    3328             :         } else {
    3329           0 :             eb_cfl_predict_hbd(
    3330           0 :                 context_ptr->pred_buf_q3,
    3331           0 :                 ((uint16_t*)candidate_buffer->prediction_ptr->buffer_cr) + cuChromaOriginIndex,
    3332           0 :                 candidate_buffer->prediction_ptr->stride_cr,
    3333           0 :                 ((uint16_t*)candidate_buffer->cfl_temp_prediction_ptr->buffer_cr) + cuChromaOriginIndex,
    3334           0 :                 candidate_buffer->cfl_temp_prediction_ptr->stride_cr,
    3335             :                 alpha_q3,
    3336             :                 10,
    3337             :                 chroma_width,
    3338             :                 chroma_height);
    3339             :         }
    3340             : 
    3341             :         // Cr Residual
    3342     4713500 :         residual_kernel(
    3343             :             input_picture_ptr->buffer_cr,
    3344             :             inputCbOriginIndex,
    3345     4713500 :             input_picture_ptr->stride_cr,
    3346     4713500 :             candidate_buffer->cfl_temp_prediction_ptr->buffer_cr,
    3347             :             cuChromaOriginIndex,
    3348     4713500 :             candidate_buffer->cfl_temp_prediction_ptr->stride_cr,
    3349     4713500 :             (int16_t*)candidate_buffer->residual_ptr->buffer_cr,
    3350             :             cuChromaOriginIndex,
    3351     4713500 :             candidate_buffer->residual_ptr->stride_cr,
    3352     4713500 :             context_ptr->hbd_mode_decision,
    3353             :             chroma_width,
    3354             :             chroma_height);
    3355             : 
    3356     4713320 :         full_loop_r(
    3357             :             sb_ptr,
    3358             :             candidate_buffer,
    3359             :             context_ptr,
    3360             :             input_picture_ptr,
    3361             :             picture_control_set_ptr,
    3362             :             PICTURE_BUFFER_DESC_Cr_FLAG,
    3363             :             cb_qp,
    3364             :             cr_qp,
    3365             :             &(*count_non_zero_coeffs[1]),
    3366             :             &(*count_non_zero_coeffs[2]));
    3367     4713300 :         candidate_ptr->v_has_coeff = *count_non_zero_coeffs[2] ? EB_TRUE : EB_FALSE;
    3368             : 
    3369             :         // Create new function
    3370     4713300 :         cu_full_distortion_fast_tu_mode_r(
    3371             :             sb_ptr,
    3372             :             candidate_buffer,
    3373             :             context_ptr,
    3374             :             candidate_ptr,
    3375             :             picture_control_set_ptr,
    3376             :             input_picture_ptr,
    3377             :             cbFullDistortion,
    3378             :             crFullDistortion,
    3379             :             count_non_zero_coeffs,
    3380             :             COMPONENT_CHROMA_CR,
    3381             :             &cb_coeff_bits,
    3382             :             &cr_coeff_bits,
    3383             :             0);
    3384             : 
    3385     4713280 :         full_distortion[DIST_CALC_RESIDUAL] += crFullDistortion[DIST_CALC_RESIDUAL];
    3386     4713280 :         full_distortion[DIST_CALC_PREDICTION] += crFullDistortion[DIST_CALC_PREDICTION];
    3387     4713280 :         *coeffBits += cr_coeff_bits;
    3388             :     }
    3389     9264700 : }
    3390             : /*
                        * PLANE_SIGN_TO_JOINT_SIGN: combine two per-plane alpha signs into one
                        * joint-sign index.
                        *
                        * When plane == CFL_PRED_U the result is a * CFL_SIGNS + b - 1; for any
                        * other plane the roles of a and b are swapped. In the calls below, a is
                        * the sign of `plane` and b is the sign of the other plane.
                        *
                        * The arguments are not parenthesized in the expansion, so pass plain
                        * identifiers or constants only.
                        */
    3391             : #define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
    3392             :   (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
    3393             : /*
                        * cfl_rd_pick_alpha: pick the best alpha pair for CfL mode, or choose DC.
                        *
                        * Pass 1: for each chroma plane, cost the plane once with
                        *         cfl_alpha_idx = 0 and store the RD cost for the joint signs in
                        *         which this plane has CFL_SIGN_ZERO.
                        * Pass 2: for each plane and each non-zero sign, walk the alpha
                        *         magnitudes c = 0 .. CFL_ALPHABET_SIZE - 1. Each (plane, sign, c)
                        *         is costed once with AV1CostCalcCfl and the result is reused for
                        *         every sign of the other plane; only alpha_rate differs.
                        *         best_rd / best_joint_sign track the cheapest complete (U, V)
                        *         combination, including mode_rd.
                        * Final : cost both chroma planes with the last AV1CostCalcCfl argument
                        *         set to 1 and compare against best_rd.
                        *
                        * Result: candidate_buffer->candidate_ptr->intra_chroma_mode is set to
                        * UV_DC_PRED or UV_CFL_PRED, and cfl_alpha_idx / cfl_alpha_signs are set
                        * to match. The same two fields are also overwritten during the search,
                        * where they act as inputs to AV1CostCalcCfl.
                        *
                        * NOTE(review): full_distortion is declared without an initializer and
                        * only its DIST_CALC_RESIDUAL entry is reset in this function. The cost
                        * routine defined just above this one accumulates into
                        * [DIST_CALC_PREDICTION] with +=; if that routine is the AV1CostCalcCfl
                        * called here, that entry is read while still indeterminate. Its value
                        * is never read in this function. Confirm and consider zero-initializing.
                        */
    3394      522466 : void cfl_rd_pick_alpha(
    3395             :     PictureControlSet     *picture_control_set_ptr,
    3396             :     ModeDecisionCandidateBuffer  *candidate_buffer,
    3397             :     LargestCodingUnit     *sb_ptr,
    3398             :     ModeDecisionContext   *context_ptr,
    3399             :     EbPictureBufferDesc   *input_picture_ptr,
    3400             :     uint32_t                   inputCbOriginIndex,
    3401             :     uint32_t                     cuChromaOriginIndex)
    3402             : {
    3403      522466 :     int64_t                  best_rd = INT64_MAX;
    3404             :     uint64_t                  full_distortion[DIST_CALC_TOTAL]; // only [DIST_CALC_RESIDUAL] is reset and read here
    3405             :     uint64_t                  coeffBits;
    3406             :     // RD cost of the UV_CFL_PRED mode rate alone (distortion argument is 0).
    3407      522466 :     const int64_t mode_rd =
    3408      522466 :         RDCOST(context_ptr->full_lambda,
    3409             :         (uint64_t)candidate_buffer->candidate_ptr->md_rate_estimation_ptr->intra_uv_mode_fac_bits[CFL_ALLOWED][candidate_buffer->candidate_ptr->intra_luma_mode][UV_CFL_PRED], 0);
    3410             :     // Per joint sign and plane: lowest RD cost seen so far, and the alpha magnitude that produced it.
    3411             :     int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
    3412             :     int32_t best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
    3413             :     // Pass 1: reset both tables for the plane, then cost the plane with alpha = 0.
    3414     1567350 :     for (int32_t plane = 0; plane < CFL_PRED_PLANES; plane++) {
    3415     1044900 :         coeffBits = 0;
    3416     1044900 :         full_distortion[DIST_CALC_RESIDUAL] = 0;
    3417     9403550 :         for (int32_t joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
    3418     8358660 :             best_rd_uv[joint_sign][plane] = INT64_MAX;
    3419     8358660 :             best_c[joint_sign][plane] = 0;
    3420             :         }
    3421             :         // Collect RD stats for an alpha value of zero in this plane.
    3422             :         // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid; i is the sign of the other plane.
    3423     3134660 :         for (int32_t i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) {
    3424     2089780 :             const int32_t joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i);
    3425     2089780 :             if (i == CFL_SIGN_NEG) { // cost the plane once; later iterations reuse coeffBits and full_distortion
    3426     1044870 :                 candidate_buffer->candidate_ptr->cfl_alpha_idx = 0;
    3427     1044870 :                 candidate_buffer->candidate_ptr->cfl_alpha_signs = joint_sign;
    3428             : 
    3429     1044870 :                 AV1CostCalcCfl(
    3430             :                     picture_control_set_ptr,
    3431             :                     candidate_buffer,
    3432             :                     sb_ptr,
    3433             :                     context_ptr,
    3434             :                     (plane == 0) ? COMPONENT_CHROMA_CB : COMPONENT_CHROMA_CR,
    3435             :                     input_picture_ptr,
    3436             :                     inputCbOriginIndex,
    3437             :                     cuChromaOriginIndex,
    3438             :                     full_distortion,
    3439             :                     &coeffBits,
    3440             :                     0);
    3441             : 
    3442     1044860 :                 if (coeffBits == INT64_MAX) break; // sentinel check on a uint64_t; nothing visible here produces this value - TODO confirm
    3443             :             }
    3444             : 
    3445     2089770 :             const int32_t alpha_rate = candidate_buffer->candidate_ptr->md_rate_estimation_ptr->cfl_alpha_fac_bits[joint_sign][plane][0];
    3446             : 
    3447     2089770 :             best_rd_uv[joint_sign][plane] =
    3448     2089770 :                 RDCOST(context_ptr->full_lambda, coeffBits + alpha_rate, full_distortion[DIST_CALC_RESIDUAL]);
    3449             :         }
    3450             :     }
    3451             :     // -1 until a complete (U, V) combination has been costed and accepted.
    3452      522454 :     int32_t best_joint_sign = -1;
    3453             :     // Pass 2: non-zero signs. progress grows by 2 for every magnitude that improved some joint sign.
    3454     1567560 :     for (int32_t plane = 0; plane < CFL_PRED_PLANES; plane++) {
    3455     3134560 :         for (int32_t pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) {
    3456     2089460 :             int32_t progress = 0;
    3457     9788830 :             for (int32_t c = 0; c < CFL_ALPHABET_SIZE; c++) {
    3458     9788440 :                 int32_t flag = 0;
    3459     9788440 :                 if (c > 2 && progress < c) break; // c = 0, 1, 2 always run; afterwards stop once progress falls behind c
    3460     7699150 :                 coeffBits = 0;
    3461     7699150 :                 full_distortion[DIST_CALC_RESIDUAL] = 0;
    3462    30804000 :                 for (int32_t i = 0; i < CFL_SIGNS; i++) {
    3463    23104600 :                     const int32_t joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i);
    3464    23104600 :                     if (i == 0) { // cost once per (plane, pn_sign, c); the same magnitude c is written to both halves of cfl_alpha_idx
    3465     7699390 :                         candidate_buffer->candidate_ptr->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
    3466     7699390 :                         candidate_buffer->candidate_ptr->cfl_alpha_signs = joint_sign;
    3467             : 
    3468     7699390 :                         AV1CostCalcCfl(
    3469             :                             picture_control_set_ptr,
    3470             :                             candidate_buffer,
    3471             :                             sb_ptr,
    3472             :                             context_ptr,
    3473             :                             (plane == 0) ? COMPONENT_CHROMA_CB : COMPONENT_CHROMA_CR,
    3474             :                             input_picture_ptr,
    3475             :                             inputCbOriginIndex,
    3476             :                             cuChromaOriginIndex,
    3477             :                             full_distortion,
    3478             :                             &coeffBits,
    3479             :                             0);
    3480             : 
    3481     7699620 :                         if (coeffBits == INT64_MAX) break; // same sentinel check as in pass 1
    3482             :                     }
    3483             : 
    3484    23104900 :                     const int32_t alpha_rate = candidate_buffer->candidate_ptr->md_rate_estimation_ptr->cfl_alpha_fac_bits[joint_sign][plane][c];
    3485             : 
    3486    23104900 :                     int64_t this_rd =
    3487    23104900 :                         RDCOST(context_ptr->full_lambda, coeffBits + alpha_rate, full_distortion[DIST_CALC_RESIDUAL]);
    3488    23104900 :                     if (this_rd >= best_rd_uv[joint_sign][plane]) continue;
    3489     7589330 :                     best_rd_uv[joint_sign][plane] = this_rd;
    3490     7589330 :                     best_c[joint_sign][plane] = c;
    3491             : 
    3492     7589330 :                     flag = 2;
    3493     7589330 :                     if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue; // the other plane has no cost for this joint sign yet
    3494     4903790 :                     this_rd += mode_rd + best_rd_uv[joint_sign][!plane];
    3495     4903790 :                     if (this_rd >= best_rd) continue;
    3496     1637620 :                     best_rd = this_rd;
    3497     1637620 :                     best_joint_sign = joint_sign;
    3498             :                 }
    3499     7699380 :                 progress += flag;
    3500             :             }
    3501             :         }
    3502             :     }
    3503             : 
    3504             :     // Compare with DC Chroma: cost both chroma planes with alpha index and signs set to 0.
    3505      522681 :     coeffBits = 0;
    3506      522681 :     full_distortion[DIST_CALC_RESIDUAL] = 0;
    3507             : 
    3508      522681 :     candidate_buffer->candidate_ptr->cfl_alpha_idx = 0;
    3509      522681 :     candidate_buffer->candidate_ptr->cfl_alpha_signs = 0;
    3510             : 
    3511      522681 :     const int64_t dc_mode_rd =
    3512      522681 :         RDCOST(context_ptr->full_lambda,
    3513             :             candidate_buffer->candidate_ptr->md_rate_estimation_ptr->intra_uv_mode_fac_bits[CFL_ALLOWED][candidate_buffer->candidate_ptr->intra_luma_mode][UV_DC_PRED], 0);
    3514             : 
    3515      522681 :     AV1CostCalcCfl(
    3516             :         picture_control_set_ptr,
    3517             :         candidate_buffer,
    3518             :         sb_ptr,
    3519             :         context_ptr,
    3520             :         COMPONENT_CHROMA,
    3521             :         input_picture_ptr,
    3522             :         inputCbOriginIndex,
    3523             :         cuChromaOriginIndex,
    3524             :         full_distortion,
    3525             :         &coeffBits,
    3526             :         1); // presumably the check_dc flag of the callee (its signature is not visible here) - TODO confirm
    3527             : 
    3528      522451 :     int64_t dc_rd =
    3529      522451 :         RDCOST(context_ptr->full_lambda, coeffBits, full_distortion[DIST_CALC_RESIDUAL]);
    3530             : 
    3531      522451 :     dc_rd += dc_mode_rd;
    3532      522451 :     if (dc_rd <= best_rd) { // a tie is resolved in favour of DC
    3533      233340 :         candidate_buffer->candidate_ptr->intra_chroma_mode = UV_DC_PRED;
    3534      233340 :         candidate_buffer->candidate_ptr->cfl_alpha_idx = 0;
    3535      233340 :         candidate_buffer->candidate_ptr->cfl_alpha_signs = 0;
    3536             :     }
    3537             :     else {
    3538      289111 :         candidate_buffer->candidate_ptr->intra_chroma_mode = UV_CFL_PRED;
    3539      289111 :         int32_t ind = 0;
    3540      289111 :         if (best_joint_sign >= 0) {
    3541      289126 :             const int32_t u = best_c[best_joint_sign][CFL_PRED_U];
    3542      289126 :             const int32_t v = best_c[best_joint_sign][CFL_PRED_V];
    3543      289126 :             ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
    3544             :         }
    3545             :         else // no joint sign was selected: keep alpha index 0 and use joint sign 0
    3546           0 :             best_joint_sign = 0;
    3547      289111 :         candidate_buffer->candidate_ptr->cfl_alpha_idx = ind;
    3548      289111 :         candidate_buffer->candidate_ptr->cfl_alpha_signs = best_joint_sign;
    3549             :     }
    3550      522451 : }
    3551             : 
    3552             : /*
                        * CflPrediction: run the CfL alpha search for one candidate and, if CfL
                        * is kept, rebuild the chroma prediction and residual with the chosen
                        * alphas.
                        *
                        * The whole body is guarded by context_ptr->blk_geom->has_uv; nothing
                        * happens for a block without chroma. On return
                        * candidate_ptr->intra_chroma_mode is either UV_CFL_PRED or UV_DC_PRED.
                        *
                        * If mode is CFL:
                        */
    3553             : // 1: recon the Luma
    3554             : // 2: Form the pred_buf_q3 (subsampled luma with its average subtracted)
    3555             : // 3: Loop over alphas and find the best or choose DC
    3556             : // 4: Recalculate the prediction and the residual for chroma
    3557      749219 : static void CflPrediction(
    3558             :     PictureControlSet     *picture_control_set_ptr,
    3559             :     ModeDecisionCandidateBuffer  *candidate_buffer,
    3560             :     LargestCodingUnit     *sb_ptr,
    3561             :     ModeDecisionContext   *context_ptr,
    3562             :     EbPictureBufferDesc   *input_picture_ptr,
    3563             :     uint32_t                   inputCbOriginIndex,
    3564             :     uint32_t                     cuChromaOriginIndex)
    3565             : {
    3566      749219 :     if (context_ptr->blk_geom->has_uv) {
    3567             :     // 1: recon the Luma
    3568      519774 :     AV1PerformInverseTransformReconLuma(
    3569             :         picture_control_set_ptr,
    3570             :         context_ptr,
    3571             :         candidate_buffer);
    3572             :     // Offset into the luma recon with the block origin rounded down to a multiple of 8 in x and y.
    3573      519774 :         uint32_t recLumaOffset = ((context_ptr->blk_geom->origin_y >> 3) << 3) * candidate_buffer->recon_ptr->stride_y +
    3574      519774 :             ((context_ptr->blk_geom->origin_x >> 3) << 3);
    3575             :     // 2: Form the pred_buf_q3
    3576      519774 :     uint32_t chroma_width = context_ptr->blk_geom->bwidth_uv;
    3577      519774 :     uint32_t chroma_height = context_ptr->blk_geom->bheight_uv;
    3578             : 
    3579             :     // Down sample Luma. When a chroma dimension equals the luma dimension, twice the chroma size is passed as the luma extent.
    3580      519774 :     if (!context_ptr->hbd_mode_decision) {
    3581     2598920 :         cfl_luma_subsampling_420_lbd_c(
    3582      519783 :             &(context_ptr->cfl_temp_luma_recon[recLumaOffset]),
    3583      519783 :             candidate_buffer->recon_ptr->stride_y,
    3584      519783 :             context_ptr->pred_buf_q3,
    3585      519783 :             context_ptr->blk_geom->bwidth_uv == context_ptr->blk_geom->bwidth ? (context_ptr->blk_geom->bwidth_uv << 1) : context_ptr->blk_geom->bwidth,
    3586      519783 :             context_ptr->blk_geom->bheight_uv == context_ptr->blk_geom->bheight ? (context_ptr->blk_geom->bheight_uv << 1) : context_ptr->blk_geom->bheight);
    3587             :     } else {
    3588           0 :         cfl_luma_subsampling_420_hbd_c(
    3589           0 :             context_ptr->cfl_temp_luma_recon16bit + recLumaOffset,
    3590           0 :             candidate_buffer->recon_ptr->stride_y,
    3591           0 :             context_ptr->pred_buf_q3,
    3592           0 :             context_ptr->blk_geom->bwidth_uv == context_ptr->blk_geom->bwidth ? (context_ptr->blk_geom->bwidth_uv << 1) : context_ptr->blk_geom->bwidth,
    3593           0 :             context_ptr->blk_geom->bheight_uv == context_ptr->blk_geom->bheight ? (context_ptr->blk_geom->bheight_uv << 1) : context_ptr->blk_geom->bheight);
    3594             :     }
    3595      519759 :     int32_t round_offset = chroma_width * chroma_height / 2; // half the chroma sample count
    3596             :     // Remove the average from pred_buf_q3; the last argument is log2(width) + log2(height).
    3597     1039520 :     eb_subtract_average(
    3598      519764 :         context_ptr->pred_buf_q3,
    3599             :         chroma_width,
    3600             :         chroma_height,
    3601             :         round_offset,
    3602      519759 :         LOG2F(chroma_width) + LOG2F(chroma_height));
    3603             : 
    3604             :     // 3: Loop over alphas and find the best or choose DC
    3605      519781 :     cfl_rd_pick_alpha(
    3606             :         picture_control_set_ptr,
    3607             :         candidate_buffer,
    3608             :         sb_ptr,
    3609             :         context_ptr,
    3610             :         input_picture_ptr,
    3611             :         inputCbOriginIndex,
    3612             :         cuChromaOriginIndex);
    3613             : 
    3614      519783 :     if (candidate_buffer->candidate_ptr->intra_chroma_mode == UV_CFL_PRED) {
    3615             :         // 4: Recalculate the prediction and the residual with the selected alphas
    3616             :         int32_t alpha_q3_cb =
    3617      288122 :             cfl_idx_to_alpha(candidate_buffer->candidate_ptr->cfl_alpha_idx, candidate_buffer->candidate_ptr->cfl_alpha_signs, CFL_PRED_U);
    3618             :         int32_t alpha_q3_cr =
    3619      288121 :             cfl_idx_to_alpha(candidate_buffer->candidate_ptr->cfl_alpha_idx, candidate_buffer->candidate_ptr->cfl_alpha_signs, CFL_PRED_V);
    3620             :         // NOTE(review): the cost routine earlier in this file asserts chroma_width * CFL_BUF_LINE + chroma_height; confirm which form is intended.
    3621      288119 :         assert(chroma_height * CFL_BUF_LINE + chroma_width <=
    3622             :             CFL_BUF_SQUARE);
    3623             :         // Source and destination are the same prediction buffer, so the prediction is updated in place.
    3624      288119 :         if (!context_ptr->hbd_mode_decision) {
    3625      288119 :             eb_cfl_predict_lbd(
    3626      288119 :                 context_ptr->pred_buf_q3,
    3627      288119 :                 &(candidate_buffer->prediction_ptr->buffer_cb[cuChromaOriginIndex]),
    3628      288119 :                 candidate_buffer->prediction_ptr->stride_cb,
    3629      288119 :                 &(candidate_buffer->prediction_ptr->buffer_cb[cuChromaOriginIndex]),
    3630      288119 :                 candidate_buffer->prediction_ptr->stride_cb,
    3631             :                 alpha_q3_cb,
    3632             :                 8,
    3633             :                 chroma_width,
    3634             :                 chroma_height);
    3635             : 
    3636      288127 :             eb_cfl_predict_lbd(
    3637      288127 :                 context_ptr->pred_buf_q3,
    3638      288127 :                 &(candidate_buffer->prediction_ptr->buffer_cr[cuChromaOriginIndex]),
    3639      288127 :                 candidate_buffer->prediction_ptr->stride_cr,
    3640      288127 :                 &(candidate_buffer->prediction_ptr->buffer_cr[cuChromaOriginIndex]),
    3641      288127 :                 candidate_buffer->prediction_ptr->stride_cr,
    3642             :                 alpha_q3_cr,
    3643             :                 8,
    3644             :                 chroma_width,
    3645             :                 chroma_height);
    3646             :         } else {
    3647           0 :             eb_cfl_predict_hbd(
    3648           0 :                 context_ptr->pred_buf_q3,
    3649           0 :                 ((uint16_t*)candidate_buffer->prediction_ptr->buffer_cb) + cuChromaOriginIndex,
    3650           0 :                 candidate_buffer->prediction_ptr->stride_cb,
    3651           0 :                 ((uint16_t*)candidate_buffer->prediction_ptr->buffer_cb) + cuChromaOriginIndex,
    3652           0 :                 candidate_buffer->prediction_ptr->stride_cb,
    3653             :                 alpha_q3_cb,
    3654             :                 10,
    3655             :                 chroma_width,
    3656             :                 chroma_height);
    3657             : 
    3658           0 :             eb_cfl_predict_hbd(
    3659           0 :                 context_ptr->pred_buf_q3,
    3660           0 :                 ((uint16_t*)candidate_buffer->prediction_ptr->buffer_cr) + cuChromaOriginIndex,
    3661           0 :                 candidate_buffer->prediction_ptr->stride_cr,
    3662           0 :                 ((uint16_t*)candidate_buffer->prediction_ptr->buffer_cr) + cuChromaOriginIndex,
    3663           0 :                 candidate_buffer->prediction_ptr->stride_cr,
    3664             :                 alpha_q3_cr,
    3665             :                 10,
    3666             :                 chroma_width,
    3667             :                 chroma_height);
    3668             :         }
    3669             : 
    3670             :         // Cb Residual: input minus the updated prediction, written to residual_ptr
    3671      288121 :         residual_kernel(
    3672             :             input_picture_ptr->buffer_cb,
    3673             :             inputCbOriginIndex,
    3674      288121 :             input_picture_ptr->stride_cb,
    3675      288121 :             candidate_buffer->prediction_ptr->buffer_cb,
    3676             :             cuChromaOriginIndex,
    3677      288121 :             candidate_buffer->prediction_ptr->stride_cb,
    3678      288121 :             (int16_t*)candidate_buffer->residual_ptr->buffer_cb,
    3679             :             cuChromaOriginIndex,
    3680      288121 :             candidate_buffer->residual_ptr->stride_cb,
    3681      288121 :             context_ptr->hbd_mode_decision,
    3682      288121 :             context_ptr->blk_geom->bwidth_uv,
    3683      288121 :             context_ptr->blk_geom->bheight_uv);
    3684             : 
    3685             :         // Cr Residual (the Cr input is indexed with inputCbOriginIndex as well)
    3686      288124 :         residual_kernel(
    3687             :             input_picture_ptr->buffer_cr,
    3688             :             inputCbOriginIndex,
    3689      288124 :             input_picture_ptr->stride_cr,
    3690      288124 :             candidate_buffer->prediction_ptr->buffer_cr,
    3691             :             cuChromaOriginIndex,
    3692      288124 :             candidate_buffer->prediction_ptr->stride_cr,
    3693      288124 :             (int16_t*)candidate_buffer->residual_ptr->buffer_cr,
    3694             :             cuChromaOriginIndex,
    3695      288124 :             candidate_buffer->residual_ptr->stride_cr,
    3696      288124 :             context_ptr->hbd_mode_decision,
    3697      288124 :             context_ptr->blk_geom->bwidth_uv,
    3698      288124 :             context_ptr->blk_geom->bheight_uv);
    3699             :     }
    3700             :     else {
    3701             :         // Alphas = 0, Preds are the same as DC. Switch to DC mode (cfl_rd_pick_alpha already stores UV_DC_PRED when DC wins, so this repeats that value)
    3702      231661 :         candidate_buffer->candidate_ptr->intra_chroma_mode = UV_DC_PRED;
    3703             :     }
    3704             :     }
    3705      749234 : }
    3706     3024800 : uint8_t get_skip_tx_search_flag( // returns 1 when the tx search should be skipped, 0 otherwise
    3707             :     int32_t                  sq_size,
    3708             :     uint64_t                 ref_fast_cost,
    3709             :     uint64_t                 cu_cost,
    3710             :     uint64_t                 weight)
    3711             : {
    3712             :     //NM: Skip tx search when the fast cost of the current mode candidate (cu_cost) is at least
    3713             :     // weight percent of the best fast cost (ref_fast_cost), i.e. cu_cost >= ref_fast_cost * weight / 100.
    3714     3024800 :     uint8_t  tx_search_skip_flag = cu_cost >= ((ref_fast_cost * weight) / 100) ? 1 : 0; // NOTE(review): the product is computed in uint64_t and wraps for very large costs
    3715     3024800 :     tx_search_skip_flag = sq_size >= 128 ? 1 : tx_search_skip_flag; // sq_size of 128 or more always skips
    3716     3024800 :     return tx_search_skip_flag;
    3717             : }
    3718             : /*
                        * av1_get_tx_type: choose the transform type for a block.
                        *
                        * As written, only the intra, non-luma path computes anything: it calls
                        * intra_mode_to_tx_type() on a local MbModeInfo that carries just
                        * pred_mode and pred_mode_uv, always with PLANE_TYPE_UV. The luma branch
                        * and the inter branch contain commented-out code only, so tx_type stays
                        * DCT_DCT there.
                        *
                        * The result falls back to DCT_DCT when av1_ext_tx_used[][] does not
                        * allow the chosen type for the transform set derived from tx_size,
                        * is_inter and reduced_tx_set.
                        *
                        * sb_type, xd, blk_row and blk_col are not used.
                        */
    3719   236348000 : static INLINE TxType av1_get_tx_type(
    3720             :     BlockSize  sb_type,
    3721             :     int32_t   is_inter,
    3722             :     PredictionMode pred_mode,
    3723             :     UvPredictionMode pred_mode_uv,
    3724             :     PlaneType plane_type,
    3725             :     const MacroBlockD *xd, int32_t blk_row,
    3726             :     int32_t blk_col, TxSize tx_size,
    3727             :     int32_t reduced_tx_set)
    3728             : {
    3729             :     UNUSED(sb_type);
    3730             :     UNUSED(*xd); // NOTE(review): a caller later in this file passes 0 for xd; confirm UNUSED() never evaluates its argument
    3731             :     UNUSED(blk_row);
    3732             :     UNUSED(blk_col);
    3733             : 
    3734             :     // block_size  sb_type = BLOCK_8X8;
    3735             :     // Local mode info: only block_mi.mode and block_mi.uv_mode are assigned; every other field stays uninitialized.
    3736             :     MbModeInfo  mbmi;
    3737   236348000 :     mbmi.block_mi.mode = pred_mode;
    3738   236348000 :     mbmi.block_mi.uv_mode = pred_mode_uv;
    3739             : 
    3740             :     // const MbModeInfo *const mbmi = xd->mi[0];
    3741             :     // const struct MacroblockdPlane *const pd = &xd->plane[plane_type];
    3742             :     const TxSetType tx_set_type =
    3743   236348000 :         /*av1_*/get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set);
    3744             : 
    3745   236550000 :     TxType tx_type = DCT_DCT;
    3746   236550000 :     if ( /*xd->lossless[mbmi->segment_id] ||*/ txsize_sqr_up_map[tx_size] > TX_32X32)
    3747           0 :         tx_type = DCT_DCT; // no effect: tx_type already holds DCT_DCT
    3748             :     else {
    3749   236550000 :         if (plane_type == PLANE_TYPE_Y) {
    3750             :             //const int32_t txk_type_idx =
    3751             :             //    av1_get_txk_type_index(/*mbmi->*/sb_type, blk_row, blk_col);
    3752             :             //tx_type = mbmi->txk_type[txk_type_idx];
    3753             :         }
    3754   236584000 :         else if (is_inter /*is_inter_block(mbmi)*/) {
    3755             :             // scale back to y plane's coordinate
    3756             :             //blk_row <<= pd->subsampling_y;
    3757             :             //blk_col <<= pd->subsampling_x;
    3758             :             //const int32_t txk_type_idx =
    3759             :             //    av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
    3760             :             //tx_type = mbmi->txk_type[txk_type_idx];
    3761             :         }
    3762             :         else {
    3763             :             // In intra mode, uv planes don't share the same prediction mode as y
    3764             :             // plane, so the tx_type should not be shared
    3765   236598000 :             tx_type = intra_mode_to_tx_type(&mbmi.block_mi, PLANE_TYPE_UV);
    3766             :         }
    3767             :     }
    3768             :     ASSERT(tx_type < TX_TYPES);
    3769   236779000 :     if (!av1_ext_tx_used[tx_set_type][tx_type]) return DCT_DCT; // type not allowed in this transform set
    3770   224583000 :     return tx_type;
    3771             : }
    3772             : 
    3773      522356 : void check_best_indepedant_cfl(
    3774             :     PictureControlSet           *picture_control_set_ptr,
    3775             :     EbPictureBufferDesc         *input_picture_ptr,
    3776             :     ModeDecisionContext         *context_ptr,
    3777             :     uint32_t                       inputCbOriginIndex,
    3778             :     uint32_t                       cuChromaOriginIndex,
    3779             :     ModeDecisionCandidateBuffer *candidate_buffer,
    3780             :     uint8_t                        cb_qp,
    3781             :     uint8_t                        cr_qp,
    3782             :     uint64_t                      *cbFullDistortion,
    3783             :     uint64_t                      *crFullDistortion,
    3784             :     uint64_t                      *cb_coeff_bits,
    3785             :     uint64_t                      *cr_coeff_bits)
    3786             : {
    3787             : 
    3788             : #if FILTER_INTRA_FLAG
    3789      522356 :     if (candidate_buffer->candidate_ptr->filter_intra_mode != FILTER_INTRA_MODES)
    3790      175796 :         assert(candidate_buffer->candidate_ptr->intra_luma_mode == DC_PRED);
    3791             : #endif
    3792      522356 :     FrameHeader *frm_hdr = &picture_control_set_ptr->parent_pcs_ptr->frm_hdr;
    3793             :     // cfl cost
    3794      522356 :     uint64_t chromaRate = 0;
    3795      522356 :     if (candidate_buffer->candidate_ptr->intra_chroma_mode == UV_CFL_PRED) {
    3796      287185 :         chromaRate += candidate_buffer->candidate_ptr->md_rate_estimation_ptr->cfl_alpha_fac_bits[candidate_buffer->candidate_ptr->cfl_alpha_signs][CFL_PRED_U][CFL_IDX_U(candidate_buffer->candidate_ptr->cfl_alpha_idx)] +
    3797      287185 :             candidate_buffer->candidate_ptr->md_rate_estimation_ptr->cfl_alpha_fac_bits[candidate_buffer->candidate_ptr->cfl_alpha_signs][CFL_PRED_V][CFL_IDX_V(candidate_buffer->candidate_ptr->cfl_alpha_idx)];
    3798             : 
    3799      287185 :         chromaRate += (uint64_t)candidate_buffer->candidate_ptr->md_rate_estimation_ptr->intra_uv_mode_fac_bits[CFL_ALLOWED][candidate_buffer->candidate_ptr->intra_luma_mode][UV_CFL_PRED];
    3800      287185 :         chromaRate -= (uint64_t)candidate_buffer->candidate_ptr->md_rate_estimation_ptr->intra_uv_mode_fac_bits[CFL_ALLOWED][candidate_buffer->candidate_ptr->intra_luma_mode][UV_DC_PRED];
    3801             :     }
    3802             :     else
    3803      235171 :         chromaRate = (uint64_t)candidate_buffer->candidate_ptr->md_rate_estimation_ptr->intra_uv_mode_fac_bits[CFL_ALLOWED][candidate_buffer->candidate_ptr->intra_luma_mode][UV_DC_PRED];
    3804      522356 :     int coeff_rate = (int)(*cb_coeff_bits + *cr_coeff_bits);
    3805      522356 :     int distortion = (int)(cbFullDistortion[DIST_CALC_RESIDUAL] + crFullDistortion[DIST_CALC_RESIDUAL]);
    3806      522356 :     int rate = (int)(coeff_rate + chromaRate + candidate_buffer->candidate_ptr->fast_luma_rate);
    3807      522356 :     uint64_t cfl_uv_cost = RDCOST(context_ptr->full_lambda, rate, distortion);
    3808             : 
    3809             :     // cfl vs. best independant
    3810      522356 :     if (context_ptr->best_uv_cost[candidate_buffer->candidate_ptr->intra_luma_mode][3 + candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_Y]] < cfl_uv_cost) {
    3811             :         // Update the current candidate
    3812      197813 :         candidate_buffer->candidate_ptr->intra_chroma_mode = context_ptr->best_uv_mode[candidate_buffer->candidate_ptr->intra_luma_mode][MAX_ANGLE_DELTA + candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_Y]];
    3813      197813 :         candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_UV] = context_ptr->best_uv_angle[candidate_buffer->candidate_ptr->intra_luma_mode][MAX_ANGLE_DELTA + candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_Y]];
    3814      197813 :         candidate_buffer->candidate_ptr->is_directional_chroma_mode_flag = (uint8_t)av1_is_directional_mode((PredictionMode)(context_ptr->best_uv_mode[candidate_buffer->candidate_ptr->intra_luma_mode][MAX_ANGLE_DELTA + candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_Y]]));
    3815             : 
    3816             :         // check if candidate_buffer->candidate_ptr->fast_luma_rate = context_ptr->fast_luma_rate[candidate_buffer->candidate_ptr->intra_luma_mode];
    3817      197813 :         candidate_buffer->candidate_ptr->fast_chroma_rate = context_ptr->fast_chroma_rate[candidate_buffer->candidate_ptr->intra_luma_mode][MAX_ANGLE_DELTA + candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_Y]];
    3818             : 
    3819      395625 :         candidate_buffer->candidate_ptr->transform_type_uv =
    3820      197813 :             av1_get_tx_type(
    3821      197813 :                 context_ptr->blk_geom->bsize,
    3822             :                 0,
    3823             :                 (PredictionMode)NULL,
    3824      197813 :                 (UvPredictionMode)context_ptr->best_uv_mode[candidate_buffer->candidate_ptr->intra_luma_mode][3 + candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_Y]],
    3825             :                 PLANE_TYPE_UV,
    3826             :                 0,
    3827             :                 0,
    3828             :                 0,
    3829      197813 :                 context_ptr->blk_geom->txsize_uv[0][0],
    3830      197813 :                 frm_hdr->reduced_tx_set);
    3831             : 
    3832             :         // Start uv search path
    3833      197812 :         context_ptr->uv_search_path = EB_TRUE;
    3834             : 
    3835      197812 :         memset(candidate_buffer->candidate_ptr->eob[1], 0, sizeof(uint16_t));
    3836      197812 :         memset(candidate_buffer->candidate_ptr->eob[2], 0, sizeof(uint16_t));
    3837      197812 :         candidate_buffer->candidate_ptr->u_has_coeff = 0;
    3838      197812 :         candidate_buffer->candidate_ptr->v_has_coeff = 0;
    3839      197812 :         cbFullDistortion[DIST_CALC_RESIDUAL] = 0;
    3840      197812 :         crFullDistortion[DIST_CALC_RESIDUAL] = 0;
    3841      197812 :         cbFullDistortion[DIST_CALC_PREDICTION] = 0;
    3842      197812 :         crFullDistortion[DIST_CALC_PREDICTION] = 0;
    3843             : 
    3844      197812 :         *cb_coeff_bits = 0;
    3845      197812 :         *cr_coeff_bits = 0;
    3846             : 
    3847             :         uint32_t count_non_zero_coeffs[3][MAX_NUM_OF_TU_PER_CU];
    3848      197812 :         context_ptr->md_staging_skip_inter_chroma_pred = EB_FALSE;
    3849      197812 :         ProductPredictionFunTable[candidate_buffer->candidate_ptr->type](
    3850             :             context_ptr,
    3851             :             picture_control_set_ptr,
    3852             :             candidate_buffer);
    3853             : 
    3854             :         // Cb Residual
    3855      197812 :         residual_kernel(
    3856             :             input_picture_ptr->buffer_cb,
    3857             :             inputCbOriginIndex,
    3858      197812 :             input_picture_ptr->stride_cb,
    3859      197812 :             candidate_buffer->prediction_ptr->buffer_cb,
    3860             :             cuChromaOriginIndex,
    3861      197812 :             candidate_buffer->prediction_ptr->stride_cb,
    3862      197812 :             (int16_t*)candidate_buffer->residual_ptr->buffer_cb,
    3863             :             cuChromaOriginIndex,
    3864      197812 :             candidate_buffer->residual_ptr->stride_cb,
    3865      197812 :             context_ptr->hbd_mode_decision,
    3866      197812 :             context_ptr->blk_geom->bwidth_uv,
    3867      197812 :             context_ptr->blk_geom->bheight_uv);
    3868             : 
    3869             :         // Cr Residual
    3870      197812 :         residual_kernel(
    3871             :             input_picture_ptr->buffer_cr,
    3872             :             inputCbOriginIndex,
    3873      197812 :             input_picture_ptr->stride_cr,
    3874      197812 :             candidate_buffer->prediction_ptr->buffer_cr,
    3875             :             cuChromaOriginIndex,
    3876      197812 :             candidate_buffer->prediction_ptr->stride_cr,
    3877      197812 :             (int16_t*)candidate_buffer->residual_ptr->buffer_cr,
    3878             :             cuChromaOriginIndex,
    3879      197812 :             candidate_buffer->residual_ptr->stride_cr,
    3880      197812 :             context_ptr->hbd_mode_decision,
    3881      197812 :             context_ptr->blk_geom->bwidth_uv,
    3882      197812 :             context_ptr->blk_geom->bheight_uv);
    3883             : 
    3884      197811 :         full_loop_r(
    3885             :             context_ptr->sb_ptr,
    3886             :             candidate_buffer,
    3887             :             context_ptr,
    3888             :             input_picture_ptr,
    3889             :             picture_control_set_ptr,
    3890             :             PICTURE_BUFFER_DESC_CHROMA_MASK,
    3891             :             cb_qp,
    3892             :             cr_qp,
    3893             :             &(*count_non_zero_coeffs[1]),
    3894             :             &(*count_non_zero_coeffs[2]));
    3895             : 
    3896      197811 :         cu_full_distortion_fast_tu_mode_r(
    3897             :             context_ptr->sb_ptr,
    3898             :             candidate_buffer,
    3899             :             context_ptr,
    3900             :             candidate_buffer->candidate_ptr,
    3901             :             picture_control_set_ptr,
    3902             :             input_picture_ptr,
    3903             :             cbFullDistortion,
    3904             :             crFullDistortion,
    3905             :             count_non_zero_coeffs,
    3906             :             COMPONENT_CHROMA,
    3907             :             cb_coeff_bits,
    3908             :             cr_coeff_bits,
    3909             :             1);
    3910             : 
    3911             :         // End uv search path
    3912      197813 :         context_ptr->uv_search_path = EB_FALSE;
    3913             :     }
    3914      522356 : }
    3915             : 
    3916             :             // double check the usage of tx_search_luma_recon_neighbor_array16bit
    3917     1899240 : EbErrorType av1_intra_luma_prediction(
    3918             :     ModeDecisionContext         *md_context_ptr,
    3919             :     PictureControlSet           *picture_control_set_ptr,
    3920             :     ModeDecisionCandidateBuffer *candidate_buffer_ptr)
    3921             : {
    3922     1899240 :     EbErrorType return_error = EB_ErrorNone;
    3923             : 
    3924     1899240 :     uint16_t txb_origin_x = md_context_ptr->cu_origin_x + md_context_ptr->blk_geom->tx_boff_x[md_context_ptr->tx_depth][md_context_ptr->txb_itr];
    3925     1899240 :     uint16_t txb_origin_y = md_context_ptr->cu_origin_y + md_context_ptr->blk_geom->tx_boff_y[md_context_ptr->tx_depth][md_context_ptr->txb_itr];
    3926             : 
    3927     1899240 :     uint8_t  tx_width = md_context_ptr->blk_geom->tx_width[md_context_ptr->tx_depth][md_context_ptr->txb_itr];
    3928     1899240 :     uint8_t  tx_height = md_context_ptr->blk_geom->tx_height[md_context_ptr->tx_depth][md_context_ptr->txb_itr];
    3929             : 
    3930     1899240 :     uint32_t modeTypeLeftNeighborIndex = get_neighbor_array_unit_left_index(
    3931             :         md_context_ptr->mode_type_neighbor_array,
    3932             :         txb_origin_y);
    3933     1899250 :     uint32_t modeTypeTopNeighborIndex = get_neighbor_array_unit_top_index(
    3934             :         md_context_ptr->mode_type_neighbor_array,
    3935             :         txb_origin_x);
    3936     1899260 :     uint32_t intraLumaModeLeftNeighborIndex = get_neighbor_array_unit_left_index(
    3937             :         md_context_ptr->intra_luma_mode_neighbor_array,
    3938             :         txb_origin_y);
    3939     1899240 :     uint32_t intraLumaModeTopNeighborIndex = get_neighbor_array_unit_top_index(
    3940             :         md_context_ptr->intra_luma_mode_neighbor_array,
    3941             :         txb_origin_x);
    3942             : 
    3943      326767 :     md_context_ptr->intra_luma_left_mode = (uint32_t)(
    3944     1899260 :         (md_context_ptr->mode_type_neighbor_array->left_array[modeTypeLeftNeighborIndex] != INTRA_MODE) ? DC_PRED/*EB_INTRA_DC*/ :
    3945     1572490 :         (uint32_t)md_context_ptr->intra_luma_mode_neighbor_array->left_array[intraLumaModeLeftNeighborIndex]);
    3946             : 
    3947      344277 :     md_context_ptr->intra_luma_top_mode = (uint32_t)(
    3948     1899260 :         (md_context_ptr->mode_type_neighbor_array->top_array[modeTypeTopNeighborIndex] != INTRA_MODE) ? DC_PRED/*EB_INTRA_DC*/ :
    3949     1554980 :         (uint32_t)md_context_ptr->intra_luma_mode_neighbor_array->top_array[intraLumaModeTopNeighborIndex]);       //   use DC. This seems like we could use a LCU-width
    3950             : 
    3951     1899260 :     TxSize  tx_size = md_context_ptr->blk_geom->txsize[md_context_ptr->tx_depth][md_context_ptr->txb_itr];
    3952             : 
    3953             :     PredictionMode mode;
    3954     1899260 :     if (!md_context_ptr->hbd_mode_decision) {
    3955             :         uint8_t topNeighArray[64 * 2 + 1];
    3956             :         uint8_t leftNeighArray[64 * 2 + 1];
    3957             : 
    3958     1899240 :         if (txb_origin_y != 0)
    3959     1826950 :             memcpy(topNeighArray + 1, md_context_ptr->tx_search_luma_recon_neighbor_array->top_array + txb_origin_x, tx_width * 2);
    3960     1899240 :         if (txb_origin_x != 0)
    3961     1828220 :             memcpy(leftNeighArray + 1, md_context_ptr->tx_search_luma_recon_neighbor_array->left_array + txb_origin_y, tx_height * 2);
    3962     1899240 :         if (txb_origin_y != 0 && txb_origin_x != 0)
    3963     1757970 :             topNeighArray[0] = leftNeighArray[0] = md_context_ptr->tx_search_luma_recon_neighbor_array->top_left_array[MAX_PICTURE_HEIGHT_SIZE + txb_origin_x - txb_origin_y];
    3964             : 
    3965     1899240 :         mode = candidate_buffer_ptr->candidate_ptr->pred_mode;
    3966     1899240 :         eb_av1_predict_intra_block(
    3967     1899240 :             &md_context_ptr->sb_ptr->tile_info,
    3968             :             !ED_STAGE,
    3969             :             md_context_ptr->blk_geom,
    3970     1899240 :             picture_control_set_ptr->parent_pcs_ptr->av1_cm,                                      //const Av1Common *cm,
    3971     1899240 :             md_context_ptr->blk_geom->bwidth,
    3972     1899240 :             md_context_ptr->blk_geom->bheight,
    3973             :             tx_size,
    3974             :             mode,                                                                           //PredictionMode mode,
    3975     1899240 :             candidate_buffer_ptr->candidate_ptr->angle_delta[PLANE_TYPE_Y],
    3976             : #if PAL_SUP
    3977     1899240 :             candidate_buffer_ptr->candidate_ptr->palette_info.pmi.palette_size[0]>0,
    3978     1899240 :             &candidate_buffer_ptr->candidate_ptr->palette_info ,    //ATB MD
    3979             : #else
    3980             :             0,                                                                              //int32_t use_palette,
    3981             : #endif
    3982             : #if FILTER_INTRA_FLAG
    3983     1899240 :             candidate_buffer_ptr->candidate_ptr->filter_intra_mode,
    3984             : #else
    3985             :             FILTER_INTRA_MODES,                                                             //CHKN FilterIntraMode filter_intra_mode,
    3986             : #endif
    3987             :             topNeighArray + 1,
    3988             :             leftNeighArray + 1,
    3989             :             candidate_buffer_ptr->prediction_ptr,                                              //uint8_t *dst,
    3990     1899240 :             md_context_ptr->blk_geom->tx_boff_x[md_context_ptr->tx_depth][md_context_ptr->txb_itr] >> 2, //int32_t col_off,
    3991     1899240 :             md_context_ptr->blk_geom->tx_boff_y[md_context_ptr->tx_depth][md_context_ptr->txb_itr] >> 2,                                                                              //int32_t row_off,
    3992             :             PLANE_TYPE_Y,                                                                          //int32_t plane,
    3993     1899240 :             md_context_ptr->blk_geom->bsize,
    3994     1899240 :             md_context_ptr->cu_origin_x,
    3995     1899240 :             md_context_ptr->cu_origin_y,
    3996     1899240 :             md_context_ptr->cu_origin_x,
    3997     1899240 :             md_context_ptr->cu_origin_y,
    3998     1899240 :             md_context_ptr->blk_geom->tx_org_x[md_context_ptr->tx_depth][md_context_ptr->txb_itr],  //uint32_t cuOrgX used only for prediction Ptr
    3999     1899240 :             md_context_ptr->blk_geom->tx_org_y[md_context_ptr->tx_depth][md_context_ptr->txb_itr]   //uint32_t cuOrgY used only for prediction Ptr
    4000             :         );
    4001             :     } else {
    4002             :         uint16_t topNeighArray[64 * 2 + 1];
    4003             :         uint16_t leftNeighArray[64 * 2 + 1];
    4004             : 
    4005          12 :         if (txb_origin_y != 0)
    4006           0 :             memcpy(topNeighArray + 1, (uint16_t*)(md_context_ptr->tx_search_luma_recon_neighbor_array16bit->top_array) + txb_origin_x, sizeof(uint16_t) * tx_width * 2);
    4007          12 :         if (txb_origin_x != 0)
    4008           0 :             memcpy(leftNeighArray + 1, (uint16_t*)(md_context_ptr->tx_search_luma_recon_neighbor_array16bit->left_array) + txb_origin_y, sizeof(uint16_t) * tx_height * 2);
    4009          12 :         if (txb_origin_y != 0 && txb_origin_x != 0)
    4010           0 :             topNeighArray[0] = leftNeighArray[0] = ((uint16_t*)(md_context_ptr->tx_search_luma_recon_neighbor_array16bit->top_left_array) + MAX_PICTURE_HEIGHT_SIZE + txb_origin_x - txb_origin_y)[0];
    4011             : 
    4012          12 :         mode = candidate_buffer_ptr->candidate_ptr->pred_mode;
    4013          12 :         eb_av1_predict_intra_block_16bit(
    4014          12 :             &md_context_ptr->sb_ptr->tile_info,
    4015             :             !ED_STAGE,
    4016             :             md_context_ptr->blk_geom,
    4017          12 :             picture_control_set_ptr->parent_pcs_ptr->av1_cm,
    4018          12 :             md_context_ptr->blk_geom->bwidth,
    4019          12 :             md_context_ptr->blk_geom->bheight,
    4020             :             tx_size,
    4021             :             mode,
    4022          12 :             candidate_buffer_ptr->candidate_ptr->angle_delta[PLANE_TYPE_Y],
    4023             : #if PAL_SUP
    4024          12 :             candidate_buffer_ptr->candidate_ptr->palette_info.pmi.palette_size[0] > 0,
    4025          12 :             &candidate_buffer_ptr->candidate_ptr->palette_info,    //ATB MD
    4026             : #else
    4027             :             0,
    4028             : #endif
    4029             :             FILTER_INTRA_MODES,
    4030             :             topNeighArray + 1,
    4031             :             leftNeighArray + 1,
    4032             :             candidate_buffer_ptr->prediction_ptr,
    4033          12 :             md_context_ptr->blk_geom->tx_boff_x[md_context_ptr->tx_depth][md_context_ptr->txb_itr] >> 2,
    4034          12 :             md_context_ptr->blk_geom->tx_boff_y[md_context_ptr->tx_depth][md_context_ptr->txb_itr] >> 2,                                                                              //int32_t row_off,
    4035             :             PLANE_TYPE_Y,
    4036          12 :             md_context_ptr->blk_geom->bsize,
    4037          12 :             md_context_ptr->cu_origin_x,
    4038          12 :             md_context_ptr->cu_origin_y,
    4039          12 :             md_context_ptr->cu_origin_x,
    4040          12 :             md_context_ptr->cu_origin_y,
    4041          12 :             md_context_ptr->blk_geom->tx_org_x[md_context_ptr->tx_depth][md_context_ptr->txb_itr],  //uint32_t cuOrgX used only for prediction Ptr
    4042          12 :             md_context_ptr->blk_geom->tx_org_y[md_context_ptr->tx_depth][md_context_ptr->txb_itr]   //uint32_t cuOrgY used only for prediction Ptr
    4043             :         );
    4044             :     }
    4045             : 
    4046     1899440 :     return return_error;
    4047             : }
    4048             : 
    4049     1471100 : static void tx_search_update_recon_sample_neighbor_array(
    4050             :     NeighborArrayUnit     *lumaReconSampleNeighborArray,
    4051             :     EbPictureBufferDesc   *recon_buffer,
    4052             :     uint32_t               tu_origin_x,
    4053             :     uint32_t               tu_origin_y,
    4054             :     uint32_t               input_origin_x,
    4055             :     uint32_t               input_origin_y,
    4056             :     uint32_t               width,
    4057             :     uint32_t               height,
    4058             :     EbBool                 hbd)
    4059             : {
    4060     1471100 :     if (hbd) {
    4061           0 :         neighbor_array_unit16bit_sample_write(
    4062             :             lumaReconSampleNeighborArray,
    4063           0 :             (uint16_t*)recon_buffer->buffer_y,
    4064           0 :             recon_buffer->stride_y,
    4065           0 :             recon_buffer->origin_x + tu_origin_x,
    4066           0 :             recon_buffer->origin_y + tu_origin_y,
    4067             :             input_origin_x,
    4068             :             input_origin_y,
    4069             :             width,
    4070             :             height,
    4071             :             NEIGHBOR_ARRAY_UNIT_FULL_MASK);
    4072             :     } else {
    4073     1471100 :         neighbor_array_unit_sample_write(
    4074             :             lumaReconSampleNeighborArray,
    4075             :             recon_buffer->buffer_y,
    4076     1471100 :             recon_buffer->stride_y,
    4077     1471100 :             recon_buffer->origin_x + tu_origin_x,
    4078     1471100 :             recon_buffer->origin_y + tu_origin_y,
    4079             :             input_origin_x,
    4080             :             input_origin_y,
    4081             :             width,
    4082             :             height,
    4083             :             NEIGHBOR_ARRAY_UNIT_FULL_MASK);
    4084             :     }
    4085             : 
    4086     1471070 :     return;
    4087             : }
    4088             : 
    4089    35501400 : uint8_t get_end_tx_depth(BlockSize bsize, uint8_t btype) {
    4090    35501400 :     uint8_t tx_depth = 0;
    4091    35501400 :     if (bsize == BLOCK_64X64 ||
    4092    34084600 :         bsize == BLOCK_32X32 ||
    4093    30808900 :         bsize == BLOCK_16X16 ||
    4094    30401400 :         bsize == BLOCK_64X32 ||
    4095    30021700 :         bsize == BLOCK_32X64 ||
    4096    28515900 :         bsize == BLOCK_16X32 ||
    4097    27030300 :         bsize == BLOCK_32X16 ||
    4098    23413500 :         bsize == BLOCK_16X8 ||
    4099             :         bsize == BLOCK_8X16)
    4100    16976900 :         tx_depth = (btype == INTRA_MODE) ? 1 : 1;
    4101    18524600 :     else if (bsize == BLOCK_8X8 ||
    4102    11314500 :         bsize == BLOCK_64X16 ||
    4103    10704700 :         bsize == BLOCK_16X64 ||
    4104     8800200 :         bsize == BLOCK_32X8 ||
    4105     6847650 :         bsize == BLOCK_8X32 ||
    4106     5089960 :         bsize == BLOCK_16X4 ||
    4107             :         bsize == BLOCK_4X16)
    4108    15157900 :         tx_depth = (btype == INTRA_MODE) ? 1 : 1;
    4109             : 
    4110    35501400 :     return tx_depth;
    4111             : }
    4112             : 
    4113             : #if ENHANCE_ATB
    4114             : uint8_t allowed_tx_set_a[TX_SIZES_ALL][TX_TYPES];
    4115             : 
    4116     1627310 : void tx_initialize_neighbor_arrays(
    4117             :     PictureControlSet            *picture_control_set_ptr,
    4118             :     ModeDecisionContext          *context_ptr,
    4119             :     EbBool                       is_inter) {
    4120             : 
    4121             :     // Set recon neighbor array to be used @ intra compensation
    4122     1627310 :     if (!is_inter)
    4123      856733 :         context_ptr->tx_search_luma_recon_neighbor_array =
    4124      856733 :         (context_ptr->tx_depth) ?
    4125      856733 :         picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX] :
    4126             :         picture_control_set_ptr->md_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    4127             : 
    4128             :     // Set luma dc sign level coeff
    4129     1627310 :     context_ptr->full_loop_luma_dc_sign_level_coeff_neighbor_array =
    4130     1627310 :         (context_ptr->tx_depth == 1) ?
    4131     1627310 :             picture_control_set_ptr->md_tx_depth_1_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX] :
    4132             :             picture_control_set_ptr->md_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    4133     1627310 : }
    4134             : 
    4135     3547460 : void tx_update_neighbor_arrays(
    4136             :     PictureControlSet            *picture_control_set_ptr,
    4137             :     ModeDecisionContext          *context_ptr,
    4138             :     ModeDecisionCandidateBuffer  *candidate_buffer,
    4139             :     EbBool                        is_inter) {
    4140             : 
    4141     3547460 :     if (context_ptr->tx_depth) {
    4142             : 
    4143     2733960 :         if (!is_inter)
    4144     1471120 :             tx_search_update_recon_sample_neighbor_array(
    4145             :                 context_ptr->tx_search_luma_recon_neighbor_array,
    4146             :                 candidate_buffer->recon_ptr,
    4147     1471120 :                 context_ptr->blk_geom->tx_org_x[context_ptr->tx_depth][context_ptr->txb_itr],
    4148     1471120 :                 context_ptr->blk_geom->tx_org_y[context_ptr->tx_depth][context_ptr->txb_itr],
    4149     1471120 :                 context_ptr->sb_origin_x + context_ptr->blk_geom->tx_org_x[context_ptr->tx_depth][context_ptr->txb_itr],
    4150     1471120 :                 context_ptr->sb_origin_y + context_ptr->blk_geom->tx_org_y[context_ptr->tx_depth][context_ptr->txb_itr],
    4151     1471120 :                 context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    4152     1471120 :                 context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    4153     1471120 :                 context_ptr->hbd_mode_decision);
    4154             : 
    4155     2733910 :         int8_t dc_sign_level_coeff = candidate_buffer->candidate_ptr->quantized_dc[0][context_ptr->txb_itr];
    4156     2733910 :         neighbor_array_unit_mode_write(
    4157             :             picture_control_set_ptr->md_tx_depth_1_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    4158             :             (uint8_t*)&dc_sign_level_coeff,
    4159     2733910 :             context_ptr->sb_origin_x + context_ptr->blk_geom->tx_org_x[context_ptr->tx_depth][context_ptr->txb_itr],
    4160     2733910 :             context_ptr->sb_origin_y + context_ptr->blk_geom->tx_org_y[context_ptr->tx_depth][context_ptr->txb_itr],
    4161     2733910 :             context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    4162     2733910 :             context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    4163             :             NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
    4164             :     }
    4165     3547270 : }
    4166             : 
    4167             : 
    4168      813703 : void tx_reset_neighbor_arrays(
    4169             :     PictureControlSet   *picture_control_set_ptr,
    4170             :     ModeDecisionContext *context_ptr,
    4171             :     EbBool               is_inter,
    4172             :     uint8_t              end_tx_depth) {
    4173             : 
    4174      813703 :     if (end_tx_depth) {
    4175      813703 :         if (!is_inter)
    4176      428399 :             copy_neigh_arr(
    4177             :                 picture_control_set_ptr->md_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    4178             :                 picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    4179      428399 :                 context_ptr->sb_origin_x + context_ptr->blk_geom->origin_x,
    4180      428399 :                 context_ptr->sb_origin_y + context_ptr->blk_geom->origin_y,
    4181      428399 :                 context_ptr->blk_geom->bwidth,
    4182      428399 :                 context_ptr->blk_geom->bheight,
    4183             :                 NEIGHBOR_ARRAY_UNIT_FULL_MASK);
    4184             : 
    4185      813690 :         copy_neigh_arr(
    4186             :             picture_control_set_ptr->md_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    4187             :             picture_control_set_ptr->md_tx_depth_1_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    4188      813690 :             context_ptr->sb_origin_x + context_ptr->blk_geom->origin_x,
    4189      813690 :             context_ptr->sb_origin_y + context_ptr->blk_geom->origin_y,
    4190      813690 :             context_ptr->blk_geom->bwidth,
    4191      813690 :             context_ptr->blk_geom->bheight,
    4192             :             NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
    4193             :     }
    4194      813690 : }
    4195             : 
    4196     2647620 : void tx_type_search(
    4197             :     SequenceControlSet           *sequence_control_set_ptr,
    4198             :     PictureControlSet            *picture_control_set_ptr,
    4199             :     ModeDecisionContext          *context_ptr,
    4200             :     ModeDecisionCandidateBuffer  *candidate_buffer,
    4201             :     uint32_t                      qp)
    4202             : {
    4203     2647620 :     EbPictureBufferDesc *input_picture_ptr = picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
    4204     5295240 :     int32_t seg_qp = picture_control_set_ptr->parent_pcs_ptr->frm_hdr.segmentation_params.segmentation_enabled ?
    4205     2647620 :         picture_control_set_ptr->parent_pcs_ptr->frm_hdr.segmentation_params.feature_data[context_ptr->cu_ptr->segment_id][SEG_LVL_ALT_Q] : 0;
    4206             : 
    4207     2647620 :     TxType txk_start = DCT_DCT;
    4208     2647620 :     TxType txk_end = TX_TYPES;
    4209     2647620 :     uint64_t best_cost_tx_search = (uint64_t)~0;
    4210             :     int32_t tx_type;
    4211     2647620 :     TxSize txSize = context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr];
    4212     2647620 :     int32_t is_inter = (candidate_buffer->candidate_ptr->type == INTER_MODE || candidate_buffer->candidate_ptr->use_intrabc) ? EB_TRUE : EB_FALSE;
    4213     2647620 :     const TxSetType tx_set_type = get_ext_tx_set_type(txSize, is_inter, picture_control_set_ptr->parent_pcs_ptr->tx_search_reduced_set);
    4214     2647770 :     uint8_t txb_origin_x = (uint8_t)context_ptr->blk_geom->tx_org_x[context_ptr->tx_depth][context_ptr->txb_itr];
    4215     2647770 :     uint8_t txb_origin_y = (uint8_t)context_ptr->blk_geom->tx_org_y[context_ptr->tx_depth][context_ptr->txb_itr];
    4216     2647770 :     uint32_t tu_origin_index = txb_origin_x + (txb_origin_y * candidate_buffer->residual_ptr->stride_y);
    4217     2647770 :     uint32_t input_tu_origin_index = (context_ptr->sb_origin_x + txb_origin_x + input_picture_ptr->origin_x) + ((context_ptr->sb_origin_y + txb_origin_y + input_picture_ptr->origin_y) * input_picture_ptr->stride_y);
    4218             : 
    4219             : 
    4220     2647770 :     context_ptr->luma_txb_skip_context = 0;
    4221     2647770 :     context_ptr->luma_dc_sign_context = 0;
    4222     2647770 :     get_txb_ctx(
    4223             :         sequence_control_set_ptr,
    4224             :         COMPONENT_LUMA,
    4225             :         context_ptr->full_loop_luma_dc_sign_level_coeff_neighbor_array,
    4226     2647770 :         context_ptr->sb_origin_x + txb_origin_x,
    4227     2647770 :         context_ptr->sb_origin_y + txb_origin_y,
    4228     2647770 :         context_ptr->blk_geom->bsize,
    4229     2647770 :         context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    4230             :         &context_ptr->luma_txb_skip_context,
    4231             :         &context_ptr->luma_dc_sign_context);
    4232             : 
    4233     2647680 :     if (picture_control_set_ptr->parent_pcs_ptr->tx_search_reduced_set == 2)
    4234           0 :         txk_end = 2;
    4235             : 
    4236     2647680 :     TxType best_tx_type = DCT_DCT;
    4237    44979500 :     for (tx_type = txk_start; tx_type < txk_end; ++tx_type) {
    4238             : 
    4239             :         uint64_t tuFullDistortion[3][DIST_CALC_TOTAL];
    4240    42319100 :         uint64_t y_tu_coeff_bits = 0;
    4241             :         uint32_t y_count_non_zero_coeffs;
    4242             : 
    4243             :         //context_ptr->three_quad_energy = 0;
    4244    42319100 :         if (tx_type != DCT_DCT) {
    4245    39650400 :             if (is_inter) {
    4246    11219900 :                 TxSize max_tx_size = context_ptr->blk_geom->txsize[0][0];
    4247    11219900 :                 const TxSetType tx_set_type = get_ext_tx_set_type(max_tx_size, is_inter, picture_control_set_ptr->parent_pcs_ptr->tx_search_reduced_set);
    4248    11219600 :                 int32_t eset = get_ext_tx_set(max_tx_size, is_inter, picture_control_set_ptr->parent_pcs_ptr->tx_search_reduced_set);
    4249             :                 // eset == 0 should correspond to a set with only DCT_DCT and there
    4250             :                 // is no need to send the tx_type
    4251    36557900 :                 if (eset <= 0) continue;
    4252    10321700 :                 else if (av1_ext_tx_used[tx_set_type][tx_type] == 0) continue;
    4253     7513480 :                 else if (context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr] > 32 || context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr] > 32)  continue;
    4254             :             }
    4255             : 
    4256    35944000 :             int32_t eset = get_ext_tx_set(context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr], is_inter, picture_control_set_ptr->parent_pcs_ptr->tx_search_reduced_set);
    4257             :             // eset == 0 should correspond to a set with only DCT_DCT and there
    4258             :             // is no need to send the tx_type
    4259    35945400 :             if (eset <= 0) continue;
    4260    34269600 :             else if (av1_ext_tx_used[tx_set_type][tx_type] == 0) continue;
    4261    17868300 :             else if (context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr] > 32 || context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr] > 32) continue;
    4262             :         }
    4263             : 
    4264    20537000 :         if (picture_control_set_ptr->parent_pcs_ptr->tx_search_reduced_set)
    4265           0 :             if (!allowed_tx_set_a[context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr]][tx_type]) continue;
    4266             : 
    4267             :         // For Inter blocks, transform type of chroma follows luma transform type
    4268    20537000 :         if (is_inter)
    4269     8261160 :             candidate_buffer->candidate_ptr->transform_type_uv = (context_ptr->txb_itr == 0) ? candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr] : candidate_buffer->candidate_ptr->transform_type_uv;
    4270             : 
    4271             :         // Y: T Q iQ
    4272    20537000 :         av1_estimate_transform(
    4273    20537000 :             &(((int16_t*)candidate_buffer->residual_ptr->buffer_y)[tu_origin_index]),
    4274    20537000 :             candidate_buffer->residual_ptr->stride_y,
    4275    20537000 :             &(((int32_t*)context_ptr->trans_quant_buffers_ptr->tu_trans_coeff2_nx2_n_ptr->buffer_y)[context_ptr->txb_1d_offset]),
    4276             :             NOT_USED_VALUE,
    4277    20537000 :             context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    4278             :             &context_ptr->three_quad_energy,
    4279             :             context_ptr->transform_inner_array_ptr,
    4280    20537000 :             context_ptr->hbd_mode_decision ? BIT_INCREMENT_10BIT : BIT_INCREMENT_8BIT,
    4281             :             tx_type,
    4282             :             PLANE_TYPE_Y,
    4283             :             DEFAULT_SHAPE);
    4284             : 
    4285    20550200 :         av1_quantize_inv_quantize(
    4286             :             picture_control_set_ptr,
    4287             :             context_ptr,
    4288    20550200 :             &(((int32_t*)context_ptr->trans_quant_buffers_ptr->tu_trans_coeff2_nx2_n_ptr->buffer_y)[context_ptr->txb_1d_offset]),
    4289             :             NOT_USED_VALUE,
    4290    20550200 :             &(((int32_t*)candidate_buffer->residual_quant_coeff_ptr->buffer_y)[context_ptr->txb_1d_offset]),
    4291    20550200 :             &(((int32_t*)candidate_buffer->recon_coeff_ptr->buffer_y)[context_ptr->txb_1d_offset]),
    4292             :             qp,
    4293             :             seg_qp,
    4294    20550200 :             context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    4295    20550200 :             context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    4296    20550200 :             context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    4297    20550200 :             &candidate_buffer->candidate_ptr->eob[0][context_ptr->txb_itr],
    4298             :             &y_count_non_zero_coeffs,
    4299             :             COMPONENT_LUMA,
    4300    20550200 :             context_ptr->hbd_mode_decision ? BIT_INCREMENT_10BIT : BIT_INCREMENT_8BIT,
    4301             :             tx_type,
    4302             :             candidate_buffer,
    4303    20550200 :             context_ptr->luma_txb_skip_context,
    4304    20550200 :             context_ptr->luma_dc_sign_context,
    4305    20550200 :             candidate_buffer->candidate_ptr->pred_mode,
    4306             :             EB_FALSE,
    4307             :             EB_FALSE);
    4308             : 
    4309    20547100 :         candidate_buffer->candidate_ptr->quantized_dc[0][context_ptr->txb_itr] = (((int32_t*)candidate_buffer->residual_quant_coeff_ptr->buffer_y)[context_ptr->txb_1d_offset]);
    4310    20547100 :         uint32_t y_has_coeff = y_count_non_zero_coeffs > 0;
    4311             : 
    4312             :         // tx_type not equal to DCT_DCT and no coeff is not an acceptable option in AV1.
    4313    20547100 :         if (y_has_coeff == 0 && tx_type != DCT_DCT)
    4314     4450620 :             continue;
    4315             : 
    4316             : 
    4317    16096500 :         if (y_has_coeff)
    4318    15548700 :             inv_transform_recon_wrapper(
    4319    15548700 :                 candidate_buffer->prediction_ptr->buffer_y,
    4320             :                 tu_origin_index,
    4321    15548700 :                 candidate_buffer->prediction_ptr->stride_y,
    4322    15548700 :                 candidate_buffer->recon_ptr->buffer_y,
    4323             :                 tu_origin_index,
    4324    15548700 :                 candidate_buffer->recon_ptr->stride_y,
    4325    15548700 :                 (int32_t*)candidate_buffer->recon_coeff_ptr->buffer_y,
    4326             :                 context_ptr->txb_1d_offset,
    4327    15548700 :                 context_ptr->hbd_mode_decision,
    4328    15548700 :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    4329             :                 tx_type,
    4330             :                 PLANE_TYPE_Y,
    4331    15548700 :                 (uint16_t)candidate_buffer->candidate_ptr->eob[0][context_ptr->txb_itr]);
    4332             :         else
    4333      547773 :             picture_copy(
    4334             :                 candidate_buffer->prediction_ptr,
    4335             :                 tu_origin_index,
    4336             :                 0,
    4337             :                 candidate_buffer->recon_ptr,
    4338             :                 tu_origin_index,
    4339             :                 0,
    4340      547773 :                 context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    4341      547773 :                 context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    4342             :                 0,
    4343             :                 0,
    4344             :                 PICTURE_BUFFER_DESC_Y_FLAG,
    4345      547773 :                 context_ptr->hbd_mode_decision);
    4346             : 
    4347    32212700 :         EbSpatialFullDistType spatial_full_dist_type_fun = context_ptr->hbd_mode_decision ?
    4348    16106300 :             full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    4349             : 
    4350    32206300 :         tuFullDistortion[0][DIST_CALC_PREDICTION] = spatial_full_dist_type_fun(
    4351             :             input_picture_ptr->buffer_y,
    4352             :             input_tu_origin_index,
    4353    16106300 :             input_picture_ptr->stride_y,
    4354    16106300 :             candidate_buffer->prediction_ptr->buffer_y,
    4355             :             tu_origin_index,
    4356    16106300 :             candidate_buffer->prediction_ptr->stride_y,
    4357    16106300 :             context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    4358    16106300 :             context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    4359             : 
    4360    32199100 :         tuFullDistortion[0][DIST_CALC_RESIDUAL] = spatial_full_dist_type_fun(
    4361             :             input_picture_ptr->buffer_y,
    4362             :             input_tu_origin_index,
    4363    16099900 :             input_picture_ptr->stride_y,
    4364    16099900 :             candidate_buffer->recon_ptr->buffer_y,
    4365             :             tu_origin_index,
    4366    16099900 :             candidate_buffer->recon_ptr->stride_y,
    4367    16099900 :             context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    4368    16099900 :             context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    4369             : 
    4370    16099200 :         tuFullDistortion[0][DIST_CALC_PREDICTION] <<= 4;
    4371    16099200 :         tuFullDistortion[0][DIST_CALC_RESIDUAL] <<= 4;
    4372             : 
    4373             :         //LUMA-ONLY
    4374    16099200 :         av1_tu_estimate_coeff_bits(
    4375             :             context_ptr,
    4376             :             0,   //allow_update_cdf,
    4377             :             NULL,//FRAME_CONTEXT *ec_ctx,
    4378             :             picture_control_set_ptr,
    4379             :             candidate_buffer,
    4380             :             context_ptr->txb_1d_offset,
    4381             :             0,
    4382             :             context_ptr->coeff_est_entropy_coder_ptr,
    4383             :             candidate_buffer->residual_quant_coeff_ptr,
    4384             :             y_count_non_zero_coeffs,
    4385             :             0,
    4386             :             0,
    4387             :             &y_tu_coeff_bits,
    4388             :             &y_tu_coeff_bits,
    4389             :             &y_tu_coeff_bits,
    4390    16099200 :             context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    4391    16099200 :             context_ptr->blk_geom->txsize_uv[context_ptr->tx_depth][context_ptr->txb_itr],
    4392             :             tx_type,
    4393    16099200 :             candidate_buffer->candidate_ptr->transform_type_uv,
    4394             :             COMPONENT_LUMA);
    4395             : 
    4396    16095700 :         uint64_t cost = RDCOST(context_ptr->full_lambda, y_tu_coeff_bits, tuFullDistortion[0][DIST_CALC_RESIDUAL]);
    4397    16095700 :         if (cost < best_cost_tx_search) {
    4398     5495560 :             best_cost_tx_search = cost;
    4399     5495560 :             best_tx_type = tx_type;
    4400             :         }
    4401             :     }
    4402             : 
    4403             :     //  Best Tx Type Pass
    4404     2660460 :     candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr] = best_tx_type;
    4405             : 
    4406             :     // For Inter blocks, transform type of chroma follows luma transform type
    4407     2660460 :     if (is_inter)
    4408      748392 :         candidate_buffer->candidate_ptr->transform_type_uv = (context_ptr->txb_itr == 0) ? candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr] : candidate_buffer->candidate_ptr->transform_type_uv;
    4409             : 
    4410     2660460 : }
    4411             : 
    4412    14044100 : static INLINE int block_signals_txsize(BlockSize bsize) {
    4413    14044100 :     return bsize > BLOCK_4X4;
    4414             : }
    4415             : 
    4416    19252000 : static INLINE int is_inter_block(const BlockModeInfo *bloc_mi) {
    4417    19252000 :     return is_intrabc_block(bloc_mi) || bloc_mi->ref_frame[0] > INTRA_FRAME;
    4418             : }
    4419             : 
    4420     3135220 : static INLINE int get_vartx_max_txsize(/*const MbModeInfo *xd,*/ BlockSize bsize,
    4421             :     int plane) {
    4422             :     /* if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;*/
    4423     3135220 :     const TxSize max_txsize = max_txsize_rect_lookup[bsize];
    4424     3135220 :     if (plane == 0) return max_txsize;            // luma
    4425           0 :     return av1_get_adjusted_tx_size(max_txsize);  // chroma
    4426             : }
    4427             : 
    4428     4219450 : static INLINE int max_block_wide(const MacroBlockD *xd, BlockSize bsize,
    4429             :     int plane) {
    4430     4219450 :     int max_blocks_wide = block_size_wide[bsize];
    4431     4219450 :     const struct macroblockd_plane *const pd = &xd->plane[plane];
    4432             : 
    4433     4219450 :     if (xd->mb_to_right_edge < 0)
    4434           0 :         max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
    4435             : 
    4436             :     // Scale the width in the transform block unit.
    4437     4219450 :     return max_blocks_wide >> tx_size_wide_log2[0];
    4438             : }
    4439             : 
    4440     4219400 : static INLINE int max_block_high(const MacroBlockD *xd, BlockSize bsize,
    4441             :     int plane) {
    4442     4219400 :     int max_blocks_high = block_size_high[bsize];
    4443     4219400 :     const struct macroblockd_plane *const pd = &xd->plane[plane];
    4444             : 
    4445     4219400 :     if (xd->mb_to_bottom_edge < 0)
    4446       10987 :         max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
    4447             : 
    4448             :     // Scale the height in the transform block unit.
    4449     4219400 :     return max_blocks_high >> tx_size_high_log2[0];
    4450             : }
    4451             : 
    4452     3755710 : static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
    4453             :     TXFM_CONTEXT *left_ctx,
    4454             :     TxSize tx_size, TxSize txb_size) {
    4455     3755710 :     BlockSize bsize = txsize_to_bsize[txb_size];
    4456     3755710 :     int bh = mi_size_high[bsize];
    4457     3755710 :     int bw = mi_size_wide[bsize];
    4458     3755710 :     uint8_t txw = tx_size_wide[tx_size];
    4459     3755710 :     uint8_t txh = tx_size_high[tx_size];
    4460             :     int i;
    4461    15887500 :     for (i = 0; i < bh; ++i) left_ctx[i] = txh;
    4462    15857800 :     for (i = 0; i < bw; ++i) above_ctx[i] = txw;
    4463     3755710 : }
    4464             : 
    4465     3999000 : static INLINE TxSize get_sqr_tx_size(int tx_dim) {
    4466     3999000 :     switch (tx_dim) {
    4467      217315 :     case 128:
    4468      217315 :     case 64: return TX_64X64; break;
    4469      807066 :     case 32: return TX_32X32; break;
    4470     1847190 :     case 16: return TX_16X16; break;
    4471     1127840 :     case 8: return TX_8X8; break;
    4472           0 :     default: return TX_4X4;
    4473             :     }
    4474             : }
    4475     3998980 : static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
    4476             :     TXFM_CONTEXT *left_ctx,
    4477             :     BlockSize bsize, TxSize tx_size) {
    4478     3998980 :     const uint8_t txw = tx_size_wide[tx_size];
    4479     3998980 :     const uint8_t txh = tx_size_high[tx_size];
    4480     3998980 :     const int above = *above_ctx < txw;
    4481     3998980 :     const int left = *left_ctx < txh;
    4482     3998980 :     int category = TXFM_PARTITION_CONTEXTS;
    4483             : 
    4484             :     // dummy return, not used by others.
    4485     3998980 :     if (tx_size <= TX_4X4) return 0;
    4486             : 
    4487             :     TxSize max_tx_size =
    4488     3998980 :         get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
    4489             : 
    4490     3999060 :     if (max_tx_size >= TX_8X8) {
    4491     3999290 :         category =
    4492     3999290 :             (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) +
    4493     3999290 :             (TX_SIZES - 1 - max_tx_size) * 2;
    4494             :     }
    4495     3999060 :     assert(category != TXFM_PARTITION_CONTEXTS);
    4496     3999060 :     return category * 3 + above + left;
    4497             : }
    4498             : 
    4499     4219420 : static uint64_t cost_tx_size_vartx(MacroBlockD *xd, const MbModeInfo *mbmi,
    4500             :     TxSize tx_size, int depth, int blk_row,
    4501             :     int blk_col, MdRateEstimationContext  *md_rate_estimation_ptr) {
    4502     4219420 :     uint64_t bits = 0;
    4503     4219420 :     const int max_blocks_high = max_block_high(xd, mbmi->block_mi.sb_type, 0);
    4504     4219460 :     const int max_blocks_wide = max_block_wide(xd, mbmi->block_mi.sb_type, 0);
    4505             : 
    4506     4219420 :     if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return bits;
    4507             : 
    4508     4219420 :     if (depth == MAX_VARTX_DEPTH) {
    4509             : 
    4510      220466 :         txfm_partition_update(xd->above_txfm_context + blk_col,
    4511      220466 :             xd->left_txfm_context + blk_row, tx_size, tx_size);
    4512             : 
    4513      220465 :         return bits;
    4514             :     }
    4515             : 
    4516     3998960 :     const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
    4517     3998960 :         xd->left_txfm_context + blk_row,
    4518     3998960 :         mbmi->block_mi.sb_type, tx_size);
    4519             : 
    4520     3999110 :     const int write_txfm_partition = (tx_size == tx_depth_to_tx_size[mbmi->tx_depth][mbmi->block_mi.sb_type]);
    4521             : 
    4522     3999110 :     if (write_txfm_partition) {
    4523     3336480 :         bits += md_rate_estimation_ptr->txfm_partition_fac_bits[ctx][0];
    4524             : 
    4525     3336480 :         txfm_partition_update(xd->above_txfm_context + blk_col,
    4526     3336480 :             xd->left_txfm_context + blk_row, tx_size, tx_size);
    4527             : 
    4528             :     }
    4529             :     else {
    4530      662635 :         const TxSize sub_txs = sub_tx_size_map[tx_size];
    4531      662635 :         const int bsw = tx_size_wide_unit[sub_txs];
    4532      662635 :         const int bsh = tx_size_high_unit[sub_txs];
    4533             : 
    4534      662635 :         bits += md_rate_estimation_ptr->txfm_partition_fac_bits[ctx][1];
    4535      662635 :         if (sub_txs == TX_4X4) {
    4536             : 
    4537      198793 :             txfm_partition_update(xd->above_txfm_context + blk_col,
    4538      198793 :                 xd->left_txfm_context + blk_row, sub_txs, tx_size);
    4539             : 
    4540      198791 :             return bits;
    4541             :         }
    4542             : 
    4543      463842 :         assert(bsw > 0 && bsh > 0);
    4544     1203220 :         for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh)
    4545     1823930 :             for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
    4546     1084550 :                 int offsetr = blk_row + row;
    4547     1084550 :                 int offsetc = blk_col + col;
    4548     1084550 :                 bits += cost_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, md_rate_estimation_ptr);
    4549             :             }
    4550             :     }
    4551     3800310 :     return bits;
    4552             : }
    4553             : 
    4554    12113300 : static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
    4555             :     int i;
    4556    49466400 :     for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
    4557    12113300 : }
    4558             : 
    4559     6056740 : static INLINE void set_txfm_ctxs(TxSize tx_size, int n8_w, int n8_h, int skip,
    4560             :     const MacroBlockD *xd) {
    4561     6056740 :     uint8_t bw = tx_size_wide[tx_size];
    4562     6056740 :     uint8_t bh = tx_size_high[tx_size];
    4563             : 
    4564     6056740 :     if (skip) {
    4565      673069 :         bw = n8_w * MI_SIZE;
    4566      673069 :         bh = n8_h * MI_SIZE;
    4567             :     }
    4568             : 
    4569     6056740 :     set_txfm_ctx(xd->above_txfm_context, bw, n8_w);
    4570     6057680 :     set_txfm_ctx(xd->left_txfm_context, bh, n8_h);
    4571     6058500 : }
    4572             : 
    4573     4852790 : static INLINE int tx_size_to_depth(TxSize tx_size, BlockSize bsize) {
    4574     4852790 :     TxSize ctx_size = max_txsize_rect_lookup[bsize];
    4575     4852790 :     int depth = 0;
    4576     5714890 :     while (tx_size != ctx_size) {
    4577      862099 :         depth++;
    4578      862099 :         ctx_size = sub_tx_size_map[ctx_size];
    4579      862099 :         assert(depth <= MAX_TX_DEPTH);
    4580             :     }
    4581     4852790 :     return depth;
    4582             : }
    4583             : 
    4584             : #define BLOCK_SIZES_ALL 22
    4585             : 
    4586             : // Returns a context number for the given MB prediction signal
    4587             : // The mode info data structure has a one element border above and to the
    4588             : // left of the entries corresponding to real blocks.
    4589             : // The prediction flags in these dummy entries are initialized to 0.
    4590     4853850 : static INLINE int get_tx_size_context(const MacroBlockD *xd) {
    4591     4853850 :     const ModeInfo *mi = xd->mi[0];
    4592     4853850 :     const MbModeInfo *mbmi = &mi->mbmi;
    4593     4853850 :     const MbModeInfo *const above_mbmi = xd->above_mbmi;
    4594     4853850 :     const MbModeInfo *const left_mbmi = xd->left_mbmi;
    4595     4853850 :     const TxSize max_tx_size = max_txsize_rect_lookup[mbmi->block_mi.sb_type];
    4596     4853850 :     const int max_tx_wide = tx_size_wide[max_tx_size];
    4597     4853850 :     const int max_tx_high = tx_size_high[max_tx_size];
    4598     4853850 :     const int has_above = xd->up_available;
    4599     4853850 :     const int has_left = xd->left_available;
    4600             : 
    4601     4853850 :     int above = xd->above_txfm_context[0] >= max_tx_wide;
    4602     4853850 :     int left = xd->left_txfm_context[0] >= max_tx_high;
    4603             : 
    4604     4853850 :     if (has_above)
    4605     4625990 :         if (is_inter_block(&above_mbmi->block_mi))
    4606      950897 :             above = block_size_wide[above_mbmi->block_mi.sb_type] >= max_tx_wide;
    4607             : 
    4608     4853520 :     if (has_left)
    4609     4668580 :         if (is_inter_block(&left_mbmi->block_mi))
    4610      943619 :             left = block_size_high[left_mbmi->block_mi.sb_type] >= max_tx_high;
    4611             : 
    4612     4852740 :     if (has_above && has_left)
    4613     4446440 :         return (above + left);
    4614      406300 :     else if (has_above)
    4615      178721 :         return above;
    4616      227579 :     else if (has_left)
    4617      221691 :         return left;
    4618             :     else
    4619        5888 :         return 0;
    4620             : }
    4621             : 
    4622     4854510 : static uint64_t cost_selected_tx_size(
    4623             :     const MacroBlockD *xd,
    4624             :     MdRateEstimationContext  *md_rate_estimation_ptr) {
    4625     4854510 :     const ModeInfo *const mi = xd->mi[0];
    4626     4854510 :     const MbModeInfo *const mbmi = &mi->mbmi;
    4627     4854510 :     const BlockSize bsize = mbmi->block_mi.sb_type;
    4628     4854510 :     uint64_t bits = 0;
    4629             : 
    4630     4854510 :     if (block_signals_txsize(bsize)) {
    4631     4853720 :         const TxSize tx_size = mbmi->tx_size;
    4632     4853720 :         const int tx_size_ctx = get_tx_size_context(xd);
    4633     4852680 :         const int depth = tx_size_to_depth(tx_size, bsize);
    4634     4852900 :         const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
    4635     4853100 :         bits += md_rate_estimation_ptr->tx_size_fac_bits[tx_size_cat][tx_size_ctx][depth];
    4636             :     }
    4637             : 
    4638     4853080 :     return bits;
    4639             : }
    4640             : 
    4641     9193970 : static uint64_t tx_size_bits(
    4642             :     MdRateEstimationContext  *md_rate_estimation_ptr,
    4643             :     MacroBlockD         *xd,
    4644             :     const MbModeInfo    *mbmi,
    4645             :     TxMode              tx_mode,
    4646             :     BlockSize          bsize,
    4647             :     uint8_t             skip) {
    4648             : 
    4649     9193970 :     uint64_t bits = 0;
    4650             : 
    4651     9193970 :     int is_inter_tx = is_inter_block(&mbmi->block_mi) || is_intrabc_block(&mbmi->block_mi);
    4652     9193600 :     if (tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
    4653     3768360 :         !(is_inter_tx && skip) /*&& !xd->lossless[segment_id]*/) {
    4654     7989780 :         if (is_inter_tx) {  // This implies skip flag is 0.
    4655     3135250 :             const TxSize max_tx_size = get_vartx_max_txsize(/*xd,*/ bsize, 0);
    4656     3135230 :             const int txbh = tx_size_high_unit[max_tx_size];
    4657     3135230 :             const int txbw = tx_size_wide_unit[max_tx_size];
    4658     3135230 :             const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
    4659     3135230 :             const int height = block_size_high[bsize] >> tx_size_high_log2[0];
    4660             :             int idx, idy;
    4661     6270280 :             for (idy = 0; idy < height; idy += txbh)
    4662     6270270 :                 for (idx = 0; idx < width; idx += txbw)
    4663     3135210 :                     bits += cost_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, md_rate_estimation_ptr);
    4664             :         }
    4665             :         else {
    4666     4854540 :             bits += cost_selected_tx_size(xd, md_rate_estimation_ptr);
    4667     4853170 :             set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, 0, xd);
    4668             :         }
    4669             :     }
    4670             :     else {
    4671     1975990 :         set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h,
    4672      772352 :             skip && is_inter_block(&mbmi->block_mi), xd);
    4673             :     }
    4674     9193150 :     return bits;
    4675             : }
    4676             : 
    4677             : void set_mi_row_col(
    4678             :     PictureControlSet       *picture_control_set_ptr,
    4679             :     MacroBlockD             *xd,
    4680             :     TileInfo *              tile,
    4681             :     int                     mi_row,
    4682             :     int                     bh,
    4683             :     int                     mi_col,
    4684             :     int                     bw,
    4685             :     uint32_t                mi_stride,
    4686             :     int                     mi_rows,
    4687             :     int                     mi_cols);
    4688             : 
    4689     9193640 : uint64_t estimate_tx_size_bits(
    4690             :     PictureControlSet       *pcsPtr,
    4691             :     ModeDecisionContext     *context_ptr,
    4692             :     ModeDecisionCandidate   *candidate_ptr,
    4693             :     EbBool                   skip_flag,
    4694             :     uint32_t                 cu_origin_x,
    4695             :     uint32_t                 cu_origin_y,
    4696             :     CodingUnit               *cu_ptr,
    4697             :     const BlockGeom          *blk_geom,
    4698             :     NeighborArrayUnit        *txfm_context_array,
    4699             :     uint8_t                   tx_depth,
    4700             :     MdRateEstimationContext  *md_rate_estimation_ptr) {
    4701     9193640 :     uint32_t txfm_context_left_index = get_neighbor_array_unit_left_index(
    4702             :         txfm_context_array,
    4703             :         cu_origin_y);
    4704     9192980 :     uint32_t txfm_context_above_index = get_neighbor_array_unit_top_index(
    4705             :         txfm_context_array,
    4706             :         cu_origin_x);
    4707             : 
    4708     9193120 :     TxMode tx_mode = pcsPtr->parent_pcs_ptr->frm_hdr.tx_mode;
    4709     9193120 :     Av1Common  *cm = pcsPtr->parent_pcs_ptr->av1_cm;
    4710     9193120 :     MacroBlockD *xd = cu_ptr->av1xd;
    4711     9193120 :     TileInfo * tile = &xd->tile;
    4712     9193120 :     int32_t mi_row = cu_origin_y >> MI_SIZE_LOG2;
    4713     9193120 :     int32_t mi_col = cu_origin_x >> MI_SIZE_LOG2;
    4714     9193120 :     BlockSize bsize = blk_geom->bsize;
    4715     9193120 :     const int32_t bw = mi_size_wide[bsize];
    4716     9193120 :     const int32_t bh = mi_size_high[bsize];
    4717     9193120 :     uint32_t mi_stride = pcsPtr->mi_stride;
    4718             : 
    4719     9193120 :     set_mi_row_col(
    4720             :         pcsPtr,
    4721             :         xd,
    4722             :         tile,
    4723             :         mi_row,
    4724             :         bh,
    4725             :         mi_col,
    4726             :         bw,
    4727             :         mi_stride,
    4728             :         cm->mi_rows,
    4729             :         cm->mi_cols);
    4730             : 
    4731     9193810 :     MbModeInfo * mbmi = &xd->mi[0]->mbmi;
    4732             : 
    4733     9193810 :     memcpy(context_ptr->above_txfm_context, &(txfm_context_array->top_array[txfm_context_above_index]), (blk_geom->bwidth >> MI_SIZE_LOG2) * sizeof(TXFM_CONTEXT));
    4734     9193810 :     memcpy(context_ptr->left_txfm_context, &(txfm_context_array->left_array[txfm_context_left_index]), (blk_geom->bheight >> MI_SIZE_LOG2) * sizeof(TXFM_CONTEXT));
    4735             : 
    4736     9193810 :     xd->above_txfm_context = context_ptr->above_txfm_context;
    4737     9193810 :     xd->left_txfm_context = context_ptr->left_txfm_context;
    4738             : 
    4739     9193810 :     mbmi->tx_size = blk_geom->txsize[tx_depth][0];
    4740     9193810 :     mbmi->block_mi.sb_type = blk_geom->bsize;
    4741     9193810 :     mbmi->block_mi.use_intrabc = candidate_ptr->use_intrabc;
    4742     9193810 :     mbmi->block_mi.ref_frame[0] = candidate_ptr->ref_frame_type;
    4743     9193810 :     mbmi->tx_depth = tx_depth;
    4744             : 
    4745     9193810 :     uint64_t bits = tx_size_bits(
    4746             :         md_rate_estimation_ptr,
    4747             :         xd,
    4748             :         mbmi,
    4749             :         tx_mode,
    4750             :         bsize,
    4751             :         skip_flag);
    4752             : 
    4753     9193060 :     return bits;
    4754             : }
    4755             : 
    4756     9194160 : uint64_t get_tx_size_bits(
    4757             :     ModeDecisionCandidateBuffer  *candidateBuffer,
    4758             :     ModeDecisionContext          *context_ptr,
    4759             :     PictureControlSet            *picture_control_set_ptr,
    4760             :     uint8_t tx_depth,
    4761             :     EbBool block_has_coeff) {
    4762             : 
    4763     9194160 :     uint64_t tx_size_bits = 0;
    4764             : 
    4765     9194160 :     tx_size_bits = estimate_tx_size_bits(
    4766             :         picture_control_set_ptr,
    4767             :         context_ptr,
    4768             :         candidateBuffer->candidate_ptr,
    4769             :         block_has_coeff ? 0 : 1,
    4770     9194160 :         context_ptr->cu_origin_x,
    4771     9194160 :         context_ptr->cu_origin_y,
    4772             :         context_ptr->cu_ptr,
    4773             :         context_ptr->blk_geom,
    4774             :         context_ptr->txfm_context_array,
    4775             :         tx_depth,
    4776             :         context_ptr->md_rate_estimation_ptr);
    4777             : 
    4778     9193020 :     return tx_size_bits;
    4779             : }
    4780             : 
    4781      813702 : void tx_partitioning_path(
    4782             :     ModeDecisionCandidateBuffer  *candidate_buffer,
    4783             :     ModeDecisionContext          *context_ptr,
    4784             :     PictureControlSet            *picture_control_set_ptr,
    4785             :     uint64_t                      ref_fast_cost,
    4786             :     uint8_t                       end_tx_depth,
    4787             :     uint32_t                      qp,
    4788             :     uint32_t                     *y_count_non_zero_coeffs,
    4789             :     uint64_t                     *y_coeff_bits,
    4790             :     uint64_t                     *y_full_distortion)
    4791             : {
    4792      813702 :     EbPictureBufferDesc *input_picture_ptr = picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
    4793      813702 :     SequenceControlSet  *sequence_control_set_ptr = (SequenceControlSet*)picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr;
    4794      813702 :     int32_t is_inter = (candidate_buffer->candidate_ptr->type == INTER_MODE || candidate_buffer->candidate_ptr->use_intrabc) ? EB_TRUE : EB_FALSE;
    4795             : 
    4796             : 
    4797      813702 :     uint8_t  best_tx_depth = 0;
    4798      813702 :     uint64_t best_cost_search = (uint64_t)~0;
    4799             : 
    4800             :     // Fill the scratch buffer
    4801      813702 :     memcpy(context_ptr->scratch_candidate_buffer->candidate_ptr, candidate_buffer->candidate_ptr, sizeof(ModeDecisionCandidate));
    4802             : 
    4803      813702 :     if (is_inter) {
    4804             : 
    4805      385315 :         uint32_t block_index = context_ptr->blk_geom->origin_x + (context_ptr->blk_geom->origin_y * MAX_SB_SIZE);
    4806             : 
    4807             :         // Copy pred
    4808             :         {
    4809      385315 :             EbByte src = &(candidate_buffer->prediction_ptr->buffer_y[block_index]);
    4810      385315 :             EbByte dst = &(context_ptr->scratch_candidate_buffer->prediction_ptr->buffer_y[block_index]);
    4811     6196280 :             for (int i = 0; i < context_ptr->blk_geom->bheight; i++) {
    4812     5810960 :                 memcpy(dst, src, context_ptr->blk_geom->bwidth);
    4813     5810960 :                 src += candidate_buffer->prediction_ptr->stride_y;
    4814     5810960 :                 dst += context_ptr->scratch_candidate_buffer->prediction_ptr->stride_y;
    4815             :             }
    4816             :         }
    4817             : 
    4818             :         // Copy residual
    4819             :         {
    4820      385315 :             int16_t* src = &(((int16_t*)candidate_buffer->residual_ptr->buffer_y)[block_index]);
    4821      385315 :             int16_t* dst = &(((int16_t*)context_ptr->scratch_candidate_buffer->residual_ptr->buffer_y)[block_index]);
    4822             : 
    4823     6196270 :             for (int i = 0; i < context_ptr->blk_geom->bheight; i++) {
    4824     5810950 :                 memcpy(dst, src, context_ptr->blk_geom->bwidth << 1);
    4825     5810950 :                 src += candidate_buffer->residual_ptr->stride_y;
    4826     5810950 :                 dst += context_ptr->scratch_candidate_buffer->residual_ptr->stride_y;
    4827             :             }
    4828             :         }
    4829             :     }
    4830             : 
    4831             : 
    4832             :     uint8_t tx_search_skip_flag;
    4833      813702 :     if (context_ptr->md_staging_tx_search == 0)
    4834           0 :         tx_search_skip_flag = EB_TRUE;
    4835      813702 :     else if (context_ptr->md_staging_tx_search == 1)
    4836      770634 :         tx_search_skip_flag = picture_control_set_ptr->parent_pcs_ptr->tx_search_level == TX_SEARCH_FULL_LOOP ? get_skip_tx_search_flag(
    4837      385316 :             context_ptr->blk_geom->sq_size,
    4838             :             ref_fast_cost,
    4839      385316 :             *candidate_buffer->fast_cost_ptr,
    4840      385316 :             picture_control_set_ptr->parent_pcs_ptr->tx_weight) : EB_TRUE;
    4841             :     else
    4842      428387 :         tx_search_skip_flag = picture_control_set_ptr->parent_pcs_ptr->tx_search_level == TX_SEARCH_FULL_LOOP ? EB_FALSE : EB_TRUE;
    4843             : 
    4844             : 
    4845      813705 :     tx_reset_neighbor_arrays(
    4846             :         picture_control_set_ptr,
    4847             :         context_ptr,
    4848             :         is_inter,
    4849             :         end_tx_depth);
    4850             : 
    4851             :     // Transform Depth Loop
    4852     2441060 :     for (context_ptr->tx_depth = 0; context_ptr->tx_depth <= end_tx_depth; context_ptr->tx_depth++) {
    4853             : 
    4854     1627390 :         ModeDecisionCandidateBuffer *tx_candidate_buffer = (context_ptr->tx_depth == 0) ? candidate_buffer : context_ptr->scratch_candidate_buffer;
    4855             : 
    4856     1627390 :         tx_candidate_buffer->candidate_ptr->tx_depth = context_ptr->tx_depth;
    4857             : 
    4858     1627390 :         tx_initialize_neighbor_arrays(
    4859             :             picture_control_set_ptr,
    4860             :             context_ptr,
    4861             :             is_inter);
    4862             : 
    4863             :         // Initialize TU Split
    4864             :         uint32_t tx_y_count_non_zero_coeffs[MAX_NUM_OF_TU_PER_CU];
    4865     1627300 :         uint64_t tx_y_coeff_bits = 0;
    4866     1627300 :         uint64_t tx_y_full_distortion[DIST_CALC_TOTAL] = { 0 };
    4867             : 
    4868     1627300 :         context_ptr->txb_1d_offset = 0;
    4869     1627300 :         context_ptr->three_quad_energy = 0;
    4870     1627300 :         tx_candidate_buffer->candidate_ptr->y_has_coeff = 0;
    4871             : 
    4872     1627300 :         uint16_t txb_count = context_ptr->blk_geom->txb_count[context_ptr->tx_depth];
    4873             : 
    4874     1627300 :         uint32_t block_has_coeff = EB_FALSE;
    4875     5174530 :         for (context_ptr->txb_itr = 0; context_ptr->txb_itr < txb_count; context_ptr->txb_itr++) {
    4876     3547150 :             uint16_t tx_org_x = context_ptr->blk_geom->tx_org_x[context_ptr->tx_depth][context_ptr->txb_itr];
    4877     3547150 :             uint16_t tx_org_y = context_ptr->blk_geom->tx_org_y[context_ptr->tx_depth][context_ptr->txb_itr];
    4878     3547150 :             uint32_t tu_origin_index = tx_org_x + (tx_org_y * tx_candidate_buffer->residual_ptr->stride_y);
    4879     3547150 :             uint32_t input_tu_origin_index = (context_ptr->sb_origin_x + tx_org_x + input_picture_ptr->origin_x) + ((context_ptr->sb_origin_y + tx_org_y + input_picture_ptr->origin_y) * input_picture_ptr->stride_y);
    4880             : 
    4881             :             // Y Prediction
    4882             : 
    4883     3547150 :             if (!is_inter) {
    4884     1899240 :                 av1_intra_luma_prediction(
    4885             :                     context_ptr,
    4886             :                     picture_control_set_ptr,
    4887             :                     tx_candidate_buffer);
    4888             : 
    4889             :                 // Y Residual
    4890     1899430 :                 residual_kernel8bit(
    4891     1899430 :                     &(input_picture_ptr->buffer_y[input_tu_origin_index]),
    4892     1899430 :                     input_picture_ptr->stride_y,
    4893     1899430 :                     &(tx_candidate_buffer->prediction_ptr->buffer_y[tu_origin_index]),
    4894     1899430 :                     tx_candidate_buffer->prediction_ptr->stride_y,
    4895     1899430 :                     &(((int16_t*)tx_candidate_buffer->residual_ptr->buffer_y)[tu_origin_index]),
    4896     1899430 :                     tx_candidate_buffer->residual_ptr->stride_y,
    4897     1899430 :                     context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    4898     1899430 :                     context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    4899             :             }
    4900             : 
    4901     3547420 :             if (!tx_search_skip_flag) {
    4902             : 
    4903     2647610 :                 tx_type_search(
    4904             :                     sequence_control_set_ptr,
    4905             :                     picture_control_set_ptr,
    4906             :                     context_ptr,
    4907             :                     tx_candidate_buffer,
    4908             :                     qp);
    4909             :             }
    4910             : 
    4911     3547320 :             product_full_loop(
    4912             :                 tx_candidate_buffer,
    4913             :                 context_ptr,
    4914             :                 picture_control_set_ptr,
    4915             :                 input_picture_ptr,
    4916     3547320 :                 context_ptr->cu_ptr->qp,
    4917             :                 &(tx_y_count_non_zero_coeffs[0]),
    4918             :                 &tx_y_coeff_bits,
    4919             :                 &tx_y_full_distortion[0]);
    4920             : 
    4921     3547480 :             uint32_t y_has_coeff = tx_y_count_non_zero_coeffs[context_ptr->txb_itr] > 0;
    4922             : 
    4923     3547480 :             tx_update_neighbor_arrays(
    4924             :                 picture_control_set_ptr,
    4925             :                 context_ptr,
    4926             :                 tx_candidate_buffer,
    4927             :                 is_inter);
    4928             : 
    4929     3547230 :             if (y_has_coeff)
    4930     2823320 :                 block_has_coeff = EB_TRUE;
    4931             : 
    4932             :         } // Transform Loop
    4933             : 
    4934     1627380 :         uint64_t tx_size_bits = 0;
    4935     1627380 :         if (picture_control_set_ptr->parent_pcs_ptr->frm_hdr.tx_mode == TX_MODE_SELECT)
    4936     1627340 :             tx_size_bits = get_tx_size_bits(
    4937             :                 tx_candidate_buffer,
    4938             :                 context_ptr,
    4939             :                 picture_control_set_ptr,
    4940     1627340 :                 context_ptr->tx_depth,
    4941             :                 block_has_coeff);
    4942             : 
    4943     1627370 :         uint64_t cost = RDCOST(context_ptr->full_lambda, (tx_y_coeff_bits + tx_size_bits), tx_y_full_distortion[DIST_CALC_RESIDUAL]);
    4944             : 
    4945     1627370 :         if (cost < best_cost_search) {
    4946     1161860 :             best_cost_search = cost;
    4947     1161860 :             best_tx_depth = context_ptr->tx_depth;
    4948             : 
    4949     1161860 :             y_full_distortion[DIST_CALC_RESIDUAL] = tx_y_full_distortion[DIST_CALC_RESIDUAL];
    4950     1161860 :             y_full_distortion[DIST_CALC_PREDICTION] = tx_y_full_distortion[DIST_CALC_PREDICTION];
    4951     1161860 :             *y_coeff_bits = tx_y_coeff_bits;
    4952     3102750 :             for (context_ptr->txb_itr = 0; context_ptr->txb_itr < txb_count; context_ptr->txb_itr++) {
    4953     1940880 :                 y_count_non_zero_coeffs[context_ptr->txb_itr] = tx_y_count_non_zero_coeffs[context_ptr->txb_itr];
    4954             :             }
    4955             : 
    4956             :         }
    4957             :     } // Transform Depth Loop
    4958             : 
    4959             :     // ATB Recon
    4960      813674 :     if (best_tx_depth == 1) {
    4961             :         // Copy depth 1 mode/type/eob ..
    4962      348182 :         memcpy(candidate_buffer->candidate_ptr, context_ptr->scratch_candidate_buffer->candidate_ptr, sizeof(ModeDecisionCandidate));
    4963             : 
    4964             :         // Copy depth 1 pred
    4965      348182 :         uint32_t block_index = context_ptr->blk_geom->origin_x + (context_ptr->blk_geom->origin_y * MAX_SB_SIZE);
    4966      348182 :         EbByte src = &(context_ptr->scratch_candidate_buffer->prediction_ptr->buffer_y[block_index]);
    4967      348182 :         EbByte dst = &(candidate_buffer->prediction_ptr->buffer_y[block_index]);
    4968     6379340 :         for (int i = 0; i < context_ptr->blk_geom->bheight; i++) {
    4969     6031160 :             memcpy(dst, src, context_ptr->blk_geom->bwidth);
    4970     6031160 :             src += context_ptr->scratch_candidate_buffer->prediction_ptr->stride_y;
    4971     6031160 :             dst += candidate_buffer->prediction_ptr->stride_y;
    4972             :         }
    4973             : 
    4974             :         // Copy depth 1 recon coeff
    4975      348182 :         memcpy(candidate_buffer->recon_coeff_ptr->buffer_y, context_ptr->scratch_candidate_buffer->recon_coeff_ptr->buffer_y, (context_ptr->blk_geom->bwidth * context_ptr->blk_geom->bheight << 2));
    4976             :     }
    4977      813674 : }
    4978             : #else
    4979             : void perform_intra_tx_partitioning(
    4980             :     ModeDecisionCandidateBuffer  *candidate_buffer,
    4981             :     ModeDecisionContext          *context_ptr,
    4982             :     PictureControlSet            *picture_control_set_ptr,
    4983             :     uint64_t                      ref_fast_cost,
    4984             :     uint8_t                       end_tx_depth,
    4985             :     uint32_t                      qp,
    4986             :     uint32_t                     *y_count_non_zero_coeffs,
    4987             :     uint64_t                     *y_coeff_bits,
    4988             :     uint64_t                     *y_full_distortion)
    4989             : {
    4990             :     EbPictureBufferDesc *input_picture_ptr = picture_control_set_ptr->hbd_mode_decision ?
    4991             :         picture_control_set_ptr->input_frame16bit : picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
    4992             :     SequenceControlSet  *sequence_control_set_ptr = (SequenceControlSet*)picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr;
    4993             :     uint32_t tu_origin_index;
    4994             :     uint64_t y_full_cost;
    4995             :     uint64_t y_tu_coeff_bits;
    4996             :     uint64_t tuFullDistortion[3][DIST_CALC_TOTAL];
    4997             :     uint32_t txb_1d_offset;
    4998             : 
    4999             :     uint8_t  best_tx_depth = 0;
    5000             : 
    5001             :     uint64_t best_cost_search = (uint64_t)~0;
    5002             : 
    5003             :     TxType best_tx_type_depth_0 = DCT_DCT; // Track the best tx type @ depth 0 to be used @ the final stage (i.e. avoid redoing the tx type search).
    5004             :     uint8_t  tx_search_skip_flag;
    5005             :     if (context_ptr->md_staging_tx_search == 0)
    5006             :         tx_search_skip_flag = EB_TRUE;
    5007             :     else if (context_ptr->md_staging_tx_search == 1)
    5008             :         tx_search_skip_flag = picture_control_set_ptr->parent_pcs_ptr->tx_search_level == TX_SEARCH_FULL_LOOP ? get_skip_tx_search_flag(
    5009             :             context_ptr->blk_geom->sq_size,
    5010             :             ref_fast_cost,
    5011             :             *candidate_buffer->fast_cost_ptr,
    5012             :             picture_control_set_ptr->parent_pcs_ptr->tx_weight) : EB_TRUE;
    5013             :     else
    5014             :         tx_search_skip_flag = picture_control_set_ptr->parent_pcs_ptr->tx_search_level == TX_SEARCH_FULL_LOOP ? EB_FALSE : EB_TRUE;
    5015             : 
    5016             :     // Reset depth_1 neighbor arrays
    5017             :     if (end_tx_depth) {
    5018             :         if (!picture_control_set_ptr->hbd_mode_decision) {
    5019             :             copy_neigh_arr(
    5020             :                 picture_control_set_ptr->md_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    5021             :                 picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    5022             :                 context_ptr->sb_origin_x + context_ptr->blk_geom->origin_x,
    5023             :                 context_ptr->sb_origin_y + context_ptr->blk_geom->origin_y,
    5024             :                 context_ptr->blk_geom->bwidth,
    5025             :                 context_ptr->blk_geom->bheight,
    5026             :                 NEIGHBOR_ARRAY_UNIT_FULL_MASK);
    5027             :         } else {
    5028             :             copy_neigh_arr(
    5029             :                 picture_control_set_ptr->md_luma_recon_neighbor_array16bit[MD_NEIGHBOR_ARRAY_INDEX],
    5030             :                 picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array16bit[MD_NEIGHBOR_ARRAY_INDEX],
    5031             :                 context_ptr->sb_origin_x + context_ptr->blk_geom->origin_x,
    5032             :                 context_ptr->sb_origin_y + context_ptr->blk_geom->origin_y,
    5033             :                 context_ptr->blk_geom->bwidth,
    5034             :                 context_ptr->blk_geom->bheight,
    5035             :                 NEIGHBOR_ARRAY_UNIT_FULL_MASK);
    5036             :         }
    5037             : 
    5038             :         copy_neigh_arr(
    5039             :             picture_control_set_ptr->md_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    5040             :             picture_control_set_ptr->md_tx_depth_1_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    5041             :             context_ptr->sb_origin_x + context_ptr->blk_geom->origin_x,
    5042             :             context_ptr->sb_origin_y + context_ptr->blk_geom->origin_y,
    5043             :             context_ptr->blk_geom->bwidth,
    5044             :             context_ptr->blk_geom->bheight,
    5045             :             NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
    5046             :     }
    5047             : 
    5048             :     // Transform Depth Loop
    5049             :     for (context_ptr->tx_depth = 0; context_ptr->tx_depth <= end_tx_depth; context_ptr->tx_depth++) {
    5050             :         // Set recon neighbor array to be used @ intra compensation
    5051             :         if (!context_ptr->hbd_mode_decision) {
    5052             :             context_ptr->tx_search_luma_recon_neighbor_array =
    5053             :                 (context_ptr->tx_depth) ?
    5054             :                 picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX] :
    5055             :                 picture_control_set_ptr->md_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    5056             :         } else {
    5057             :             context_ptr->tx_search_luma_recon_neighbor_array16bit =
    5058             :                 (context_ptr->tx_depth) ?
    5059             :                 picture_control_set_ptr->md_tx_depth_1_luma_recon_neighbor_array16bit[MD_NEIGHBOR_ARRAY_INDEX] :
    5060             :                 picture_control_set_ptr->md_luma_recon_neighbor_array16bit[MD_NEIGHBOR_ARRAY_INDEX];
    5061             :         }
    5062             : 
    5063             :         // Set luma dc sign level coeff
    5064             :         context_ptr->tx_search_luma_dc_sign_level_coeff_neighbor_array =
    5065             :             (context_ptr->tx_depth) ?
    5066             :             picture_control_set_ptr->md_tx_depth_1_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX] :
    5067             :             picture_control_set_ptr->md_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    5068             : 
    5069             :         // Initialize TU Split
    5070             :         y_full_distortion[DIST_CALC_RESIDUAL] = 0;
    5071             :         y_full_distortion[DIST_CALC_PREDICTION] = 0;
    5072             :         *y_coeff_bits = 0;
    5073             :         txb_1d_offset = 0;
    5074             :         context_ptr->three_quad_energy = 0;
    5075             :         candidate_buffer->candidate_ptr->y_has_coeff = 0;
    5076             : 
    5077             :         uint16_t txb_count = context_ptr->blk_geom->txb_count[context_ptr->tx_depth];
    5078             :         for (context_ptr->txb_itr = 0; context_ptr->txb_itr < txb_count; context_ptr->txb_itr++) {
    5079             :             uint16_t tx_org_x = context_ptr->blk_geom->tx_org_x[context_ptr->tx_depth][context_ptr->txb_itr];
    5080             :             uint16_t tx_org_y = context_ptr->blk_geom->tx_org_y[context_ptr->tx_depth][context_ptr->txb_itr];
    5081             : 
    5082             :             context_ptr->luma_txb_skip_context = 0;
    5083             :             context_ptr->luma_dc_sign_context = 0;
    5084             :             get_txb_ctx(
    5085             :                 sequence_control_set_ptr,
    5086             :                 COMPONENT_LUMA,
    5087             :                 context_ptr->tx_search_luma_dc_sign_level_coeff_neighbor_array,
    5088             :                 context_ptr->sb_origin_x + tx_org_x,
    5089             :                 context_ptr->sb_origin_y + tx_org_y,
    5090             :                 context_ptr->blk_geom->bsize,
    5091             :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5092             :                 &context_ptr->luma_txb_skip_context,
    5093             :                 &context_ptr->luma_dc_sign_context);
    5094             :             tu_origin_index = tx_org_x + (tx_org_y * candidate_buffer->residual_ptr->stride_y);
    5095             : 
    5096             :             uint32_t input_tu_origin_index = (context_ptr->sb_origin_x + tx_org_x + input_picture_ptr->origin_x) + ((context_ptr->sb_origin_y + tx_org_y + input_picture_ptr->origin_y) * input_picture_ptr->stride_y);
    5097             : 
    5098             :             // Y Prediction
    5099             :             av1_intra_luma_prediction(
    5100             :                 context_ptr,
    5101             :                 picture_control_set_ptr,
    5102             :                 candidate_buffer);
    5103             : 
    5104             :             // Y Residual
    5105             :             residual_kernel(
    5106             :                 input_picture_ptr->buffer_y,
    5107             :                 input_tu_origin_index,
    5108             :                 input_picture_ptr->stride_y,
    5109             :                 candidate_buffer->prediction_ptr->buffer_y,
    5110             :                 tu_origin_index,
    5111             :                 candidate_buffer->prediction_ptr->stride_y,
    5112             :                 (int16_t*)candidate_buffer->residual_ptr->buffer_y,
    5113             :                 tu_origin_index,
    5114             :                 candidate_buffer->residual_ptr->stride_y,
    5115             :                 context_ptr->hbd_mode_decision,
    5116             :                 context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5117             :                 context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    5118             : 
    5119             :             TxType best_tx_type = DCT_DCT;
    5120             :             if (!tx_search_skip_flag) {
    5121             :             TxType txk_start = DCT_DCT;
    5122             :             TxType txk_end = TX_TYPES;
    5123             :             uint64_t best_cost_tx_search = (uint64_t)~0;
    5124             : 
    5125             :             const TxSetType tx_set_type = get_ext_tx_set_type(context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr], 0, 0); // assumes INTRA
    5126             : 
    5127             :             for (int32_t tx_type = txk_start; tx_type < txk_end; ++tx_type) {
    5128             :                 y_tu_coeff_bits = 0;
    5129             : 
    5130             :                 int32_t eset = get_ext_tx_set(context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr], 0, 0); // assumes INTRA
    5131             :                 // eset == 0 should correspond to a set with only DCT_DCT and there
    5132             :                 // is no need to send the tx_type
    5133             :                 if (eset <= 0) continue;
    5134             :                 else if (av1_ext_tx_used[tx_set_type][tx_type] == 0) continue;
    5135             :                 else if (context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr] > 32 || context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr] > 32) continue;
    5136             : 
    5137             :                 int32_t seg_qp = picture_control_set_ptr->parent_pcs_ptr->frm_hdr.segmentation_params.segmentation_enabled ?
    5138             :                                  picture_control_set_ptr->parent_pcs_ptr->frm_hdr.segmentation_params.feature_data[context_ptr->cu_ptr->segment_id][SEG_LVL_ALT_Q] : 0;
    5139             :                 // Y: T Q iQ
    5140             :                 av1_estimate_transform(
    5141             :                     &(((int16_t*)candidate_buffer->residual_ptr->buffer_y)[tu_origin_index]),
    5142             :                     candidate_buffer->residual_ptr->stride_y,
    5143             :                     &(((int32_t*)context_ptr->trans_quant_buffers_ptr->tu_trans_coeff2_nx2_n_ptr->buffer_y)[txb_1d_offset]),
    5144             :                     NOT_USED_VALUE,
    5145             :                     context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5146             :                     &context_ptr->three_quad_energy,
    5147             :                     context_ptr->transform_inner_array_ptr,
    5148             :                     picture_control_set_ptr->hbd_mode_decision ? BIT_INCREMENT_10BIT : BIT_INCREMENT_8BIT,
    5149             :                     tx_type,
    5150             :                     PLANE_TYPE_Y,
    5151             :                     DEFAULT_SHAPE);
    5152             : 
    5153             :                 av1_quantize_inv_quantize(
    5154             :                     picture_control_set_ptr,
    5155             :                     context_ptr,
    5156             :                     &(((int32_t*)context_ptr->trans_quant_buffers_ptr->tu_trans_coeff2_nx2_n_ptr->buffer_y)[txb_1d_offset]),
    5157             :                     NOT_USED_VALUE,
    5158             :                     &(((int32_t*)candidate_buffer->residual_quant_coeff_ptr->buffer_y)[txb_1d_offset]),
    5159             :                     &(((int32_t*)candidate_buffer->recon_coeff_ptr->buffer_y)[txb_1d_offset]),
    5160             :                     qp,
    5161             :                     seg_qp,
    5162             :                     context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5163             :                     context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    5164             :                     context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5165             :                     &candidate_buffer->candidate_ptr->eob[0][context_ptr->txb_itr],
    5166             :                     &(y_count_non_zero_coeffs[context_ptr->txb_itr]),
    5167             :                     COMPONENT_LUMA,
    5168             :                     picture_control_set_ptr->hbd_mode_decision ? BIT_INCREMENT_10BIT : BIT_INCREMENT_8BIT,
    5169             :                     tx_type,
    5170             :                     candidate_buffer,
    5171             :                     context_ptr->luma_txb_skip_context,
    5172             :                     context_ptr->luma_dc_sign_context,
    5173             :                     candidate_buffer->candidate_ptr->pred_mode,
    5174             :                     EB_FALSE,
    5175             :                     EB_FALSE);
    5176             : 
    5177             :                 candidate_buffer->candidate_ptr->quantized_dc[0][context_ptr->txb_itr] = (((int32_t*)candidate_buffer->residual_quant_coeff_ptr->buffer_y)[txb_1d_offset]);
    5178             : 
    5179             :                 uint32_t y_has_coeff = y_count_non_zero_coeffs[context_ptr->txb_itr] > 0;
    5180             : 
    5181             :                 // tx_type not equal to DCT_DCT and no coeff is not an acceptable option in AV1.
    5182             :                 if (y_has_coeff == 0 && tx_type != DCT_DCT)
    5183             :                     continue;
    5184             :                 if (y_has_coeff)
    5185             :                     inv_transform_recon_wrapper(
    5186             :                         candidate_buffer->prediction_ptr->buffer_y,
    5187             :                         tu_origin_index,
    5188             :                         candidate_buffer->prediction_ptr->stride_y,
    5189             :                         candidate_buffer->recon_ptr->buffer_y,
    5190             :                         tu_origin_index,
    5191             :                         candidate_buffer->recon_ptr->stride_y,
    5192             :                         (int32_t*) candidate_buffer->recon_coeff_ptr->buffer_y,
    5193             :                         txb_1d_offset,
    5194             :                         picture_control_set_ptr->hbd_mode_decision,
    5195             :                         context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5196             :                         tx_type,
    5197             :                         PLANE_TYPE_Y,
    5198             :                         (uint16_t)candidate_buffer->candidate_ptr->eob[0][context_ptr->txb_itr]);
    5199             :                 else
    5200             :                     picture_copy(
    5201             :                         candidate_buffer->prediction_ptr,
    5202             :                         tu_origin_index,
    5203             :                         0,
    5204             :                         candidate_buffer->recon_ptr,
    5205             :                         tu_origin_index,
    5206             :                         0,
    5207             :                         context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5208             :                         context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    5209             :                         0,
    5210             :                         0,
    5211             :                         PICTURE_BUFFER_DESC_Y_FLAG,
    5212             :                         picture_control_set_ptr->hbd_mode_decision);
    5213             : 
    5214             :                 EbSpatialFullDistType spatial_full_dist_type_fun = context_ptr->hbd_mode_decision ?
    5215             :                         full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    5216             : 
    5217             :                 tuFullDistortion[0][DIST_CALC_PREDICTION] = spatial_full_dist_type_fun(
    5218             :                     input_picture_ptr->buffer_y,
    5219             :                     input_tu_origin_index,
    5220             :                     input_picture_ptr->stride_y,
    5221             :                     candidate_buffer->prediction_ptr->buffer_y,
    5222             :                     tu_origin_index,
    5223             :                     candidate_buffer->prediction_ptr->stride_y,
    5224             :                     context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5225             :                     context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    5226             : 
    5227             :                 tuFullDistortion[0][DIST_CALC_RESIDUAL] = spatial_full_dist_type_fun(
    5228             :                     input_picture_ptr->buffer_y,
    5229             :                     input_tu_origin_index,
    5230             :                     input_picture_ptr->stride_y,
    5231             :                     candidate_buffer->recon_ptr->buffer_y,
    5232             :                     tu_origin_index,
    5233             :                     candidate_buffer->recon_ptr->stride_y,
    5234             :                     context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5235             :                     context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    5236             : 
    5237             :                 tuFullDistortion[0][DIST_CALC_PREDICTION] <<= 4;
    5238             :                 tuFullDistortion[0][DIST_CALC_RESIDUAL] <<= 4;
    5239             : 
    5240             :                 //LUMA-ONLY
    5241             :                 av1_tu_estimate_coeff_bits(
    5242             :                     context_ptr,
    5243             :                     0,   //allow_update_cdf,
    5244             :                     NULL,//FRAME_CONTEXT *ec_ctx,
    5245             :                     picture_control_set_ptr,
    5246             :                     candidate_buffer,
    5247             :                     txb_1d_offset,
    5248             :                     0,
    5249             :                     context_ptr->coeff_est_entropy_coder_ptr,
    5250             :                     candidate_buffer->residual_quant_coeff_ptr,
    5251             :                     y_count_non_zero_coeffs[context_ptr->txb_itr],
    5252             :                     0,
    5253             :                     0,
    5254             :                     &y_tu_coeff_bits,
    5255             :                     &y_tu_coeff_bits,
    5256             :                     &y_tu_coeff_bits,
    5257             :                     context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5258             :                     context_ptr->blk_geom->txsize_uv[context_ptr->tx_depth][context_ptr->txb_itr],
    5259             :                     tx_type,
    5260             :                     candidate_buffer->candidate_ptr->transform_type_uv,
    5261             :                     COMPONENT_LUMA);
    5262             : 
    5263             :                 uint64_t cost = RDCOST(context_ptr->full_lambda, y_tu_coeff_bits, tuFullDistortion[0][DIST_CALC_RESIDUAL]);
    5264             :                 if (cost < best_cost_tx_search) {
    5265             :                     best_cost_tx_search = cost;
    5266             :                     best_tx_type = tx_type;
    5267             :                 }
    5268             :             }
    5269             : 
    5270             :             // Record the best tx type @ depth 0
    5271             :             best_tx_type_depth_0 = (context_ptr->tx_depth == 0) ? best_tx_type : best_tx_type_depth_0;
    5272             :             }
    5273             :             //  Best Tx Type Pass
    5274             :             candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr] = best_tx_type;
    5275             : 
    5276             :             y_tu_coeff_bits = 0;
    5277             : 
    5278             : 
    5279             :             // Y: T Q iQ
    5280             :             av1_estimate_transform(
    5281             :                 &(((int16_t*)candidate_buffer->residual_ptr->buffer_y)[tu_origin_index]),
    5282             :                 candidate_buffer->residual_ptr->stride_y,
    5283             :                 &(((int32_t*)context_ptr->trans_quant_buffers_ptr->tu_trans_coeff2_nx2_n_ptr->buffer_y)[txb_1d_offset]),
    5284             :                 NOT_USED_VALUE,
    5285             :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5286             :                 &context_ptr->three_quad_energy,
    5287             :                 context_ptr->transform_inner_array_ptr,
    5288             :                 picture_control_set_ptr->hbd_mode_decision ? BIT_INCREMENT_10BIT : BIT_INCREMENT_8BIT,
    5289             :                 candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr],
    5290             :                 PLANE_TYPE_Y,
    5291             :                 DEFAULT_SHAPE);
    5292             : 
    5293             :             int32_t seg_qp = picture_control_set_ptr->parent_pcs_ptr->frm_hdr.segmentation_params.segmentation_enabled ?
    5294             :                              picture_control_set_ptr->parent_pcs_ptr->frm_hdr.segmentation_params.feature_data[context_ptr->cu_ptr->segment_id][SEG_LVL_ALT_Q] : 0;
    5295             : 
    5296             :             candidate_buffer->candidate_ptr->quantized_dc[0][context_ptr->txb_itr] = av1_quantize_inv_quantize(
    5297             :                 picture_control_set_ptr,
    5298             :                 context_ptr,
    5299             :                 &(((int32_t*)context_ptr->trans_quant_buffers_ptr->tu_trans_coeff2_nx2_n_ptr->buffer_y)[txb_1d_offset]),
    5300             :                 NOT_USED_VALUE,
    5301             :                 &(((int32_t*)candidate_buffer->residual_quant_coeff_ptr->buffer_y)[txb_1d_offset]),
    5302             :                 &(((int32_t*)candidate_buffer->recon_coeff_ptr->buffer_y)[txb_1d_offset]),
    5303             :                 qp,
    5304             :                 seg_qp,
    5305             :                 context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5306             :                 context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    5307             :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5308             :                 &candidate_buffer->candidate_ptr->eob[0][context_ptr->txb_itr],
    5309             :                 &(y_count_non_zero_coeffs[context_ptr->txb_itr]),
    5310             :                 COMPONENT_LUMA,
    5311             :                 picture_control_set_ptr->hbd_mode_decision ? BIT_INCREMENT_10BIT : BIT_INCREMENT_8BIT,
    5312             :                 candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr],
    5313             :                 candidate_buffer,
    5314             :                 context_ptr->luma_txb_skip_context,
    5315             :                 context_ptr->luma_dc_sign_context,
    5316             :                 candidate_buffer->candidate_ptr->pred_mode,
    5317             :                 EB_FALSE,
    5318             :                 EB_FALSE);
    5319             :             uint32_t y_has_coeff = y_count_non_zero_coeffs[context_ptr->txb_itr] > 0;
    5320             : 
    5321             :             if (y_has_coeff)
    5322             :                 inv_transform_recon_wrapper(
    5323             :                     candidate_buffer->prediction_ptr->buffer_y,
    5324             :                     tu_origin_index,
    5325             :                     candidate_buffer->prediction_ptr->stride_y,
    5326             :                     candidate_buffer->recon_ptr->buffer_y,
    5327             :                     tu_origin_index,
    5328             :                     candidate_buffer->recon_ptr->stride_y,
    5329             :                     (int32_t*) candidate_buffer->recon_coeff_ptr->buffer_y,
    5330             :                     txb_1d_offset,
    5331             :                     picture_control_set_ptr->hbd_mode_decision,
    5332             :                     context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5333             :                     candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr],
    5334             :                     PLANE_TYPE_Y,
    5335             :                     (uint16_t)candidate_buffer->candidate_ptr->eob[0][context_ptr->txb_itr]);
    5336             :             else
    5337             :                 picture_copy(
    5338             :                     candidate_buffer->prediction_ptr,
    5339             :                     tu_origin_index,
    5340             :                     0,
    5341             :                     candidate_buffer->recon_ptr,
    5342             :                     tu_origin_index,
    5343             :                     0,
    5344             :                     context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5345             :                     context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    5346             :                     0,
    5347             :                     0,
    5348             :                     PICTURE_BUFFER_DESC_Y_FLAG,
    5349             :                     picture_control_set_ptr->hbd_mode_decision);
    5350             : 
    5351             :             EbSpatialFullDistType spatial_full_dist_type_fun = context_ptr->hbd_mode_decision ?
    5352             :                 full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    5353             : 
    5354             :             tuFullDistortion[0][DIST_CALC_PREDICTION] = spatial_full_dist_type_fun(
    5355             :                 input_picture_ptr->buffer_y,
    5356             :                 input_tu_origin_index,
    5357             :                 input_picture_ptr->stride_y,
    5358             :                 candidate_buffer->prediction_ptr->buffer_y,
    5359             :                 tu_origin_index,
    5360             :                 candidate_buffer->prediction_ptr->stride_y,
    5361             :                 context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5362             :                 context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    5363             : 
    5364             :             tuFullDistortion[0][DIST_CALC_RESIDUAL] = spatial_full_dist_type_fun(
    5365             :                 input_picture_ptr->buffer_y,
    5366             :                 input_tu_origin_index,
    5367             :                 input_picture_ptr->stride_y,
    5368             :                 candidate_buffer->recon_ptr->buffer_y,
    5369             :                 tu_origin_index,
    5370             :                 candidate_buffer->recon_ptr->stride_y,
    5371             :                 context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5372             :                 context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    5373             : 
    5374             :             tuFullDistortion[0][DIST_CALC_PREDICTION] <<= 4;
    5375             :             tuFullDistortion[0][DIST_CALC_RESIDUAL] <<= 4;
    5376             : 
    5377             :             //LUMA-ONLY
    5378             :             av1_tu_estimate_coeff_bits(
    5379             :                 context_ptr,
    5380             :                 0,   //allow_update_cdf,
    5381             :                 NULL,//FRAME_CONTEXT *ec_ctx,
    5382             :                 picture_control_set_ptr,
    5383             :                 candidate_buffer,
    5384             :                 txb_1d_offset,
    5385             :                 0,
    5386             :                 context_ptr->coeff_est_entropy_coder_ptr,
    5387             :                 candidate_buffer->residual_quant_coeff_ptr,
    5388             :                 y_count_non_zero_coeffs[context_ptr->txb_itr],
    5389             :                 0,
    5390             :                 0,
    5391             :                 &y_tu_coeff_bits,
    5392             :                 &y_tu_coeff_bits,
    5393             :                 &y_tu_coeff_bits,
    5394             :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5395             :                 context_ptr->blk_geom->txsize_uv[context_ptr->tx_depth][context_ptr->txb_itr],
    5396             :                 candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr],
    5397             :                 candidate_buffer->candidate_ptr->transform_type_uv,
    5398             :                 COMPONENT_LUMA);
    5399             : 
    5400             :             av1_tu_calc_cost_luma(
    5401             :                 context_ptr->luma_txb_skip_context,
    5402             :                 candidate_buffer->candidate_ptr,
    5403             :                 context_ptr->txb_itr,
    5404             :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5405             :                 y_count_non_zero_coeffs[context_ptr->txb_itr],
    5406             :                 tuFullDistortion[0],
    5407             :                 &y_tu_coeff_bits,
    5408             :                 &y_full_cost,
    5409             :                 context_ptr->full_lambda);
    5410             : 
    5411             :             (*y_coeff_bits) += y_tu_coeff_bits;
    5412             : 
    5413             :             y_full_distortion[DIST_CALC_RESIDUAL] += tuFullDistortion[0][DIST_CALC_RESIDUAL];
    5414             :             y_full_distortion[DIST_CALC_PREDICTION] += tuFullDistortion[0][DIST_CALC_PREDICTION];
    5415             : 
    5416             :             txb_1d_offset += context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr] * context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr];
    5417             : 
    5418             :             if (context_ptr->tx_depth)
    5419             :             {
    5420             :                 NeighborArrayUnit *tx_search_luma_recon =
    5421             :                     context_ptr->hbd_mode_decision ? context_ptr->tx_search_luma_recon_neighbor_array16bit : context_ptr->tx_search_luma_recon_neighbor_array;
    5422             : 
    5423             :                 tx_search_update_recon_sample_neighbor_array(
    5424             :                     tx_search_luma_recon,
    5425             :                     candidate_buffer->recon_ptr,
    5426             :                     context_ptr->blk_geom->tx_org_x[context_ptr->tx_depth][context_ptr->txb_itr],
    5427             :                     context_ptr->blk_geom->tx_org_y[context_ptr->tx_depth][context_ptr->txb_itr],
    5428             :                     context_ptr->sb_origin_x + context_ptr->blk_geom->tx_org_x[context_ptr->tx_depth][context_ptr->txb_itr],
    5429             :                     context_ptr->sb_origin_y + context_ptr->blk_geom->tx_org_y[context_ptr->tx_depth][context_ptr->txb_itr],
    5430             :                     context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5431             :                     context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    5432             :                     context_ptr->hbd_mode_decision);
    5433             : 
    5434             :                 int8_t dc_sign_level_coeff = candidate_buffer->candidate_ptr->quantized_dc[0][context_ptr->txb_itr];
    5435             :                 neighbor_array_unit_mode_write(
    5436             :                     picture_control_set_ptr->md_tx_depth_1_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    5437             :                     (uint8_t*)&dc_sign_level_coeff,
    5438             :                     context_ptr->sb_origin_x + context_ptr->blk_geom->tx_org_x[context_ptr->tx_depth][context_ptr->txb_itr],
    5439             :                     context_ptr->sb_origin_y + context_ptr->blk_geom->tx_org_y[context_ptr->tx_depth][context_ptr->txb_itr],
    5440             :                     context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5441             :                     context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    5442             :                     NEIGHBOR_ARRAY_UNIT_TOP_AND_LEFT_ONLY_MASK);
    5443             :             }
    5444             :         } // Transform Loop
    5445             : 
    5446             :         // To do: estimate the cost of tx size = tx_size_bits
    5447             :         uint64_t cost = RDCOST(context_ptr->full_lambda, (*y_coeff_bits), y_full_distortion[DIST_CALC_RESIDUAL]);
    5448             : 
    5449             :         if (cost < best_cost_search) {
    5450             :             best_cost_search = cost;
    5451             :             best_tx_depth = context_ptr->tx_depth;
    5452             :         }
    5453             :     } // Transform Depth Loop
    5454             : 
    5455             :     // ATB Recon
    5456             :     context_ptr->tx_depth = candidate_buffer->candidate_ptr->tx_depth = best_tx_depth;
    5457             : 
    5458             :     if (context_ptr->tx_depth == 0) {
    5459             :         // Set recon neighbor array to be used @ intra compensation
    5460             :         if (context_ptr->hbd_mode_decision)
    5461             :             context_ptr->tx_search_luma_recon_neighbor_array16bit = picture_control_set_ptr->md_luma_recon_neighbor_array16bit[MD_NEIGHBOR_ARRAY_INDEX];
    5462             :         else
    5463             :             context_ptr->tx_search_luma_recon_neighbor_array = picture_control_set_ptr->md_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    5464             : 
    5465             :         // Initialize TU Split
    5466             :         y_full_distortion[DIST_CALC_RESIDUAL] = 0;
    5467             :         y_full_distortion[DIST_CALC_PREDICTION] = 0;
    5468             :         *y_coeff_bits = 0;
    5469             :         txb_1d_offset = 0;
    5470             :         context_ptr->three_quad_energy = 0;
    5471             :         candidate_buffer->candidate_ptr->y_has_coeff = 0;
    5472             : 
    5473             :         uint16_t txb_count = context_ptr->blk_geom->txb_count[context_ptr->tx_depth];
    5474             :         for (context_ptr->txb_itr = 0; context_ptr->txb_itr < txb_count; context_ptr->txb_itr++) {
    5475             :             uint16_t tx_org_x = context_ptr->blk_geom->tx_org_x[context_ptr->tx_depth][context_ptr->txb_itr];
    5476             :             uint16_t tx_org_y = context_ptr->blk_geom->tx_org_y[context_ptr->tx_depth][context_ptr->txb_itr];
    5477             : 
    5478             :             context_ptr->luma_txb_skip_context = 0;
    5479             :             context_ptr->luma_dc_sign_context = 0;
    5480             :             get_txb_ctx(
    5481             :                 sequence_control_set_ptr,
    5482             :                 COMPONENT_LUMA,
    5483             :                 picture_control_set_ptr->md_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX],
    5484             :                 context_ptr->sb_origin_x + tx_org_x,
    5485             :                 context_ptr->sb_origin_y + tx_org_y,
    5486             :                 context_ptr->blk_geom->bsize,
    5487             :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5488             :                 &context_ptr->luma_txb_skip_context,
    5489             :                 &context_ptr->luma_dc_sign_context);
    5490             : 
    5491             :             tu_origin_index = tx_org_x + (tx_org_y * candidate_buffer->residual_ptr->stride_y);
    5492             :             y_tu_coeff_bits = 0;
    5493             : 
    5494             :             uint32_t input_tu_origin_index = (context_ptr->sb_origin_x + tx_org_x + input_picture_ptr->origin_x) + ((context_ptr->sb_origin_y + tx_org_y + input_picture_ptr->origin_y) * input_picture_ptr->stride_y);
    5495             : 
    5496             :             // Y Prediction
    5497             :             av1_intra_luma_prediction(
    5498             :                 context_ptr,
    5499             :                 picture_control_set_ptr,
    5500             :                 candidate_buffer);
    5501             : 
    5502             :             // Y Residual
    5503             :             residual_kernel(
    5504             :                 input_picture_ptr->buffer_y,
    5505             :                 input_tu_origin_index,
    5506             :                 input_picture_ptr->stride_y,
    5507             :                 candidate_buffer->prediction_ptr->buffer_y,
    5508             :                 tu_origin_index,
    5509             :                 candidate_buffer->prediction_ptr->stride_y,
    5510             :                 (int16_t*)candidate_buffer->residual_ptr->buffer_y,
    5511             :                 tu_origin_index,
    5512             :                 candidate_buffer->residual_ptr->stride_y,
    5513             :                 context_ptr->hbd_mode_decision,
    5514             :                 context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5515             :                 context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    5516             : 
    5517             :             // Get the depth 0 best tx type
    5518             :             candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr] = best_tx_type_depth_0;
    5519             : 
    5520             :             y_tu_coeff_bits = 0;
    5521             : 
    5522             :             // Y: T Q iQ
    5523             :             av1_estimate_transform(
    5524             :                 &(((int16_t*)candidate_buffer->residual_ptr->buffer_y)[tu_origin_index]),
    5525             :                 candidate_buffer->residual_ptr->stride_y,
    5526             :                 &(((int32_t*)context_ptr->trans_quant_buffers_ptr->tu_trans_coeff2_nx2_n_ptr->buffer_y)[txb_1d_offset]),
    5527             :                 NOT_USED_VALUE,
    5528             :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5529             :                 &context_ptr->three_quad_energy,
    5530             :                 context_ptr->transform_inner_array_ptr,
    5531             :                 picture_control_set_ptr->hbd_mode_decision ? BIT_INCREMENT_10BIT : BIT_INCREMENT_8BIT,
    5532             :                 candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr],
    5533             :                 PLANE_TYPE_Y,
    5534             :                 DEFAULT_SHAPE);
    5535             : 
    5536             :             int32_t seg_qp = picture_control_set_ptr->parent_pcs_ptr->frm_hdr.segmentation_params.segmentation_enabled ?
    5537             :                              picture_control_set_ptr->parent_pcs_ptr->frm_hdr.segmentation_params.feature_data[context_ptr->cu_ptr->segment_id][SEG_LVL_ALT_Q] : 0;
    5538             :             candidate_buffer->candidate_ptr->quantized_dc[0][context_ptr->txb_itr] = av1_quantize_inv_quantize(
    5539             :                 picture_control_set_ptr,
    5540             :                 context_ptr,
    5541             :                 &(((int32_t*)context_ptr->trans_quant_buffers_ptr->tu_trans_coeff2_nx2_n_ptr->buffer_y)[txb_1d_offset]),
    5542             :                 NOT_USED_VALUE,
    5543             :                 &(((int32_t*)candidate_buffer->residual_quant_coeff_ptr->buffer_y)[txb_1d_offset]),
    5544             :                 &(((int32_t*)candidate_buffer->recon_coeff_ptr->buffer_y)[txb_1d_offset]),
    5545             :                 qp,
    5546             :                 seg_qp,
    5547             :                 context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5548             :                 context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    5549             :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5550             :                 &candidate_buffer->candidate_ptr->eob[0][context_ptr->txb_itr],
    5551             :                 &(y_count_non_zero_coeffs[context_ptr->txb_itr]),
    5552             :                 COMPONENT_LUMA,
    5553             :                 picture_control_set_ptr->hbd_mode_decision ? BIT_INCREMENT_10BIT : BIT_INCREMENT_8BIT,
    5554             :                 candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr],
    5555             :                 candidate_buffer,
    5556             :                 context_ptr->luma_txb_skip_context,
    5557             :                 context_ptr->luma_dc_sign_context,
    5558             :                 candidate_buffer->candidate_ptr->pred_mode,
    5559             :                 EB_FALSE,
    5560             :                 EB_FALSE);
    5561             :             uint32_t y_has_coeff = y_count_non_zero_coeffs[context_ptr->txb_itr] > 0;
    5562             : 
    5563             :             if (y_has_coeff)
    5564             :                 inv_transform_recon_wrapper(
    5565             :                     candidate_buffer->prediction_ptr->buffer_y,
    5566             :                     tu_origin_index,
    5567             :                     candidate_buffer->prediction_ptr->stride_y,
    5568             :                     candidate_buffer->recon_ptr->buffer_y,
    5569             :                     tu_origin_index,
    5570             :                     candidate_buffer->recon_ptr->stride_y,
    5571             :                     (int32_t*) candidate_buffer->recon_coeff_ptr->buffer_y,
    5572             :                     txb_1d_offset,
    5573             :                     picture_control_set_ptr->hbd_mode_decision,
    5574             :                     context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5575             :                     candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr],
    5576             :                     PLANE_TYPE_Y,
    5577             :                     (uint16_t)candidate_buffer->candidate_ptr->eob[0][context_ptr->txb_itr]);
    5578             : 
    5579             :             else
    5580             :                 picture_copy(
    5581             :                     candidate_buffer->prediction_ptr,
    5582             :                     tu_origin_index,
    5583             :                     0,
    5584             :                     candidate_buffer->recon_ptr,
    5585             :                     tu_origin_index,
    5586             :                     0,
    5587             :                     context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5588             :                     context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr],
    5589             :                     0,
    5590             :                     0,
    5591             :                     PICTURE_BUFFER_DESC_Y_FLAG,
    5592             :                     picture_control_set_ptr->hbd_mode_decision);
    5593             : 
    5594             :             EbSpatialFullDistType spatial_full_dist_type_fun = context_ptr->hbd_mode_decision ?
    5595             :                 full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    5596             : 
    5597             :             tuFullDistortion[0][DIST_CALC_PREDICTION] = spatial_full_dist_type_fun(
    5598             :                 input_picture_ptr->buffer_y,
    5599             :                 input_tu_origin_index,
    5600             :                 input_picture_ptr->stride_y,
    5601             :                 candidate_buffer->prediction_ptr->buffer_y,
    5602             :                 tu_origin_index,
    5603             :                 candidate_buffer->prediction_ptr->stride_y,
    5604             :                 context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5605             :                 context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    5606             : 
    5607             :             tuFullDistortion[0][DIST_CALC_RESIDUAL] = spatial_full_dist_type_fun(
    5608             :                 input_picture_ptr->buffer_y,
    5609             :                 input_tu_origin_index,
    5610             :                 input_picture_ptr->stride_y,
    5611             :                 candidate_buffer->recon_ptr->buffer_y,
    5612             :                 tu_origin_index,
    5613             :                 candidate_buffer->recon_ptr->stride_y,
    5614             :                 context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr],
    5615             :                 context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr]);
    5616             : 
    5617             :             tuFullDistortion[0][DIST_CALC_PREDICTION] <<= 4;
    5618             :             tuFullDistortion[0][DIST_CALC_RESIDUAL] <<= 4;
    5619             : 
    5620             :             //LUMA-ONLY
    5621             :             av1_tu_estimate_coeff_bits(
    5622             :                 context_ptr,
    5623             :                 0,   //allow_update_cdf,
    5624             :                 NULL,//FRAME_CONTEXT *ec_ctx,
    5625             :                 picture_control_set_ptr,
    5626             :                 candidate_buffer,
    5627             :                 txb_1d_offset,
    5628             :                 0,
    5629             :                 context_ptr->coeff_est_entropy_coder_ptr,
    5630             :                 candidate_buffer->residual_quant_coeff_ptr,
    5631             :                 y_count_non_zero_coeffs[context_ptr->txb_itr],
    5632             :                 0,
    5633             :                 0,
    5634             :                 &y_tu_coeff_bits,
    5635             :                 &y_tu_coeff_bits,
    5636             :                 &y_tu_coeff_bits,
    5637             :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5638             :                 context_ptr->blk_geom->txsize_uv[context_ptr->tx_depth][context_ptr->txb_itr],
    5639             :                 candidate_buffer->candidate_ptr->transform_type[context_ptr->txb_itr],
    5640             :                 candidate_buffer->candidate_ptr->transform_type_uv,
    5641             :                 COMPONENT_LUMA);
    5642             : 
    5643             :             av1_tu_calc_cost_luma(
    5644             :                 context_ptr->luma_txb_skip_context,
    5645             :                 candidate_buffer->candidate_ptr,
    5646             :                 context_ptr->txb_itr,
    5647             :                 context_ptr->blk_geom->txsize[context_ptr->tx_depth][context_ptr->txb_itr],
    5648             :                 y_count_non_zero_coeffs[context_ptr->txb_itr],
    5649             :                 tuFullDistortion[0],
    5650             :                 &y_tu_coeff_bits,
    5651             :                 &y_full_cost,
    5652             :                 context_ptr->full_lambda);
    5653             : 
    5654             :             (*y_coeff_bits) += y_tu_coeff_bits;
    5655             : 
    5656             :             y_full_distortion[DIST_CALC_RESIDUAL] += tuFullDistortion[0][DIST_CALC_RESIDUAL];
    5657             :             y_full_distortion[DIST_CALC_PREDICTION] += tuFullDistortion[0][DIST_CALC_PREDICTION];
    5658             : 
    5659             :             txb_1d_offset += context_ptr->blk_geom->tx_width[context_ptr->tx_depth][context_ptr->txb_itr] * context_ptr->blk_geom->tx_height[context_ptr->tx_depth][context_ptr->txb_itr];
    5660             :         } // Transform Loop
    5661             :     }
    5662             : }
    5663             : #endif
    5664             : 
    5665             : 
    /*
     * full_loop_core: full-loop evaluation of one mode decision candidate.
     *
     * Steps, in the order the code performs them:
     *   1. Reset the candidate's distortion fields, the first eob entry of each
     *      plane and the skip flag.
     *   2. For non-INTRA candidates, call the prediction function selected by
     *      ProductPredictionFunTable[candidate_ptr->type] (subject to the
     *      md_staging / interpolation-search conditions in the code).
     *   3. Clear the y/u/v has_coeff flags and set transform_type[0..3] to DCT_DCT.
     *   4. Luma: either the transform partitioning call (tx_partitioning_path with
     *      ENHANCE_ATB, perform_intra_tx_partitioning otherwise), or
     *      residual_kernel + optional product_full_loop_tx_search +
     *      product_full_loop.
     *   5. Chroma (skipped entirely when context_ptr->md_staging_skip_full_chroma
     *      is set): Cb/Cr residuals, optional CflPrediction, full_loop_r,
     *      cu_full_distortion_fast_tu_mode_r and, for CHROMA_MODE_0,
     *      check_best_indepedant_cfl.
     *   6. Call Av1ProductFullCostFuncTable[candidate_ptr->type] and copy the
     *      per-plane distortions and coefficient bits into candidate_buffer.
     *
     * Parameters:
     *   picture_control_set_ptr  read for parent_pcs_ptr settings (atb_mode,
     *                            tx_search_level, tx_weight, frame dimensions) and
     *                            forwarded to the helpers.
     *   sb_ptr                   only forwarded (CflPrediction, full_loop_r,
     *                            cu_full_distortion_fast_tu_mode_r).
     *   cu_ptr                   only forwarded to the full cost function.
     *   context_ptr              mode decision context: blk_geom, md_staging_*
     *                            switches, qp, full_lambda; tx_depth, txb_itr and
     *                            txb_1d_offset are written here under ENHANCE_ATB.
     *   candidate_buffer         supplies prediction_ptr / residual_ptr and receives
     *                            the cb/cr/y distortion and coefficient-bit results.
     *   candidate_ptr            candidate being evaluated.
     *                            NOTE(review): the body uses candidate_ptr and
     *                            candidate_buffer->candidate_ptr interchangeably;
     *                            presumably they are the same object - confirm.
     *   input_picture_ptr        source picture (buffer_y / buffer_cb / buffer_cr).
     *   inputOriginIndex         index passed with input_picture_ptr->buffer_y to
     *                            residual_kernel.
     *   inputCbOriginIndex       index passed with both the Cb and the Cr input
     *                            buffers.
     *   cuOriginIndex            index passed with the luma prediction and residual
     *                            buffers.
     *   cuChromaOriginIndex      index passed with the chroma prediction and
     *                            residual buffers.
     *   ref_fast_cost            forwarded to the transform partitioning call and to
     *                            get_skip_tx_search_flag.
     */
    5666    37535300 : void full_loop_core(
    5667             :     PictureControlSet           *picture_control_set_ptr,
    5668             :     LargestCodingUnit           *sb_ptr,
    5669             :     CodingUnit                  *cu_ptr,
    5670             :     ModeDecisionContext         *context_ptr,
    5671             :     ModeDecisionCandidateBuffer *candidate_buffer,
    5672             :     ModeDecisionCandidate       *candidate_ptr,
    5673             :     EbPictureBufferDesc         *input_picture_ptr,
    5674             :     uint32_t                     inputOriginIndex,
    5675             :     uint32_t                     inputCbOriginIndex,
    5676             :     uint32_t                     cuOriginIndex,
    5677             :     uint32_t                     cuChromaOriginIndex,
    5678             :     uint64_t                     ref_fast_cost)
    5679             : {
    5680             :     uint64_t      y_full_distortion[DIST_CALC_TOTAL];
    5681             :     uint32_t      count_non_zero_coeffs[3][MAX_NUM_OF_TU_PER_CU];
    5682             : 
    5683             :     uint64_t      cbFullDistortion[DIST_CALC_TOTAL];
    5684             :     uint64_t      crFullDistortion[DIST_CALC_TOTAL];
    5685             : 
    5686             :     uint64_t      y_coeff_bits;
    5687    37535300 :     uint64_t      cb_coeff_bits = 0;
    5688    37535300 :     uint64_t      cr_coeff_bits = 0;
    5689             : 
    5690             :         // Reset the luma distortion and coefficient-bit accumulators
    5691    37535300 :         y_full_distortion[DIST_CALC_RESIDUAL] = 0;
    5692    37535300 :         y_full_distortion[DIST_CALC_PREDICTION] = 0;
    5693    37535300 :         y_coeff_bits = 0;
    5694             : 
    5695    37535300 :         candidate_ptr->full_distortion = 0;
    5696             : 
                               // Each memset clears sizeof(uint16_t) bytes, i.e. only the first
                               // uint16_t at eob[plane].
                               // NOTE(review): if eob[plane] holds more than one entry, the
                               // remaining entries are left untouched - confirm this is intended.
    5697    37535300 :         memset(candidate_ptr->eob[0], 0, sizeof(uint16_t));
    5698    37535300 :         memset(candidate_ptr->eob[1], 0, sizeof(uint16_t));
    5699    37535300 :         memset(candidate_ptr->eob[2], 0, sizeof(uint16_t));
    5700             : 
    5701    37535300 :         candidate_ptr->chroma_distortion = 0;
    5702    37535300 :         candidate_ptr->chroma_distortion_inter_depth = 0;
    5703             :         // Set Skip Flag
    5704    37535300 :         candidate_ptr->skip_flag = EB_FALSE;
    5705             : 
                               // Prediction for non-INTRA candidates. With REMOVE_MD_STAGE_1 it runs
                               // only when md_staging_skip_full_pred is EB_FALSE. In the #else build
                               // the two nested ifs share the single closing brace after the call.
    5706    37535300 :         if (candidate_ptr->type != INTRA_MODE) {
    5707             : #if REMOVE_MD_STAGE_1
    5708    31690800 :             if (context_ptr->md_staging_skip_full_pred == EB_FALSE) {
    5709             : #else
    5710             :             if (picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level > IT_SEARCH_OFF)
    5711             :                 if (picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level == IT_SEARCH_FULL_LOOP || context_ptr->md_staging_skip_full_pred == EB_FALSE) {
    5712             :                     context_ptr->md_staging_skip_interpolation_search = EB_FALSE;
    5713             :                     context_ptr->md_staging_skip_inter_chroma_pred = EB_FALSE;
    5714             : #endif
    5715    31588200 :                     ProductPredictionFunTable[candidate_ptr->type](
    5716             :                         context_ptr,
    5717             :                         picture_control_set_ptr,
    5718             :                         candidate_buffer);
    5719             :                 }
    5720             :         }
    5721             : 
    5722             :         // Initialize luma and chroma CBF flags
    5723    37538800 :         candidate_ptr->y_has_coeff = 0;
    5724    37538800 :         candidate_ptr->u_has_coeff = 0;
    5725    37538800 :         candidate_ptr->v_has_coeff = 0;
    5726             : 
    5727             :         // Initialize tx type
    5728    37538800 :         candidate_ptr->transform_type[0] = DCT_DCT;
    5729    37538800 :         candidate_ptr->transform_type[1] = DCT_DCT;
    5730    37538800 :         candidate_ptr->transform_type[2] = DCT_DCT;
    5731    37538800 :         candidate_ptr->transform_type[3] = DCT_DCT;
    5732             : 
    5733    37538800 :         uint8_t end_tx_depth = 0;
    5734             :         // end_tx_depth stays zero unless the block ends strictly inside the frame (the comparisons are '<', so a block ending exactly on the right/bottom edge is excluded too)
    5735    37538800 :         if ((context_ptr->sb_origin_x + context_ptr->blk_geom->origin_x + context_ptr->blk_geom->bwidth < picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header.max_frame_width &&
    5736    36580100 :             context_ptr->sb_origin_y + context_ptr->blk_geom->origin_y + context_ptr->blk_geom->bheight < picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header.max_frame_height))
    5737    35503400 :             end_tx_depth = get_end_tx_depth(context_ptr->blk_geom->bsize, candidate_buffer->candidate_ptr->type);
    5738             :         else
    5739     2035420 :             end_tx_depth = 0;
    5740             :         // Transform partitioning path (with ENHANCE_ATB: any non-intrabc candidate; otherwise INTRA luma only)
    5741             : #if ENHANCE_ATB
    5742    38345000 :         if (picture_control_set_ptr->parent_pcs_ptr->atb_mode && context_ptr->md_staging_skip_atb == EB_FALSE && end_tx_depth && candidate_buffer->candidate_ptr->use_intrabc == 0) {
                                   // The enclosing condition already requires use_intrabc == 0, so the
                                   // use_intrabc term below cannot change the result in this branch.
    5743      813707 :             int32_t is_inter = (candidate_buffer->candidate_ptr->type == INTER_MODE || candidate_buffer->candidate_ptr->use_intrabc) ? EB_TRUE : EB_FALSE;
    5744             : 
    5745             :             //Y Residual: residual for INTRA is computed inside the TU loop
    5746      813707 :             if (is_inter)
    5747             :                 //Y Residual
    5748      385319 :                 residual_kernel(
    5749             :                     input_picture_ptr->buffer_y,
    5750             :                     inputOriginIndex,
    5751      385319 :                     input_picture_ptr->stride_y,
    5752      385319 :                     candidate_buffer->prediction_ptr->buffer_y,
    5753             :                     cuOriginIndex,
    5754      385319 :                     candidate_buffer->prediction_ptr->stride_y,
    5755      385319 :                     (int16_t*)candidate_buffer->residual_ptr->buffer_y,
    5756             :                     cuOriginIndex,
    5757      385319 :                     candidate_buffer->residual_ptr->stride_y,
    5758      385319 :                     context_ptr->hbd_mode_decision,
    5759      385319 :                     context_ptr->blk_geom->bwidth,
    5760      385319 :                     context_ptr->blk_geom->bheight);
    5761             : 
    5762      813706 :             tx_partitioning_path(
    5763             : #else
    5764             :         if (picture_control_set_ptr->parent_pcs_ptr->atb_mode && context_ptr->md_staging_skip_atb == EB_FALSE && end_tx_depth && candidate_buffer->candidate_ptr->type == INTRA_MODE && candidate_buffer->candidate_ptr->use_intrabc == 0) {
    5765             :             perform_intra_tx_partitioning(
    5766             : #endif
                                   // Argument list shared by both preprocessor variants of the call.
    5767             :                 candidate_buffer,
    5768             :                 context_ptr,
    5769             :                 picture_control_set_ptr,
    5770             :                 ref_fast_cost,
    5771             :                 end_tx_depth,
    5772      813706 :                 context_ptr->cu_ptr->qp,
    5773             :                 &(*count_non_zero_coeffs[0]),
    5774             :                 &y_coeff_bits,
    5775             :                 &y_full_distortion[0]);
    5776             :         }
    5777             :         else {
    5778             :             // Transform partitioning free path (except the 128x128 case)
    5779             : 
    5780             :             //Y Residual
    5781    36717600 :             residual_kernel(
    5782             :                 input_picture_ptr->buffer_y,
    5783             :                 inputOriginIndex,
    5784    36717600 :                 input_picture_ptr->stride_y,
    5785    36717600 :                 candidate_buffer->prediction_ptr->buffer_y,
    5786             :                 cuOriginIndex,
    5787    36717600 :                 candidate_buffer->prediction_ptr->stride_y,
    5788    36717600 :                 (int16_t*)candidate_buffer->residual_ptr->buffer_y,
    5789             :                 cuOriginIndex,
    5790    36717600 :                 candidate_buffer->residual_ptr->stride_y,
    5791    36717600 :                 context_ptr->hbd_mode_decision,
    5792    36717600 :                 context_ptr->blk_geom->bwidth,
    5793    36717600 :                 context_ptr->blk_geom->bheight);
    5794             : 
    5795             :             // Transform partitioning free path
                                   // tx_search_skip_flag selection by md_staging_tx_search:
                                   //   0      -> always skip the transform search
                                   //   1      -> ask get_skip_tx_search_flag() when tx_search_level is
                                   //             TX_SEARCH_FULL_LOOP, otherwise skip
                                   //   other  -> search only when tx_search_level is TX_SEARCH_FULL_LOOP
    5796             :             uint8_t  tx_search_skip_flag;
    5797    36722900 :             if (context_ptr->md_staging_tx_search == 0)
    5798    33617700 :                 tx_search_skip_flag = EB_TRUE;
    5799     3105160 :             else if (context_ptr->md_staging_tx_search == 1)
    5800     5349920 :                 tx_search_skip_flag = picture_control_set_ptr->parent_pcs_ptr->tx_search_level == TX_SEARCH_FULL_LOOP ? get_skip_tx_search_flag(
    5801     2622770 :                     context_ptr->blk_geom->sq_size,
    5802             :                     ref_fast_cost,
    5803     2622770 :                     *candidate_buffer->fast_cost_ptr,
    5804     2622770 :                     picture_control_set_ptr->parent_pcs_ptr->tx_weight) : EB_TRUE;
    5805             :             else
    5806      377991 :                 tx_search_skip_flag = picture_control_set_ptr->parent_pcs_ptr->tx_search_level == TX_SEARCH_FULL_LOOP ? EB_FALSE : EB_TRUE;
    5807             : 
    5808    36722900 :             if (!tx_search_skip_flag) {
    5809     1760940 :                 product_full_loop_tx_search(
    5810             :                     candidate_buffer,
    5811             :                     context_ptr,
    5812             :                     picture_control_set_ptr);
    5813             : 
                                       // Reset the luma results again after the search, before
                                       // product_full_loop runs below.
    5814     1760920 :                 candidate_ptr->full_distortion = 0;
    5815             : 
    5816     1760920 :                 memset(candidate_ptr->eob[0], 0, sizeof(uint16_t));
    5817             : 
    5818             :                 //re-init
    5819     1760920 :                 candidate_ptr->y_has_coeff = 0;
    5820             :             }
    5821             : #if ENHANCE_ATB
    5822    36722800 :             context_ptr->tx_depth = candidate_buffer->candidate_ptr->tx_depth;
    5823    36722800 :             context_ptr->full_loop_luma_dc_sign_level_coeff_neighbor_array = context_ptr->luma_dc_sign_level_coeff_neighbor_array;
    5824    36722800 :             context_ptr->txb_1d_offset = 0;
                                   // The for statement has no braces: the product_full_loop call that
                                   // follows the #endif is its body (one call per transform block).
                                   // Without ENHANCE_ATB the call is made exactly once.
    5825    73443100 :             for (context_ptr->txb_itr = 0; context_ptr->txb_itr < context_ptr->blk_geom->txb_count[context_ptr->tx_depth]; context_ptr->txb_itr++)
    5826             : #endif
    5827    36726100 :             product_full_loop(
    5828             :                 candidate_buffer,
    5829             :                 context_ptr,
    5830             :                 picture_control_set_ptr,
    5831             :                 input_picture_ptr,
    5832    36726100 :                 context_ptr->cu_ptr->qp,
    5833             :                 &(*count_non_zero_coeffs[0]),
    5834             :                 &y_coeff_bits,
    5835             :                 &y_full_distortion[0]);
    5836             :         }
    5837             : 
    5838    37530800 :         candidate_ptr->chroma_distortion_inter_depth = 0;
    5839    37530800 :         candidate_ptr->chroma_distortion = 0;
    5840             : 
    5841             :         //CHROMA
    5842             : 
                               // Zeroed unconditionally: these values are still handed to the full
                               // cost function and stored in candidate_buffer when the chroma path
                               // below is skipped.
    5843    37530800 :         cbFullDistortion[DIST_CALC_RESIDUAL] = 0;
    5844    37530800 :         crFullDistortion[DIST_CALC_RESIDUAL] = 0;
    5845    37530800 :         cbFullDistortion[DIST_CALC_PREDICTION] = 0;
    5846    37530800 :         crFullDistortion[DIST_CALC_PREDICTION] = 0;
    5847             : 
    5848    37530800 :         cb_coeff_bits = 0;
    5849    37530800 :         cr_coeff_bits = 0;
    5850             : 
    5851             :         // FullLoop and TU search
                               // Both chroma QPs are taken from context_ptr->qp (the luma calls above
                               // use context_ptr->cu_ptr->qp).
    5852    37530800 :         uint16_t cb_qp = context_ptr->qp;
    5853    37530800 :         uint16_t cr_qp = context_ptr->qp;
    5854    37530800 :         if (context_ptr->md_staging_skip_full_chroma == EB_FALSE) {
    5855             : 
    5856     3925010 :             if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
    5857             :                 //Cb Residual
    5858     3169220 :                 residual_kernel(
    5859             :                     input_picture_ptr->buffer_cb,
    5860             :                     inputCbOriginIndex,
    5861     3169220 :                     input_picture_ptr->stride_cb,
    5862     3169220 :                     candidate_buffer->prediction_ptr->buffer_cb,
    5863             :                     cuChromaOriginIndex,
    5864     3169220 :                     candidate_buffer->prediction_ptr->stride_cb,
    5865     3169220 :                     (int16_t*)candidate_buffer->residual_ptr->buffer_cb,
    5866             :                     cuChromaOriginIndex,
    5867     3169220 :                     candidate_buffer->residual_ptr->stride_cb,
    5868     3169220 :                     context_ptr->hbd_mode_decision,
    5869     3169220 :                     context_ptr->blk_geom->bwidth_uv,
    5870     3169220 :                     context_ptr->blk_geom->bheight_uv);
    5871             : 
    5872             :                 //Cr Residual (same input / block indices as Cb)
    5873     3169260 :                 residual_kernel(
    5874             :                     input_picture_ptr->buffer_cr,
    5875             :                     inputCbOriginIndex,
    5876     3169260 :                     input_picture_ptr->stride_cr,
    5877     3169260 :                     candidate_buffer->prediction_ptr->buffer_cr,
    5878             :                     cuChromaOriginIndex,
    5879     3169260 :                     candidate_buffer->prediction_ptr->stride_cr,
    5880     3169260 :                     (int16_t*)candidate_buffer->residual_ptr->buffer_cr,
    5881             :                     cuChromaOriginIndex,
    5882     3169260 :                     candidate_buffer->residual_ptr->stride_cr,
    5883     3169260 :                     context_ptr->hbd_mode_decision,
    5884     3169260 :                     context_ptr->blk_geom->bwidth_uv,
    5885     3169260 :                     context_ptr->blk_geom->bheight_uv);
    5886             :             }
    5887             : 
    5888     3925090 :             if (candidate_ptr->type == INTRA_MODE && candidate_buffer->candidate_ptr->intra_chroma_mode == UV_CFL_PRED) {
    5889             :                 // If mode is CFL:
    5890             :                 // 1: recon the Luma
    5891             :                 // 2: Form the pred_buf_q3
    5892             :                 // 3: Loop over alphas and find the best or choose DC
    5893             :                 // 4: Recalculate the residual for chroma
    5894      749216 :                 CflPrediction(
    5895             :                     picture_control_set_ptr,
    5896             :                     candidate_buffer,
    5897             :                     sb_ptr,
    5898             :                     context_ptr,
    5899             :                     input_picture_ptr,
    5900             :                     inputCbOriginIndex,
    5901             :                     cuChromaOriginIndex);
    5902             :             }
    5903     3925120 :             if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
    5904     3169240 :                 full_loop_r(
    5905             :                     sb_ptr,
    5906             :                     candidate_buffer,
    5907             :                     context_ptr,
    5908             :                     input_picture_ptr,
    5909             :                     picture_control_set_ptr,
    5910             :                     PICTURE_BUFFER_DESC_CHROMA_MASK,
    5911             :                     cb_qp,
    5912             :                     cr_qp,
    5913             :                     &(*count_non_zero_coeffs[1]),
    5914             :                     &(*count_non_zero_coeffs[2]));
    5915             : 
    5916     3169250 :                 cu_full_distortion_fast_tu_mode_r(
    5917             :                     sb_ptr,
    5918             :                     candidate_buffer,
    5919             :                     context_ptr,
    5920             :                     candidate_ptr,
    5921             :                     picture_control_set_ptr,
    5922             :                     input_picture_ptr,
    5923             :                     cbFullDistortion,
    5924             :                     crFullDistortion,
    5925             :                     count_non_zero_coeffs,
    5926             :                     COMPONENT_CHROMA,
    5927             :                     &cb_coeff_bits,
    5928             :                     &cr_coeff_bits,
    5929             :                     1);
    5930             :             }
    5931             : 
    5932             :             // Check independent chroma vs. cfl (CHROMA_MODE_0 only, INTRA candidates whose chroma mode is UV_CFL_PRED or UV_DC_PRED)
    5933     3925210 :             if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level == CHROMA_MODE_0) {
    5934     1017200 :                 if (candidate_buffer->candidate_ptr->type == INTRA_MODE && (candidate_buffer->candidate_ptr->intra_chroma_mode == UV_CFL_PRED || candidate_buffer->candidate_ptr->intra_chroma_mode == UV_DC_PRED)) {
    5935      522352 :                     check_best_indepedant_cfl(
    5936             :                         picture_control_set_ptr,
    5937             :                         input_picture_ptr,
    5938             :                         context_ptr,
    5939             :                         inputCbOriginIndex,
    5940             :                         cuChromaOriginIndex,
    5941             :                         candidate_buffer,
    5942      522352 :                         (uint8_t)cb_qp,
    5943      522352 :                         (uint8_t)cr_qp,
    5944             :                         cbFullDistortion,
    5945             :                         crFullDistortion,
    5946             :                         &cb_coeff_bits,
    5947             :                         &cr_coeff_bits);
    5948             :                 }
    5949             :             }
    5950             :         }
    5951             : 
    5952    37531000 :         candidate_ptr->block_has_coeff = (candidate_ptr->y_has_coeff | candidate_ptr->u_has_coeff | candidate_ptr->v_has_coeff) ? EB_TRUE : EB_FALSE;
    5953             : 
    5954             :         //ALL PLANE
    5955    37531000 :         Av1ProductFullCostFuncTable[candidate_ptr->type](
    5956             :             picture_control_set_ptr,
    5957             :             context_ptr,
    5958             :             candidate_buffer,
    5959             :             cu_ptr,
    5960             :             y_full_distortion,
    5961             :             cbFullDistortion,
    5962             :             crFullDistortion,
    5963    37531000 :             context_ptr->full_lambda,
    5964             :             &y_coeff_bits,
    5965             :             &cb_coeff_bits,
    5966             :             &cr_coeff_bits,
    5967    37531000 :             context_ptr->blk_geom->bsize);
    5968             : 
                               // Publish the per-plane results on the candidate buffer.
    5969    37543600 :         candidate_buffer->cb_distortion[DIST_CALC_RESIDUAL] = cbFullDistortion[DIST_CALC_RESIDUAL];
    5970    37543600 :         candidate_buffer->cb_distortion[DIST_CALC_PREDICTION] = cbFullDistortion[DIST_CALC_PREDICTION];
    5971    37543600 :         candidate_buffer->cb_coeff_bits = cb_coeff_bits;
    5972             : 
    5973    37543600 :         candidate_buffer->cr_distortion[DIST_CALC_RESIDUAL] = crFullDistortion[DIST_CALC_RESIDUAL];
    5974    37543600 :         candidate_buffer->cr_distortion[DIST_CALC_PREDICTION] = crFullDistortion[DIST_CALC_PREDICTION];
    5975    37543600 :         candidate_buffer->cr_coeff_bits = cr_coeff_bits;
                               // The explicit cast narrows the 64-bit y_full_distortion[0] to uint32_t.
                               // NOTE(review): the same value is stored again through candidate_ptr
                               // below; presumably both pointers name the same candidate, which would
                               // make one of the two stores redundant - confirm.
    5976    37543600 :         candidate_buffer->candidate_ptr->full_distortion = (uint32_t)(y_full_distortion[0]);
    5977             : 
    5978    37543600 :         candidate_buffer->y_coeff_bits = y_coeff_bits;
    5979    37543600 :         candidate_ptr->full_distortion = (uint32_t)(y_full_distortion[0]);
    5980    37543600 : }
    /*
     * md_stage_1 (compiled under the name md_stage_2 when REMOVE_MD_STAGE_1 is 0)
     *
     * Sets the md_staging_* switches in context_ptr for this stage and then calls
     * full_loop_core once for every candidate of context_ptr->target_class, in the
     * order given by context_ptr->cand_buff_indices[target_class]. The number of
     * candidates is md_stage_1_count[target_class] with REMOVE_MD_STAGE_1 and
     * md_stage_2_count[target_class] otherwise.
     *
     * Stage settings written here: md_staging_skip_atb = EB_TRUE,
     * md_staging_tx_search = 0, md_staging_skip_full_chroma and
     * md_staging_skip_rdoq (values depend on the preprocessor configuration).
     *
     * picture_control_set_ptr, sb_ptr, cu_ptr, input_picture_ptr, the four origin
     * indices and ref_fast_cost are not used directly; they are forwarded
     * unchanged to full_loop_core.
     */
    5981             : #if REMOVE_MD_STAGE_1
    5982     3160720 : void md_stage_1(
    5983             : #else
    5984             : void md_stage_2(
    5985             : #endif
    5986             :     PictureControlSet     *picture_control_set_ptr,
    5987             :     LargestCodingUnit     *sb_ptr,
    5988             :     CodingUnit            *cu_ptr,
    5989             :     ModeDecisionContext   *context_ptr,
    5990             :     EbPictureBufferDesc   *input_picture_ptr,
    5991             :     uint32_t               inputOriginIndex,
    5992             :     uint32_t               inputCbOriginIndex,
    5993             :     uint32_t               cuOriginIndex,
    5994             :     uint32_t               cuChromaOriginIndex,
    5995             :     uint64_t               ref_fast_cost)
    5996             : {
                           // candidate_buffer_ptr_array is &base[0], i.e. the same address as the
                           // base array taken from the context.
    5997     3160720 :     ModeDecisionCandidateBuffer **candidate_buffer_ptr_array_base = context_ptr->candidate_buffer_ptr_array;
    5998     3160720 :     ModeDecisionCandidateBuffer **candidate_buffer_ptr_array = &(candidate_buffer_ptr_array_base[0]);
    5999             :     ModeDecisionCandidateBuffer  *candidate_buffer;
    6000             :     ModeDecisionCandidate        *candidate_ptr;
    6001             : 
    6002             :     uint32_t fullLoopCandidateIndex;
    6003             :     uint32_t candidateIndex;
    6004             : 
    6005             :     // Set MD Staging full_loop_core settings
    6006             : #if !REMOVE_MD_STAGE_1
    6007             :     context_ptr->md_staging_skip_full_pred = EB_TRUE;
    6008             : #endif
    6009     3160720 :     context_ptr->md_staging_skip_atb = EB_TRUE;
    6010     3160720 :     context_ptr->md_staging_tx_search = 0;
                           // Chroma skip: always EB_TRUE with FILTER_INTRA_FLAG && REMOVE_MD_STAGE_1;
                           // the other configurations derive it from target_class / md_staging_mode.
    6011             : #if FILTER_INTRA_FLAG
    6012             : #if REMOVE_MD_STAGE_1
    6013     3160720 :     context_ptr->md_staging_skip_full_chroma = EB_TRUE;
    6014             : #else
    6015             :     context_ptr->md_staging_skip_full_chroma =  context_ptr->target_class == CAND_CLASS_0 || context_ptr->target_class == CAND_CLASS_6 || context_ptr->md_staging_mode == MD_STAGING_MODE_3;
    6016             : #endif
    6017             : #else
    6018             :     context_ptr->md_staging_skip_full_chroma = context_ptr->target_class == CAND_CLASS_0 || context_ptr->md_staging_mode == MD_STAGING_MODE_3;
    6019             : #endif
    6020             : 
                           // Each preprocessor branch sets md_staging_skip_rdoq and supplies the
                           // matching loop header; the loop body after the #endif is shared.
    6021             : #if REMOVE_MD_STAGE_1
    6022     3160720 :     context_ptr->md_staging_skip_rdoq = EB_TRUE;
    6023    36782200 :     for (fullLoopCandidateIndex = 0; fullLoopCandidateIndex < context_ptr->md_stage_1_count[context_ptr->target_class]; ++fullLoopCandidateIndex) {
    6024             : #else
    6025             :     context_ptr->md_staging_skip_rdoq = (context_ptr->md_staging_mode == MD_STAGING_MODE_2 || context_ptr->md_staging_mode == MD_STAGING_MODE_3);
    6026             :     for (fullLoopCandidateIndex = 0; fullLoopCandidateIndex < context_ptr->md_stage_2_count[context_ptr->target_class]; ++fullLoopCandidateIndex) {
    6027             : #endif
                               // Map the stage-local position to the candidate buffer index.
    6028    33622900 :         candidateIndex = context_ptr->cand_buff_indices[context_ptr->target_class][fullLoopCandidateIndex];
    6029    33622900 :         candidate_buffer = candidate_buffer_ptr_array[candidateIndex];
    6030    33622900 :         candidate_ptr = candidate_buffer->candidate_ptr;
    6031             : 
    6032             : #if REMOVE_MD_STAGE_1
                               // Re-assigned for every candidate before the call; interp_filters is
                               // reset to 0 on the candidate itself.
    6033    33622900 :         context_ptr->md_staging_skip_full_pred = EB_FALSE;
    6034    33622900 :         context_ptr->md_staging_skip_interpolation_search = EB_FALSE;
    6035    33622900 :         context_ptr->md_staging_skip_inter_chroma_pred = EB_TRUE;
    6036    33622900 :         candidate_buffer->candidate_ptr->interp_filters = 0;
    6037             : #endif
    6038    33622900 :         full_loop_core(
    6039             :             picture_control_set_ptr,
    6040             :             sb_ptr,
    6041             :             cu_ptr,
    6042             :             context_ptr,
    6043             :             candidate_buffer,
    6044             :             candidate_ptr,
    6045             :             input_picture_ptr,
    6046             :             inputOriginIndex,
    6047             :             inputCbOriginIndex,
    6048             :             cuOriginIndex,
    6049             :             cuChromaOriginIndex,
    6050             :             ref_fast_cost);
    6051             :     }
    6052     3159220 : }
    6053             : #if REMOVE_MD_STAGE_1
    6054      811357 : void md_stage_2(
    6055             : #else
    6056             : void md_stage_3(
    6057             : #endif
    6058             :     PictureControlSet     *picture_control_set_ptr,
    6059             :     LargestCodingUnit     *sb_ptr,
    6060             :     CodingUnit            *cu_ptr,
    6061             :     ModeDecisionContext   *context_ptr,
    6062             :     EbPictureBufferDesc   *input_picture_ptr,
    6063             :     uint32_t               inputOriginIndex,
    6064             :     uint32_t               inputCbOriginIndex,
    6065             :     uint32_t               cuOriginIndex,
    6066             :     uint32_t               cuChromaOriginIndex,
    6067             :     uint32_t               fullCandidateTotalCount,
    6068             :     uint64_t               ref_fast_cost)
    6069             : {
    6070      811357 :     ModeDecisionCandidateBuffer **candidate_buffer_ptr_array_base = context_ptr->candidate_buffer_ptr_array;
    6071      811357 :     ModeDecisionCandidateBuffer **candidate_buffer_ptr_array = &(candidate_buffer_ptr_array_base[0]);
    6072             :     ModeDecisionCandidateBuffer  *candidate_buffer;
    6073             :     ModeDecisionCandidate        *candidate_ptr;
    6074             : 
    6075      811357 :     uint32_t best_inter_luma_zero_coeff = 1;
    6076      811357 :     uint64_t best_full_cost = 0xFFFFFFFFull;
    6077             :     uint32_t fullLoopCandidateIndex;
    6078             :     uint32_t candidateIndex;
    6079             : 
    6080     4736450 :     for (fullLoopCandidateIndex = 0; fullLoopCandidateIndex < fullCandidateTotalCount; ++fullLoopCandidateIndex) {
    6081             : 
    6082     3984600 :         candidateIndex = (context_ptr->full_loop_escape == 2) ? context_ptr->sorted_candidate_index_array[fullLoopCandidateIndex] : context_ptr->best_candidate_index_array[fullLoopCandidateIndex];
    6083     3984600 :         candidate_buffer = candidate_buffer_ptr_array[candidateIndex];
    6084     3984600 :         candidate_ptr = candidate_buffer->candidate_ptr;
    6085             : 
    6086             :         // Set MD Staging full_loop_core settings
    6087             : #if REMOVE_MD_STAGE_1
    6088     3984600 :         context_ptr->md_staging_skip_full_pred = (context_ptr->md_staging_mode == MD_STAGING_MODE_0 && picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level != IT_SEARCH_FULL_LOOP);
    6089     3984600 :         context_ptr->md_staging_skip_interpolation_search = (context_ptr->md_staging_mode == MD_STAGING_MODE_1 || picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level != IT_SEARCH_FULL_LOOP);
    6090     3984600 :         context_ptr->md_staging_skip_inter_chroma_pred = EB_FALSE;
    6091             : #else
    6092             :         context_ptr->md_staging_skip_full_pred = (context_ptr->md_staging_mode == MD_STAGING_MODE_3) ? EB_FALSE: EB_TRUE;
    6093             : #endif
    6094     3984600 :         context_ptr->md_staging_skip_atb = context_ptr->coeff_based_skip_atb;
    6095             : #if FILTER_INTRA_FLAG
    6096             : #if PAL_CLASS
    6097     3984600 :         context_ptr->md_staging_tx_search =
    6098     3984600 :             (candidate_ptr->cand_class == CAND_CLASS_0 || candidate_ptr->cand_class == CAND_CLASS_6 || candidate_ptr->cand_class == CAND_CLASS_7)
    6099             :             ? 2 : 1;
    6100             : #else
    6101             :         context_ptr->md_staging_tx_search = (candidate_ptr->cand_class == CAND_CLASS_0 || candidate_ptr->cand_class == CAND_CLASS_6)? 2 : 1;
    6102             : #endif
    6103             : #else
    6104             :         context_ptr->md_staging_tx_search = candidate_ptr->cand_class == CAND_CLASS_0 ? 2 : 1;
    6105             : #endif
    6106     3984600 :         context_ptr->md_staging_skip_full_chroma = EB_FALSE;
    6107     3984600 :         context_ptr->md_staging_skip_rdoq = EB_FALSE;
    6108             : 
    6109     3984600 :         if (picture_control_set_ptr->slice_type != I_SLICE) {
    6110     3350080 :             if ((candidate_ptr->type == INTRA_MODE || context_ptr->full_loop_escape == 2) && best_inter_luma_zero_coeff == 0) {
    6111             : #if REMOVE_MD_STAGE_1
    6112       59633 :                 context_ptr->md_stage_2_total_count = fullLoopCandidateIndex;
    6113             : #else
    6114             :                 context_ptr->md_stage_3_total_count = fullLoopCandidateIndex;
    6115             : #endif
    6116       59633 :                 return;
    6117             :             }
    6118             :         }
    6119             : 
    6120     3924960 :         full_loop_core(
    6121             :             picture_control_set_ptr,
    6122             :             sb_ptr,
    6123             :             cu_ptr,
    6124             :             context_ptr,
    6125             :             candidate_buffer,
    6126             :             candidate_ptr,
    6127             :             input_picture_ptr,
    6128             :             inputOriginIndex,
    6129             :             inputCbOriginIndex,
    6130             :             cuOriginIndex,
    6131             :             cuChromaOriginIndex,
    6132             :             ref_fast_cost);
    6133             : 
    6134     3925090 :         if (context_ptr->full_loop_escape)
    6135             :         {
    6136      149237 :             if (picture_control_set_ptr->slice_type != I_SLICE) {
    6137      114803 :                 if (candidate_ptr->type == INTER_MODE) {
    6138      104369 :                     if (*candidate_buffer->full_cost_ptr < best_full_cost) {
    6139       77057 :                         best_inter_luma_zero_coeff = candidate_ptr->y_has_coeff;
    6140       77057 :                         best_full_cost = *candidate_buffer->full_cost_ptr;
    6141             :                     }
    6142             :                 }
    6143             :             }
    6144             :         }
    6145             :     }
    6146             : }
    6147             : 
    6148       33840 : void move_cu_data(
    6149             : #if PAL_SUP
    6150             :     PictureControlSet* pcs,
    6151             :     EncDecContext      *context_ptr,
    6152             : #endif
    6153             :     CodingUnit *src_cu,
    6154             :     CodingUnit *dst_cu)
    6155             : {
    6156             : #if PAL_SUP
    6157       33840 :         memcpy(&dst_cu->palette_info.pmi, &src_cu->palette_info.pmi, sizeof(PaletteModeInfo));
    6158       33840 :         if (svt_av1_allow_palette(pcs->parent_pcs_ptr->palette_mode, context_ptr->blk_geom->bsize)){
    6159           0 :             dst_cu->palette_info.color_idx_map = (uint8_t *)malloc(MAX_PALETTE_SQUARE);
    6160           0 :             assert(dst_cu->palette_info.color_idx_map != NULL && "palette:Not-Enough-Memory");
    6161           0 :             if(dst_cu->palette_info.color_idx_map != NULL)
    6162           0 :                  memcpy(dst_cu->palette_info.color_idx_map, src_cu->palette_info.color_idx_map, MAX_PALETTE_SQUARE);
    6163             :             else
    6164           0 :                 printf("ERROR palette:Not-Enough-Memory\n");
    6165             :         }
    6166             : #endif
    6167             : #if OBMC_FLAG
    6168       33839 :     dst_cu->interp_filters = src_cu->interp_filters;
    6169             : #endif
    6170       33839 :     dst_cu->interinter_comp.type = src_cu->interinter_comp.type;
    6171       33839 :     dst_cu->interinter_comp.mask_type = src_cu->interinter_comp.mask_type;
    6172       33839 :     dst_cu->interinter_comp.wedge_index = src_cu->interinter_comp.wedge_index;
    6173       33839 :     dst_cu->interinter_comp.wedge_sign = src_cu->interinter_comp.wedge_sign;
    6174       33839 :     dst_cu->compound_idx = src_cu->compound_idx;
    6175       33839 :     dst_cu->comp_group_idx = src_cu->comp_group_idx;
    6176             : 
    6177             : #if II_COMP_FLAG
    6178       33839 :        dst_cu->is_interintra_used      = src_cu->is_interintra_used          ;
    6179       33839 :        dst_cu->interintra_mode         = src_cu->interintra_mode             ;
    6180       33839 :        dst_cu->use_wedge_interintra    = src_cu->use_wedge_interintra        ;
    6181       33839 :        dst_cu->interintra_wedge_index  = src_cu->interintra_wedge_index      ;//inter_intra wedge index
    6182       33839 :        dst_cu->ii_wedge_sign           = src_cu->ii_wedge_sign               ;//inter_intra wedge sign=-1
    6183             : #endif
    6184             :     //CHKN TransformUnit             transform_unit_array[TRANSFORM_UNIT_MAX_COUNT]; // 2-bytes * 21 = 42-bytes
    6185       33839 :     memcpy(dst_cu->transform_unit_array, src_cu->transform_unit_array, TRANSFORM_UNIT_MAX_COUNT * sizeof(TransformUnit));
    6186             : 
    6187             :     //CHKN PredictionUnit            prediction_unit_array[MAX_NUM_OF_PU_PER_CU];    // 35-bytes * 4 = 140 bytes
    6188       33839 :     memcpy(dst_cu->prediction_unit_array, src_cu->prediction_unit_array, MAX_NUM_OF_PU_PER_CU * sizeof(PredictionUnit));
    6189             : 
    6190             :     //CHKN     unsigned                    skip_flag_context : 2;
    6191             :     //CHKN     unsigned                    prediction_mode_flag : 2;
    6192             :     //CHKN     unsigned                    rootCbf : 1;
    6193             :     //CHKN     unsigned                    split_flag_context : 2;
    6194             :     //CHKN #if !ADD_DELTA_QP_SUPPORT
    6195             :     //CHKN     unsigned                    qp : 6;
    6196             :     //CHKN     unsigned                    ref_qp : 6;
    6197             :     //CHKN
    6198             :     //CHKN     signed                         delta_qp : 8; // can be signed 8bits
    6199             :     //CHKN     signed                         org_delta_qp : 8;
    6200             :     //CHKN #endif
    6201             :     //CHKN
    6202             :     //CHKN #if ADD_DELTA_QP_SUPPORT
    6203             :     //CHKN     uint16_t                       qp;
    6204             :     //CHKN     uint16_t                       ref_qp;
    6205             :     //CHKN
    6206             :     //CHKN     int16_t                          delta_qp; // can be signed 8bits
    6207             :     //CHKN     int16_t                          org_delta_qp;
    6208             :     //CHKN #endif
    6209             : 
    6210       33839 :     dst_cu->skip_flag_context = src_cu->skip_flag_context;
    6211       33839 :     dst_cu->prediction_mode_flag = src_cu->prediction_mode_flag;
    6212       33839 :     dst_cu->block_has_coeff = src_cu->block_has_coeff;
    6213       33839 :     dst_cu->split_flag_context = src_cu->split_flag_context;
    6214       33839 :     dst_cu->qp = src_cu->qp;
    6215       33839 :     dst_cu->delta_qp = src_cu->delta_qp;
    6216             : 
    6217       33839 :     dst_cu->tx_depth = src_cu->tx_depth;
    6218             : 
    6219             :     //CHKN    // Coded Tree
    6220             :     //CHKN    struct {
    6221             :     //CHKN        unsigned                   leaf_index : 8;
    6222             :     //CHKN        unsigned                   split_flag : 1;
    6223             :     //CHKN        unsigned                   skip_flag : 1;
    6224             :     //CHKN
    6225             :     //CHKN    };
    6226             : 
    6227       33839 :     dst_cu->leaf_index = src_cu->leaf_index;
    6228       33839 :     dst_cu->split_flag = src_cu->split_flag;
    6229       33839 :     dst_cu->skip_flag = src_cu->skip_flag;
    6230             : 
    6231             :     //CHKN    MacroBlockD*  av1xd;
    6232       33839 :     memcpy(dst_cu->av1xd, src_cu->av1xd, sizeof(MacroBlockD));
    6233             : 
    6234             :     // uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
    6235             : 
    6236             :     //CHKN int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
    6237       33839 :     memcpy(dst_cu->inter_mode_ctx, src_cu->inter_mode_ctx, MODE_CTX_REF_FRAMES * sizeof(int16_t));
    6238             : 
    6239             :     //CHKN IntMv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; //used only for nonCompound modes.
    6240       33839 :     memcpy(dst_cu->ref_mvs, src_cu->ref_mvs, MODE_CTX_REF_FRAMES*MAX_MV_REF_CANDIDATES * sizeof(IntMv));
    6241             : 
    6242             :     //CHKN uint8_t  drl_index;
    6243             :     //CHKN PredictionMode               pred_mode;
    6244       33839 :     dst_cu->drl_index = src_cu->drl_index;
    6245       33839 :     dst_cu->pred_mode = src_cu->pred_mode;
    6246             : 
    6247             :     //CHKN IntMv  predmv[2];
    6248             : 
    6249       33839 :     memcpy(dst_cu->predmv, src_cu->predmv, 2 * sizeof(IntMv));
    6250             :     //CHKN uint8_t                         skip_coeff_context;
    6251             :     //CHKN int16_t                        luma_txb_skip_context;
    6252             :     //CHKN int16_t                        luma_dc_sign_context;
    6253             :     //CHKN int16_t                        cb_txb_skip_context;
    6254             :     //CHKN int16_t                        cb_dc_sign_context;
    6255             :     //CHKN int16_t                        cr_txb_skip_context;
    6256             :     //CHKN int16_t                        cr_dc_sign_context;
    6257             :     //CHKN uint8_t                         reference_mode_context;
    6258             :     //CHKN uint8_t                         compoud_reference_type_context;
    6259             :     //CHKN uint32_t                        partitionContext;
    6260             : 
    6261       33839 :     dst_cu->skip_coeff_context = src_cu->skip_coeff_context;
    6262       33839 :     dst_cu->reference_mode_context = src_cu->reference_mode_context;
    6263       33839 :     dst_cu->compoud_reference_type_context = src_cu->compoud_reference_type_context;
    6264       33839 :     dst_cu->segment_id = src_cu->segment_id;
    6265             : 
    6266       33839 :     memcpy(dst_cu->quantized_dc, src_cu->quantized_dc, 3 * MAX_TXB_COUNT * sizeof(int32_t));
    6267             :     //CHKN uint32_t   is_inter_ctx;
    6268             :     //CHKN uint32_t                     interp_filters;
    6269             : 
    6270       33839 :     dst_cu->is_inter_ctx = src_cu->is_inter_ctx;
    6271       33839 :     dst_cu->interp_filters = src_cu->interp_filters;
    6272             : 
    6273       33839 :     dst_cu->part = src_cu->part;
    6274       33839 :     dst_cu->shape = src_cu->shape;
    6275       33839 :     dst_cu->mds_idx = src_cu->mds_idx;
    6276             : #if FILTER_INTRA_FLAG
    6277       33839 :     dst_cu->filter_intra_mode = src_cu->filter_intra_mode;
    6278             : #endif
    6279       33839 : }
    6280      101022 : void move_cu_data_redund(
    6281             : #if PAL_SUP
    6282             :     PictureControlSet     *pcs,
    6283             :     ModeDecisionContext   *context_ptr,
    6284             : #endif
    6285             :     CodingUnit *src_cu,
    6286             :     CodingUnit *dst_cu){
    6287             : #if PAL_SUP
    6288      101022 :     dst_cu->segment_id = src_cu->segment_id;
    6289      101022 :     dst_cu->seg_id_predicted = src_cu->seg_id_predicted;
    6290      101022 :     dst_cu->ref_qp = src_cu->ref_qp;
    6291      101022 :     dst_cu->org_delta_qp = src_cu->org_delta_qp;
    6292             : 
    6293      101022 :     memcpy(&dst_cu->palette_info.pmi, &src_cu->palette_info.pmi, sizeof(PaletteModeInfo));
    6294      101022 :     if (svt_av1_allow_palette(pcs->parent_pcs_ptr->palette_mode, context_ptr->blk_geom->bsize))
    6295           0 :         memcpy(dst_cu->palette_info.color_idx_map, src_cu->palette_info.color_idx_map, MAX_PALETTE_SQUARE);
    6296             : 
    6297             : #endif
    6298             : #if OBMC_FLAG
    6299      101022 :     dst_cu->interp_filters = src_cu->interp_filters;
    6300             : #endif
    6301      101022 :     dst_cu->interinter_comp.type = src_cu->interinter_comp.type;
    6302      101022 :     dst_cu->interinter_comp.mask_type = src_cu->interinter_comp.mask_type;
    6303      101022 :     dst_cu->interinter_comp.wedge_index = src_cu->interinter_comp.wedge_index;
    6304      101022 :     dst_cu->interinter_comp.wedge_sign = src_cu->interinter_comp.wedge_sign;
    6305      101022 :     dst_cu->compound_idx = src_cu->compound_idx;
    6306      101022 :     dst_cu->comp_group_idx = src_cu->comp_group_idx;
    6307             : #if II_COMP_FLAG
    6308      101022 :        dst_cu->is_interintra_used      = src_cu->is_interintra_used          ;
    6309      101022 :        dst_cu->interintra_mode         = src_cu->interintra_mode             ;
    6310      101022 :        dst_cu->use_wedge_interintra    = src_cu->use_wedge_interintra        ;
    6311      101022 :        dst_cu->interintra_wedge_index  = src_cu->interintra_wedge_index      ;//inter_intra wedge index
    6312      101022 :        dst_cu->ii_wedge_sign           = src_cu->ii_wedge_sign               ;//inter_intra wedge sign=-1
    6313             : #endif
    6314             : #if FILTER_INTRA_FLAG
    6315      101022 :     dst_cu->filter_intra_mode = src_cu->filter_intra_mode;
    6316             : #endif
    6317             :     //CHKN TransformUnit_t             transform_unit_array[TRANSFORM_UNIT_MAX_COUNT]; // 2-bytes * 21 = 42-bytes
    6318      101022 :     memcpy(dst_cu->transform_unit_array, src_cu->transform_unit_array, TRANSFORM_UNIT_MAX_COUNT * sizeof(TransformUnit));
    6319             : 
    6320             :     //CHKN PredictionUnit_t            prediction_unit_array[MAX_NUM_OF_PU_PER_CU];    // 35-bytes * 4 = 140 bytes
    6321      101022 :     memcpy(dst_cu->prediction_unit_array, src_cu->prediction_unit_array, MAX_NUM_OF_PU_PER_CU * sizeof(PredictionUnit));
    6322             : 
    6323             :     //CHKN     unsigned                    skip_flag_context : 2;
    6324             :     //CHKN     unsigned                    prediction_mode_flag : 2;
    6325             :     //CHKN     unsigned                    rootCbf : 1;
    6326             :     //CHKN     unsigned                    split_flag_context : 2;
    6327             :     //CHKN #if !ADD_DELTA_QP_SUPPORT
    6328             :     //CHKN     unsigned                    qp : 6;
    6329             :     //CHKN     unsigned                    ref_qp : 6;
    6330             :     //CHKN
    6331             :     //CHKN     signed                         delta_qp : 8; // can be signed 8bits
    6332             :     //CHKN     signed                         org_delta_qp : 8;
    6333             :     //CHKN #endif
    6334             :     //CHKN
    6335             :     //CHKN #if ADD_DELTA_QP_SUPPORT
    6336             :     //CHKN     uint16_t                       qp;
    6337             :     //CHKN     uint16_t                       ref_qp;
    6338             :     //CHKN
    6339             :     //CHKN     int16_t                          delta_qp; // can be signed 8bits
    6340             :     //CHKN     int16_t                          org_delta_qp;
    6341             :     //CHKN #endif
    6342             : 
    6343      101022 :     dst_cu->skip_flag_context = src_cu->skip_flag_context;
    6344      101022 :     dst_cu->prediction_mode_flag = src_cu->prediction_mode_flag;
    6345      101022 :     dst_cu->block_has_coeff = src_cu->block_has_coeff;
    6346      101022 :     dst_cu->split_flag_context = src_cu->split_flag_context;
    6347      101022 :     dst_cu->qp = src_cu->qp;
    6348      101022 :     dst_cu->delta_qp = src_cu->delta_qp;
    6349             :     //CHKN    // Coded Tree
    6350             :     //CHKN    struct {
    6351             :     //CHKN        unsigned                   leaf_index : 8;
    6352             :     //CHKN        unsigned                   split_flag : 1;
    6353             :     //CHKN        unsigned                   skip_flag : 1;
    6354             :     //CHKN
    6355             :     //CHKN    };
    6356             : 
    6357      101022 :     dst_cu->leaf_index = src_cu->leaf_index;
    6358      101022 :     dst_cu->skip_flag = src_cu->skip_flag;
    6359      101022 :     dst_cu->tx_depth = src_cu->tx_depth;
    6360             :     //CHKN    MacroBlockD*  av1xd;
    6361      101022 :     memcpy(dst_cu->av1xd, src_cu->av1xd, sizeof(MacroBlockD));
    6362             : 
    6363             :     // uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
    6364             : 
    6365             :     //CHKN int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
    6366      101022 :     memcpy(dst_cu->inter_mode_ctx, src_cu->inter_mode_ctx, MODE_CTX_REF_FRAMES * sizeof(int16_t));
    6367             : 
    6368             :     //CHKN IntMv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; //used only for nonCompound modes.
    6369      101022 :     memcpy(dst_cu->ref_mvs, src_cu->ref_mvs, MODE_CTX_REF_FRAMES*MAX_MV_REF_CANDIDATES * sizeof(IntMv));
    6370             : 
    6371             :     //CHKN uint8_t  drl_index;
    6372             :     //CHKN PredictionMode               pred_mode;
    6373      101022 :     dst_cu->drl_index = src_cu->drl_index;
    6374      101022 :     dst_cu->pred_mode = src_cu->pred_mode;
    6375             : 
    6376             :     //CHKN IntMv  predmv[2];
    6377             : 
    6378      101022 :     memcpy(dst_cu->predmv, src_cu->predmv, 2 * sizeof(IntMv));
    6379             : 
    6380             :     //CHKN uint8_t                         skip_coeff_context;
    6381             :     //CHKN int16_t                        luma_txb_skip_context;
    6382             :     //CHKN int16_t                        luma_dc_sign_context;
    6383             :     //CHKN int16_t                        cb_txb_skip_context;
    6384             :     //CHKN int16_t                        cb_dc_sign_context;
    6385             :     //CHKN int16_t                        cr_txb_skip_context;
    6386             :     //CHKN int16_t                        cr_dc_sign_context;
    6387             :     //CHKN uint8_t                         reference_mode_context;
    6388             :     //CHKN uint8_t                         compoud_reference_type_context;
    6389             :     //CHKN uint32_t                        partitionContext;
    6390             : 
    6391      101022 :     dst_cu->skip_coeff_context = src_cu->skip_coeff_context;
    6392      101022 :     dst_cu->reference_mode_context = src_cu->reference_mode_context;
    6393      101022 :     dst_cu->compoud_reference_type_context = src_cu->compoud_reference_type_context;
    6394      101022 :     memcpy(dst_cu->quantized_dc, src_cu->quantized_dc, 3 * MAX_TXB_COUNT * sizeof(int32_t));
    6395             :     //CHKN uint32_t   is_inter_ctx;
    6396             :     //CHKN uint32_t                     interp_filters;
    6397             : 
    6398      101022 :     dst_cu->is_inter_ctx = src_cu->is_inter_ctx;
    6399      101022 :     dst_cu->interp_filters = src_cu->interp_filters;
    6400             : 
    6401      101022 :     dst_cu->part = src_cu->part;
    6402      101022 :    dst_cu->shape = src_cu->shape;
    6403             :   //dst_cu->mds_idx = src_cu->mds_idx;
    6404      101022 : }
    6405             : 
    6406     1712870 : void check_redundant_block(const BlockGeom * blk_geom, ModeDecisionContext *context_ptr,  uint8_t * redundant_blk_avail, uint16_t *redundant_blk_mds)
    6407             : {
    6408     1712870 :     if (blk_geom->redund) {
    6409      641211 :         for (int it = 0; it < blk_geom->redund_list.list_size; it++) {
    6410      427155 :             if (context_ptr->md_local_cu_unit[blk_geom->redund_list.blk_mds_table[it]].avail_blk_flag)
    6411             :             {
    6412      101022 :                 *redundant_blk_mds = blk_geom->redund_list.blk_mds_table[it];
    6413      101022 :                 *redundant_blk_avail = 1;
    6414      101022 :                 break;
    6415             :             }
    6416             :         }
    6417             :     }
    6418     1712870 : }
    6419             : 
    6420             : /*******************************************
    6421             : * ModeDecision LCU
    6422             : *   performs CL (LCU)
    6423             : *******************************************/
    6424      811418 : EbBool allowed_ns_cu(
    6425             : #if COMBINE_MDC_NSQ_TABLE
    6426             :     uint8_t                            mdc_depth_level,
    6427             : #endif
    6428             :     EbBool                             is_nsq_table_used,
    6429             :     uint8_t                            nsq_max_shapes_md,
    6430             :     ModeDecisionContext              *context_ptr,
    6431             :     uint8_t                            is_complete_sb){
    6432      811418 :     EbBool  ret = 1;
    6433             :     UNUSED(is_complete_sb);
    6434             : 
    6435             : #if COMBINE_MDC_NSQ_TABLE
    6436      811418 :     if (is_nsq_table_used) {
    6437             : #if MDC_ADAPTIVE_LEVEL
    6438           0 :         if (!mdc_depth_level) {
    6439             : #else
    6440             :         if (mdc_depth_level == MAX_MDC_LEVEL) {
    6441             : #endif
    6442           0 :             if (context_ptr->blk_geom->shape != PART_N) {
    6443           0 :                 ret = 0;
    6444           0 :                 for (int i = 0; i < nsq_max_shapes_md; i++) {
    6445           0 :                     if (context_ptr->blk_geom->shape == context_ptr->nsq_table[i])
    6446           0 :                         ret = 1;
    6447             :                 }
    6448             :             }
    6449             :         }
    6450             :         else {
    6451           0 :             if (context_ptr->blk_geom->shape != PART_N) {
    6452           0 :                 ret = 0;
    6453           0 :                 for (int i = 0; i < nsq_max_shapes_md; i++) {
    6454           0 :                     if (context_ptr->blk_geom->shape == context_ptr->nsq_table[i])
    6455           0 :                         ret = 1;
    6456             :                 }
    6457             :             }
    6458             :         }
    6459             :     }
    6460             : #else
    6461             :     if (is_nsq_table_used) {
    6462             :         if (context_ptr->blk_geom->shape != PART_N) {
    6463             :             ret = 0;
    6464             :             for (int i = 0; i < nsq_max_shapes_md; i++) {
    6465             :                 if (context_ptr->blk_geom->shape == context_ptr->nsq_table[i])
    6466             :                     ret = 1;
    6467             :             }
    6468             :         }
    6469             :     }
    6470             : #endif
    6471      811418 :     return ret;
    6472             : }
    6473             : 
    6474           0 : void init_candidate_buffer(
    6475             :     ModeDecisionCandidate        *candidate_ptr,
    6476             :     uint32_t                        count_non_zero_coeffs[3][MAX_NUM_OF_TU_PER_CU])
    6477             : {
    6478           0 :     candidate_ptr->y_has_coeff = 0;
    6479           0 :     candidate_ptr->u_has_coeff = 0;
    6480           0 :     candidate_ptr->v_has_coeff = 0;
    6481             : 
    6482           0 :     candidate_ptr->full_distortion = 0;
    6483             : 
    6484           0 :     memset(candidate_ptr->eob[0], 0, sizeof(uint16_t)*MAX_TXB_COUNT);
    6485           0 :     memset(count_non_zero_coeffs[0], 0, sizeof(uint32_t)*MAX_NUM_OF_TU_PER_CU);
    6486             : 
    6487           0 :     candidate_ptr->chroma_distortion = 0;
    6488           0 :     candidate_ptr->chroma_distortion_inter_depth = 0;
    6489           0 :     memset(candidate_ptr->eob[1], 0, sizeof(uint16_t)*MAX_TXB_COUNT);
    6490           0 :     memset(count_non_zero_coeffs[1], 0, sizeof(uint32_t)*MAX_NUM_OF_TU_PER_CU);
    6491           0 :     memset(candidate_ptr->eob[2], 0, sizeof(uint16_t)*MAX_TXB_COUNT);
    6492           0 :     memset(count_non_zero_coeffs[2], 0, sizeof(uint32_t)*MAX_NUM_OF_TU_PER_CU);
    6493           0 : }
    6494      811376 : void inter_depth_tx_search(
    6495             :     PictureControlSet                      *picture_control_set_ptr,
    6496             :     ModeDecisionCandidateBuffer            *candidate_buffer,
    6497             :     CodingUnit                             *cu_ptr,
    6498             :     ModeDecisionContext                    *context_ptr,
    6499             :     EbPictureBufferDesc                    *input_picture_ptr,
    6500             :     uint64_t                                ref_fast_cost)
    6501             : {
    6502             :     // Hsan: if Transform Search ON and INTRA, then Tx Type search is performed @ the full loop
    6503      811376 :     uint8_t  tx_search_skip_flag = (picture_control_set_ptr->parent_pcs_ptr->tx_search_level == TX_SEARCH_INTER_DEPTH && (picture_control_set_ptr->parent_pcs_ptr->atb_mode == 0 || candidate_buffer ->candidate_ptr->type == INTER_MODE)) ? get_skip_tx_search_flag(
    6504           0 :         context_ptr->blk_geom->sq_size,
    6505             :         ref_fast_cost,
    6506           0 :         *candidate_buffer->fast_cost_ptr,
    6507           0 :         picture_control_set_ptr->parent_pcs_ptr->tx_weight) : 1;
    6508      811381 :     if (!tx_search_skip_flag) {
    6509           0 :         uint64_t      y_full_distortion[DIST_CALC_TOTAL] = { 0 };
    6510             :         uint32_t      count_non_zero_coeffs[3][MAX_NUM_OF_TU_PER_CU];
    6511             : 
    6512             :         uint64_t      cbFullDistortion[DIST_CALC_TOTAL];
    6513             :         uint64_t      crFullDistortion[DIST_CALC_TOTAL];
    6514             : 
    6515           0 :         uint64_t      y_coeff_bits = 0;
    6516           0 :         uint64_t      cb_coeff_bits = 0;
    6517           0 :         uint64_t      cr_coeff_bits = 0;
    6518             : 
    6519           0 :         ModeDecisionCandidate                *candidate_ptr = candidate_buffer->candidate_ptr;
    6520             : 
    6521           0 :         init_candidate_buffer(
    6522             :             candidate_ptr,
    6523             :             count_non_zero_coeffs);
    6524             : 
    6525           0 :         product_full_loop_tx_search(
    6526             :             candidate_buffer,
    6527             :             context_ptr,
    6528             :             picture_control_set_ptr
    6529             :         );
    6530             : 
    6531           0 :         candidate_ptr->full_distortion = 0;
    6532             : 
    6533           0 :         memset(candidate_ptr->eob[0], 0, sizeof(uint16_t)*MAX_TXB_COUNT);
    6534             : 
    6535             :         //re-init
    6536           0 :         candidate_ptr->y_has_coeff = 0;
    6537             : 
    6538           0 :         product_full_loop(
    6539             :             candidate_buffer,
    6540             :             context_ptr,
    6541             :             picture_control_set_ptr,
    6542             :             input_picture_ptr,
    6543           0 :             context_ptr->cu_ptr->qp,
    6544             :             &(*count_non_zero_coeffs[0]),
    6545             :             &y_coeff_bits,
    6546             :             &y_full_distortion[0]);
    6547             : 
    6548           0 :         candidate_ptr->chroma_distortion_inter_depth = 0;
    6549           0 :         candidate_ptr->chroma_distortion = 0;
    6550             : 
    6551             :         //CHROMA
    6552           0 :         cbFullDistortion[DIST_CALC_RESIDUAL] = 0;
    6553           0 :         crFullDistortion[DIST_CALC_RESIDUAL] = 0;
    6554           0 :         cbFullDistortion[DIST_CALC_PREDICTION] = 0;
    6555           0 :         crFullDistortion[DIST_CALC_PREDICTION] = 0;
    6556             : 
    6557           0 :         cb_coeff_bits = 0;
    6558           0 :         cr_coeff_bits = 0;
    6559             : 
    6560             :         // FullLoop and TU search
    6561           0 :         uint16_t cb_qp = context_ptr->qp;
    6562           0 :         uint16_t cr_qp = context_ptr->qp;
    6563           0 :         if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
    6564           0 :             full_loop_r(
    6565             :                 context_ptr->sb_ptr,
    6566             :                 candidate_buffer,
    6567             :                 context_ptr,
    6568             :                 input_picture_ptr,
    6569             :                 picture_control_set_ptr,
    6570             :                 PICTURE_BUFFER_DESC_CHROMA_MASK,
    6571             :                 cb_qp,
    6572             :                 cr_qp,
    6573             :                 &(*count_non_zero_coeffs[1]),
    6574             :                 &(*count_non_zero_coeffs[2]));
    6575             : 
    6576           0 :             cu_full_distortion_fast_tu_mode_r(
    6577             :                 context_ptr->sb_ptr,
    6578             :                 candidate_buffer,
    6579             :                 context_ptr,
    6580             :                 candidate_ptr,
    6581             :                 picture_control_set_ptr,
    6582             :                 input_picture_ptr,
    6583             :                 cbFullDistortion,
    6584             :                 crFullDistortion,
    6585             :                 count_non_zero_coeffs,
    6586             :                 COMPONENT_CHROMA,
    6587             :                 &cb_coeff_bits,
    6588             :                 &cr_coeff_bits,
    6589             :                 1);
    6590             : 
    6591           0 :             candidate_ptr->block_has_coeff = (candidate_ptr->y_has_coeff | candidate_ptr->u_has_coeff | candidate_ptr->v_has_coeff) ? EB_TRUE : EB_FALSE;
    6592             :         }
    6593             : 
    6594           0 :         Av1ProductFullCostFuncTable[candidate_ptr->type](
    6595             :             picture_control_set_ptr,
    6596             :             context_ptr,
    6597             :             candidate_buffer,
    6598             :             cu_ptr,
    6599             :             y_full_distortion,
    6600             :             cbFullDistortion,
    6601             :             crFullDistortion,
    6602           0 :             context_ptr->full_lambda,
    6603             :             &y_coeff_bits,
    6604             :             &cb_coeff_bits,
    6605             :             &cr_coeff_bits,
    6606           0 :             context_ptr->blk_geom->bsize);
    6607             : 
    6608           0 :         candidate_buffer->cb_distortion[DIST_CALC_RESIDUAL] = cbFullDistortion[DIST_CALC_RESIDUAL];
    6609           0 :         candidate_buffer->cb_distortion[DIST_CALC_PREDICTION] = cbFullDistortion[DIST_CALC_PREDICTION];
    6610           0 :         candidate_buffer->cb_coeff_bits = cb_coeff_bits;
    6611             : 
    6612           0 :         candidate_buffer->cr_distortion[DIST_CALC_RESIDUAL] = crFullDistortion[DIST_CALC_RESIDUAL];
    6613           0 :         candidate_buffer->cr_distortion[DIST_CALC_PREDICTION] = crFullDistortion[DIST_CALC_PREDICTION];
    6614           0 :         candidate_buffer->cr_coeff_bits = cr_coeff_bits;
    6615             : 
    6616           0 :         candidate_buffer->candidate_ptr->full_distortion = (uint32_t)(y_full_distortion[0]);
    6617             : 
    6618           0 :         candidate_buffer->y_coeff_bits = y_coeff_bits;
    6619           0 :         candidate_ptr->full_distortion = (uint32_t)(y_full_distortion[0]);
    6620             :         //Update tx
    6621           0 :         context_ptr->md_local_cu_unit[cu_ptr->mds_idx].cost = *(candidate_buffer->full_cost_ptr);
    6622           0 :         context_ptr->md_local_cu_unit[cu_ptr->mds_idx].cost = (context_ptr->md_local_cu_unit[cu_ptr->mds_idx].cost - candidate_buffer->candidate_ptr->chroma_distortion) + candidate_buffer->candidate_ptr->chroma_distortion_inter_depth;
    6623             : 
    6624           0 :         if (candidate_ptr->type == INTRA_MODE)
    6625           0 :             context_ptr->md_local_cu_unit[cu_ptr->mds_idx].cost_luma = candidate_buffer->full_cost_luma;
    6626             : 
    6627           0 :         context_ptr->md_ep_pipe_sb[cu_ptr->mds_idx].merge_cost = *candidate_buffer->full_cost_merge_ptr;
    6628           0 :         context_ptr->md_ep_pipe_sb[cu_ptr->mds_idx].skip_cost = *candidate_buffer->full_cost_skip_ptr;
    6629             : 
    6630           0 :         if (candidate_ptr->type == INTER_MODE && candidate_ptr->merge_flag == EB_TRUE)
    6631           0 :             context_ptr->md_ep_pipe_sb[cu_ptr->leaf_index].chroma_distortion = candidate_buffer->candidate_ptr->chroma_distortion;
    6632           0 :         context_ptr->md_local_cu_unit[cu_ptr->mds_idx].full_distortion = candidate_buffer->candidate_ptr->full_distortion;
    6633             : 
    6634           0 :         context_ptr->md_local_cu_unit[cu_ptr->mds_idx].chroma_distortion = (uint32_t)candidate_buffer->candidate_ptr->chroma_distortion;
    6635           0 :         context_ptr->md_local_cu_unit[cu_ptr->mds_idx].chroma_distortion_inter_depth = (uint32_t)candidate_buffer->candidate_ptr->chroma_distortion_inter_depth;
    6636             : 
    6637             :         //cu_ptr->prediction_mode_flag = candidate_ptr->type;
    6638           0 :         cu_ptr->skip_flag = candidate_ptr->skip_flag; // note, the skip flag is re-checked in the ENCDEC process
    6639           0 :         cu_ptr->block_has_coeff = ((candidate_ptr->block_has_coeff) > 0) ? EB_TRUE : EB_FALSE;
    6640             :         // This kernel assumes no atb
    6641           0 :         cu_ptr->quantized_dc[0][0] = candidate_buffer->candidate_ptr->quantized_dc[0][0];
    6642           0 :         cu_ptr->quantized_dc[1][0] = candidate_buffer->candidate_ptr->quantized_dc[1][0];
    6643           0 :         cu_ptr->quantized_dc[2][0] = candidate_buffer->candidate_ptr->quantized_dc[2][0];
    6644             : 
    6645           0 :         context_ptr->md_local_cu_unit[cu_ptr->mds_idx].count_non_zero_coeffs = candidate_ptr->count_non_zero_coeffs;
    6646             : 
    6647             :         TransformUnit        *txb_ptr;
    6648             :         uint32_t                txb_itr;
    6649             :         uint32_t                tu_index;
    6650             :         uint32_t                tuTotalCount;
    6651           0 :         tuTotalCount = context_ptr->blk_geom->txb_count[candidate_buffer->candidate_ptr->tx_depth];
    6652           0 :         tu_index = 0;
    6653           0 :         txb_itr = 0;
    6654             : 
    6655             : #if NO_ENCDEC
    6656             :         int32_t txb_1d_offset = 0, txb_1d_offset_uv = 0;
    6657             : 
    6658             :         cu_ptr->block_has_coeff = 0;
    6659             : #endif
    6660             : 
    6661             :         // Set TU
    6662             :         do {
    6663           0 :             txb_ptr = &cu_ptr->transform_unit_array[tu_index];
    6664             : 
    6665           0 :             txb_ptr->split_flag = EB_FALSE;
    6666           0 :             txb_ptr->y_has_coeff = (EbBool)(((candidate_ptr->y_has_coeff)  & (1 << tu_index)) > 0);
    6667           0 :             txb_ptr->u_has_coeff = (EbBool)(((candidate_ptr->u_has_coeff) & (1 << (tu_index))) > 0);
    6668           0 :             txb_ptr->v_has_coeff = (EbBool)(((candidate_ptr->v_has_coeff) & (1 << (tu_index))) > 0);
    6669           0 :             txb_ptr->transform_type[PLANE_TYPE_Y] = candidate_ptr->transform_type[tu_index];
    6670           0 :             txb_ptr->transform_type[PLANE_TYPE_UV] = candidate_ptr->transform_type_uv;
    6671             : 
    6672             : #if NO_ENCDEC
    6673             : 
    6674             :             if (context_ptr->blk_geom->has_uv) {
    6675             :                 cu_ptr->block_has_coeff |= txb_ptr->y_has_coeff;
    6676             :                 cu_ptr->block_has_coeff |= txb_ptr->u_has_coeff;
    6677             :                 cu_ptr->block_has_coeff |= txb_ptr->v_has_coeff;
    6678             :             }
    6679             :             else
    6680             :                 cu_ptr->block_has_coeff |= txb_ptr->y_has_coeff;
    6681             :             cu_ptr->cand_buff_index = lowestCostIndex;
    6682             : 
    6683             :             cu_ptr->skip_flag = 0;   //SKIP is turned OFF for this case!!
    6684             :             txb_ptr->nz_coef_count[0] = candidate_ptr->eob[0][tu_index];
    6685             :             txb_ptr->nz_coef_count[1] = candidate_ptr->eob[1][tu_index];
    6686             :             txb_ptr->nz_coef_count[2] = candidate_ptr->eob[2][tu_index];
    6687             : 
    6688             :             if (pu_ptr->inter_pred_direction_index == UNI_PRED_LIST_0) {
    6689             :                 cu_ptr->predmv[0].as_mv.col = candidate_ptr->motion_vector_pred_x[REF_LIST_0];
    6690             :                 cu_ptr->predmv[0].as_mv.row = candidate_ptr->motion_vector_pred_y[REF_LIST_0];
    6691             :             }
    6692             :             else if (pu_ptr->inter_pred_direction_index == UNI_PRED_LIST_1) {
    6693             :                 cu_ptr->predmv[0].as_mv.col = candidate_ptr->motion_vector_pred_x[REF_LIST_1];
    6694             :                 cu_ptr->predmv[0].as_mv.row = candidate_ptr->motion_vector_pred_y[REF_LIST_1];
    6695             :             }
    6696             :             else if (pu_ptr->inter_pred_direction_index == BI_PRED) {
    6697             :                 cu_ptr->predmv[0].as_mv.col = candidate_ptr->motion_vector_pred_x[REF_LIST_0];
    6698             :                 cu_ptr->predmv[0].as_mv.row = candidate_ptr->motion_vector_pred_y[REF_LIST_0];
    6699             :                 cu_ptr->predmv[1].as_mv.col = candidate_ptr->motion_vector_pred_x[REF_LIST_1];
    6700             :                 cu_ptr->predmv[1].as_mv.row = candidate_ptr->motion_vector_pred_y[REF_LIST_1];
    6701             :             }
    6702             : #endif
    6703             : #if NO_ENCDEC
    6704             :             //copy coeff
    6705             :             {
    6706             :                 uint32_t  bwidth = context_ptr->blk_geom->tx_width[txb_itr] < 64 ? context_ptr->blk_geom->tx_width[txb_itr] : 32;
    6707             :                 uint32_t  bheight = context_ptr->blk_geom->tx_height[txb_itr] < 64 ? context_ptr->blk_geom->tx_height[txb_itr] : 32;
    6708             : 
    6709             :                 int32_t* src_ptr = &(((int32_t*)buffer_ptr_array[lowestCostIndex]->residual_quant_coeff_ptr->buffer_y)[txb_1d_offset]);
    6710             :                 int32_t* dst_ptr = &(((int32_t*)context_ptr->cu_ptr->coeff_tmp->buffer_y)[txb_1d_offset]);
    6711             : 
    6712             :                 uint32_t j;
    6713             : 
    6714             :                 for (j = 0; j < bheight; j++)
    6715             :                     memcpy(dst_ptr + j * bwidth, src_ptr + j * bwidth, bwidth * sizeof(int32_t));
    6716             :                 if (context_ptr->blk_geom->has_uv)
    6717             :                 {
    6718             :                     // Cb
    6719             :                     bwidth = context_ptr->blk_geom->tx_width_uv[txb_itr];
    6720             :                     bheight = context_ptr->blk_geom->tx_height_uv[txb_itr];
    6721             : 
    6722             :                     src_ptr = &(((int32_t*)buffer_ptr_array[lowestCostIndex]->residual_quant_coeff_ptr->buffer_cb)[txb_1d_offset_uv]);
    6723             :                     dst_ptr = &(((int32_t*)context_ptr->cu_ptr->coeff_tmp->buffer_cb)[txb_1d_offset_uv]);
    6724             : 
    6725             :                     for (j = 0; j < bheight; j++)
    6726             :                         memcpy(dst_ptr + j * bwidth, src_ptr + j * bwidth, bwidth * sizeof(int32_t));
    6727             :                     src_ptr = &(((int32_t*)buffer_ptr_array[lowestCostIndex]->residual_quant_coeff_ptr->buffer_cr)[txb_1d_offset_uv]);
    6728             :                     dst_ptr = &(((int32_t*)context_ptr->cu_ptr->coeff_tmp->buffer_cr)[txb_1d_offset_uv]);
    6729             : 
    6730             :                     for (j = 0; j < bheight; j++)
    6731             :                         memcpy(dst_ptr + j * bwidth, src_ptr + j * bwidth, bwidth * sizeof(int32_t));
    6732             :                 }
    6733             : 
    6734             :                 txb_1d_offset += context_ptr->blk_geom->tx_width[txb_itr] * context_ptr->blk_geom->tx_height[txb_itr];
    6735             :                 if (context_ptr->blk_geom->has_uv)
    6736             :                     txb_1d_offset_uv += context_ptr->blk_geom->tx_width_uv[txb_itr] * context_ptr->blk_geom->tx_height_uv[txb_itr];
    6737             :             }
    6738             : 
    6739             : #endif
    6740             : 
    6741           0 :             ++tu_index;
    6742           0 :             ++txb_itr;
    6743           0 :         } while (txb_itr < tuTotalCount);
    6744             :     }
    6745      811381 : }
    6746             : 
    6747             : /****************************************************
    6748             : * generate the the size in pixel for partition code
    6749             : ****************************************************/
    6750           0 : uint8_t get_part_side(
    6751             :     PartitionContextType part) {
    6752           0 :     switch (part) {
    6753           0 :     case 31:
    6754           0 :         return 4;
    6755             :         break;
    6756           0 :     case 30:
    6757           0 :         return 8;
    6758             :         break;
    6759           0 :     case 28:
    6760           0 :         return 16;
    6761             :         break;
    6762           0 :     case 24:
    6763           0 :         return 32;
    6764             :         break;
    6765           0 :     case 16:
    6766           0 :         return 64;
    6767             :         break;
    6768           0 :     case 0:
    6769           0 :         return 128;
    6770             :         break;
    6771           0 :     default:
    6772           0 :         return 255;
    6773             :         printf("error: non supported partition!!\n");
    6774             :         break;
    6775             :     }
    6776             : }
    6777             : /****************************************************
    6778             : * Return a predicted Shape based on the above and
    6779             : * left partitions
    6780             : ****************************************************/
    6781           0 : PART get_partition_shape(
    6782             :     PartitionContextType above,
    6783             :     PartitionContextType left,
    6784             :     uint8_t           width,
    6785             :     uint8_t           height) {
    6786           0 :     uint8_t above_size = get_part_side(above);
    6787           0 :     uint8_t left_size = get_part_side(left);
    6788           0 :     PART part = PART_N;
    6789             : 
    6790           0 :     if (above_size == width && left_size == height)
    6791           0 :         part = PART_N;
    6792           0 :     else if (above_size > width && left_size > height)
    6793           0 :         part = PART_N;
    6794           0 :     else if (above_size > width) {
    6795           0 :         if (left_size == height)
    6796           0 :             part = PART_N;
    6797           0 :         else if (left_size < (height / 2))
    6798           0 :             part = PART_H4;
    6799           0 :         else if (left_size < height)
    6800           0 :             part = PART_H;
    6801             :         else
    6802           0 :             printf("error: unsupported left_size\n");
    6803             :     }
    6804           0 :     else if (left_size > height) {
    6805           0 :         if (above_size == width)
    6806           0 :             part = PART_N;
    6807           0 :         else if (above_size < (width / 2))
    6808           0 :             part = PART_V4;
    6809           0 :         else if (above_size < width)
    6810           0 :             part = PART_V;
    6811             :         else
    6812           0 :             printf("error: unsupported above_size\n");
    6813             :     }
    6814           0 :     else if (above_size < width) {
    6815           0 :         if (left_size == height)
    6816           0 :             part = PART_VA;
    6817           0 :         else if (left_size < height)
    6818           0 :             part = PART_S;
    6819             :         else
    6820           0 :             printf("error: unsupported left_size\n");
    6821             :     }
    6822           0 :     else if (left_size < height) {
    6823           0 :         if (above_size == width)
    6824           0 :             part = PART_HA;
    6825           0 :         else if (above_size < width)
    6826           0 :             part = PART_S;
    6827             :         else
    6828           0 :             printf("error: unsupported above_size\n");
    6829             :     }
    6830           0 :     else if (above_size == width) {
    6831           0 :         if (left_size < height)
    6832           0 :             part = PART_HB;
    6833             :         else
    6834           0 :             printf("error: unsupported left_size\n");
    6835             :     }
    6836           0 :     else if (left_size == height) {
    6837           0 :         if (above_size == width)
    6838           0 :             part = PART_HB;
    6839             :         else
    6840           0 :             printf("error: unsupported above_size\n");
    6841             :     }
    6842             :     else
    6843           0 :         printf("error: unsupported above_size && left_size\n");
    6844           0 :     return part;
    6845             : };
    6846             : 
    6847             : #if ADJUST_NSQ_RANK_BASED_ON_NEIGH
    6848             : /****************************************************
    6849             : * Adjust the nsq_rank in order to keep the most
    6850             : * probable Shape to be selected in the lowest index
    6851             : ****************************************************/
    6852           0 : void  adjust_nsq_rank(
    6853             :     PictureControlSet            *picture_control_set_ptr,
    6854             :     ModeDecisionContext          *context_ptr,
    6855             :     const SequenceControlSet     *sequence_control_set_ptr,
    6856             :     LargestCodingUnit            *sb_ptr,
    6857             :     NeighborArrayUnit            *leaf_partition_neighbor_array) {
    6858           0 :     const uint32_t                lcu_addr = sb_ptr->index;
    6859           0 :     uint8_t ol_part1 = context_ptr->best_nsq_sahpe1;
    6860           0 :     uint8_t ol_part2 = context_ptr->best_nsq_sahpe2;
    6861           0 :     uint8_t ol_part3 = context_ptr->best_nsq_sahpe3;
    6862           0 :     uint8_t ol_part4 = context_ptr->best_nsq_sahpe4;
    6863           0 :     uint8_t ol_part5 = context_ptr->best_nsq_sahpe5;
    6864           0 :     uint8_t ol_part6 = context_ptr->best_nsq_sahpe6;
    6865           0 :     uint8_t ol_part7 = context_ptr->best_nsq_sahpe7;
    6866           0 :     uint8_t ol_part8 = context_ptr->best_nsq_sahpe8;
    6867           0 :     EbBool is_compound_enabled = (picture_control_set_ptr->parent_pcs_ptr->reference_mode == SINGLE_REFERENCE) ? 0 : 1;
    6868             :     uint32_t me_sb_addr;
    6869             :     uint32_t me_2Nx2N_table_offset;
    6870             :     uint32_t max_number_of_pus_per_sb;
    6871           0 :     uint32_t geom_offset_x = 0;
    6872           0 :     uint32_t geom_offset_y = 0;
    6873           0 :     uint8_t cnt[PART_S + 1] = { 0 };
    6874           0 :     if (sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128) {
    6875           0 :         uint32_t me_sb_size = sequence_control_set_ptr->sb_sz;
    6876           0 :         uint32_t me_pic_width_in_sb = (sequence_control_set_ptr->seq_header.max_frame_width + sequence_control_set_ptr->sb_sz - 1) / me_sb_size;
    6877           0 :         uint32_t me_sb_x = (context_ptr->cu_origin_x / me_sb_size);
    6878           0 :         uint32_t me_sb_y = (context_ptr->cu_origin_y / me_sb_size);
    6879           0 :         me_sb_addr = me_sb_x + me_sb_y * me_pic_width_in_sb;
    6880           0 :         geom_offset_x = (me_sb_x & 0x1) * me_sb_size;
    6881           0 :         geom_offset_y = (me_sb_y & 0x1) * me_sb_size;
    6882             :     }
    6883             :     else
    6884           0 :         me_sb_addr = lcu_addr;
    6885           0 :     max_number_of_pus_per_sb = picture_control_set_ptr->parent_pcs_ptr->max_number_of_pus_per_sb;
    6886           0 :     me_2Nx2N_table_offset = (context_ptr->blk_geom->bwidth == 4 || context_ptr->blk_geom->bheight == 4 || context_ptr->blk_geom->bwidth == 128 || context_ptr->blk_geom->bheight == 128) ? 0 :
    6887             : 
    6888           0 :         get_me_info_index(
    6889             :             max_number_of_pus_per_sb,
    6890             :             context_ptr->blk_geom,
    6891             :             geom_offset_x,
    6892             :             geom_offset_y);
    6893             : 
    6894           0 :     const MeLcuResults *me_results = picture_control_set_ptr->parent_pcs_ptr->me_results[me_sb_addr];
    6895           0 :     uint8_t nsq0 = me_results->me_nsq_0[me_2Nx2N_table_offset];
    6896           0 :     uint8_t nsq1 = me_results->me_nsq_1[me_2Nx2N_table_offset];
    6897             : 
    6898           0 :     uint8_t me_part_0 = nsq0 == 0 ? PART_N : nsq0 == 1 ? PART_H : nsq0 == 2 ? PART_V : nsq0 == 3 ? PART_H4 : nsq0 == 4 ? PART_V4 : nsq0 == 5 ? PART_S : 0;
    6899           0 :     uint8_t me_part_1 = nsq1 == 0 ? PART_N : nsq1 == 1 ? PART_H : nsq1 == 2 ? PART_V : nsq1 == 3 ? PART_H4 : nsq1 == 4 ? PART_V4 : nsq1 == 5 ? PART_S : 0;
    6900             : 
    6901             :     // Generate Partition context
    6902           0 :     uint32_t partition_left_neighbor_index = get_neighbor_array_unit_left_index(
    6903             :         leaf_partition_neighbor_array,
    6904           0 :         context_ptr->cu_origin_y);
    6905           0 :     uint32_t partition_above_neighbor_index = get_neighbor_array_unit_top_index(
    6906             :         leaf_partition_neighbor_array,
    6907           0 :         context_ptr->cu_origin_x);
    6908           0 :     const PartitionContextType above_ctx = (((PartitionContext*)leaf_partition_neighbor_array->top_array)[partition_above_neighbor_index].above == (int8_t)INVALID_NEIGHBOR_DATA) ?
    6909           0 :         0 : ((PartitionContext*)leaf_partition_neighbor_array->top_array)[partition_above_neighbor_index].above;
    6910           0 :     const PartitionContextType left_ctx = (((PartitionContext*)leaf_partition_neighbor_array->left_array)[partition_left_neighbor_index].left == (int8_t)INVALID_NEIGHBOR_DATA) ?
    6911           0 :         0 : ((PartitionContext*)leaf_partition_neighbor_array->left_array)[partition_left_neighbor_index].left;
    6912             : 
    6913           0 :     PART neighbor_part = get_partition_shape(
    6914             :         above_ctx,
    6915             :         left_ctx,
    6916           0 :         context_ptr->blk_geom->bwidth,
    6917           0 :         context_ptr->blk_geom->bheight);
    6918             : 
    6919             :     //init table
    6920           0 :     context_ptr->nsq_table[0] = PART_H;
    6921           0 :     context_ptr->nsq_table[1] = PART_V;
    6922           0 :     context_ptr->nsq_table[2] = PART_HA;
    6923           0 :     context_ptr->nsq_table[3] = PART_HB;
    6924           0 :     context_ptr->nsq_table[4] = PART_VA;
    6925           0 :     context_ptr->nsq_table[5] = PART_VB;
    6926           0 :     context_ptr->nsq_table[6] = PART_H4;
    6927           0 :     context_ptr->nsq_table[7] = PART_V4;
    6928             : 
    6929           0 :     if (is_compound_enabled == 0) me_part_1 = me_part_0;
    6930             : 
    6931             :     // Insert predicted Shapes based on ME information
    6932           0 :     if (me_part_0 != me_part_1) {
    6933           0 :         context_ptr->nsq_table[0] = me_part_0;
    6934           0 :         context_ptr->nsq_table[1] = me_part_1;
    6935             : 
    6936           0 :         if (me_part_0 == PART_H) {
    6937           0 :             context_ptr->nsq_table[2] = PART_HA;
    6938           0 :             context_ptr->nsq_table[3] = PART_HB;
    6939           0 :             context_ptr->nsq_table[4] = me_part_1 != PART_H4 ? PART_H4 : PART_V;
    6940             :         }
    6941           0 :         else if (me_part_0 == PART_V) {
    6942           0 :             context_ptr->nsq_table[2] = PART_VA;
    6943           0 :             context_ptr->nsq_table[3] = PART_VB;
    6944           0 :             context_ptr->nsq_table[4] = me_part_1 != PART_V4 ? PART_V4 : PART_H;
    6945             :         }
    6946           0 :         else if (me_part_0 == PART_H4) {
    6947           0 :             context_ptr->nsq_table[2] = PART_HA;
    6948           0 :             context_ptr->nsq_table[3] = PART_HB;
    6949           0 :             context_ptr->nsq_table[4] = me_part_1 != PART_H ? PART_H : PART_V;
    6950             :         }
    6951           0 :         else if (me_part_0 == PART_V4) {
    6952           0 :             context_ptr->nsq_table[2] = PART_VA;
    6953           0 :             context_ptr->nsq_table[3] = PART_VB;
    6954           0 :             context_ptr->nsq_table[4] = me_part_1 != PART_V ? PART_V : PART_H;
    6955             :         }
    6956           0 :         else if (me_part_0 == PART_S) {
    6957           0 :             context_ptr->nsq_table[2] = PART_VA;
    6958           0 :             context_ptr->nsq_table[3] = PART_HB;
    6959           0 :             context_ptr->nsq_table[4] = me_part_1 != PART_V ? PART_V : PART_H;
    6960             :         }
    6961             :     }
    6962             :     else {
    6963           0 :         context_ptr->nsq_table[0] = me_part_0;
    6964           0 :         if (me_part_0 == PART_H) {
    6965           0 :             context_ptr->nsq_table[1] = PART_HA;
    6966           0 :             context_ptr->nsq_table[2] = PART_HB;
    6967           0 :             context_ptr->nsq_table[3] = PART_H4;
    6968           0 :             context_ptr->nsq_table[4] = PART_V;
    6969             :         }
    6970           0 :         else if (me_part_0 == PART_V) {
    6971           0 :             context_ptr->nsq_table[1] = PART_VA;
    6972           0 :             context_ptr->nsq_table[2] = PART_VB;
    6973           0 :             context_ptr->nsq_table[3] = PART_V4;
    6974           0 :             context_ptr->nsq_table[4] = PART_H;
    6975             :         }
    6976           0 :         else if (me_part_0 == PART_H4) {
    6977           0 :             context_ptr->nsq_table[1] = PART_H;
    6978           0 :             context_ptr->nsq_table[2] = PART_HA;
    6979           0 :             context_ptr->nsq_table[3] = PART_HB;
    6980           0 :             context_ptr->nsq_table[4] = PART_V;
    6981             :         }
    6982           0 :         else if (me_part_0 == PART_V4) {
    6983           0 :             context_ptr->nsq_table[1] = PART_V;
    6984           0 :             context_ptr->nsq_table[2] = PART_VA;
    6985           0 :             context_ptr->nsq_table[3] = PART_VB;
    6986           0 :             context_ptr->nsq_table[4] = PART_H;
    6987             :         }
    6988           0 :         else if (me_part_0 == PART_S) {
    6989           0 :             context_ptr->nsq_table[1] = PART_HA;
    6990           0 :             context_ptr->nsq_table[2] = PART_VA;
    6991           0 :             context_ptr->nsq_table[3] = PART_HB;
    6992           0 :             context_ptr->nsq_table[4] = PART_VB;
    6993             :         }
    6994             :     }
    6995             :     // Insert predicted Shapes based on neighbor information
    6996           0 :     if (neighbor_part == PART_S && me_part_0 == PART_S && me_part_1 == PART_S) {
    6997           0 :         context_ptr->nsq_table[0] = PART_HA;
    6998           0 :         context_ptr->nsq_table[1] = PART_VA;
    6999           0 :         context_ptr->nsq_table[2] = PART_HB;
    7000           0 :         context_ptr->nsq_table[3] = PART_VB;
    7001           0 :         context_ptr->nsq_table[4] = PART_H4;
    7002           0 :         context_ptr->nsq_table[5] = PART_V4;
    7003             :     }
    7004             :     else {
    7005           0 :         if (neighbor_part != PART_N && neighbor_part != PART_S && neighbor_part != me_part_0 && neighbor_part != me_part_1) {
    7006           0 :             context_ptr->nsq_table[5] = context_ptr->nsq_table[4];
    7007           0 :             context_ptr->nsq_table[4] = context_ptr->nsq_table[3];
    7008           0 :             context_ptr->nsq_table[3] = context_ptr->nsq_table[2];
    7009           0 :             context_ptr->nsq_table[2] = context_ptr->nsq_table[1];
    7010           0 :             context_ptr->nsq_table[1] = context_ptr->nsq_table[0];
    7011           0 :             context_ptr->nsq_table[0] = neighbor_part;
    7012             :         }
    7013             :         else
    7014           0 :             context_ptr->nsq_table[5] = neighbor_part != PART_N && neighbor_part != PART_S ? neighbor_part : me_part_0;
    7015             :     }
    7016             : #if MDC_ADAPTIVE_LEVEL
    7017           0 :     if (picture_control_set_ptr->parent_pcs_ptr->enable_adaptive_ol_partitioning) {
    7018             : #else
    7019             :     if (picture_control_set_ptr->parent_pcs_ptr->mdc_depth_level < MAX_MDC_LEVEL) {
    7020             : #endif
    7021           0 :         context_ptr->nsq_table[2] = context_ptr->nsq_table[0] != ol_part1 && context_ptr->nsq_table[1] != ol_part1 ? ol_part1
    7022           0 :             : context_ptr->nsq_table[0] != ol_part2 && context_ptr->nsq_table[1] != ol_part2 ? ol_part2
    7023             :             : ol_part3 != PART_N ? ol_part3 : context_ptr->nsq_table[2];
    7024           0 :         context_ptr->nsq_table[3] = context_ptr->nsq_table[0] != ol_part1 && context_ptr->nsq_table[1] != ol_part1 && context_ptr->nsq_table[2] != ol_part1 ? ol_part1
    7025           0 :             : context_ptr->nsq_table[0] != ol_part2 && context_ptr->nsq_table[1] != ol_part2 && context_ptr->nsq_table[2] != ol_part2 ? ol_part2
    7026           0 :             : context_ptr->nsq_table[0] != ol_part3 && context_ptr->nsq_table[1] != ol_part3 && context_ptr->nsq_table[2] != ol_part3 ? ol_part3
    7027             :             : ol_part4 != PART_N ? ol_part4 : context_ptr->nsq_table[3];
    7028           0 :         context_ptr->nsq_table[4] = context_ptr->nsq_table[0] != ol_part1 && context_ptr->nsq_table[1] != ol_part1 && context_ptr->nsq_table[2] != ol_part1 && context_ptr->nsq_table[3] != ol_part1 ? ol_part1
    7029           0 :             : context_ptr->nsq_table[0] != ol_part2 && context_ptr->nsq_table[1] != ol_part2 && context_ptr->nsq_table[2] != ol_part2 && context_ptr->nsq_table[3] != ol_part2 ? ol_part2
    7030           0 :             : context_ptr->nsq_table[0] != ol_part3 && context_ptr->nsq_table[1] != ol_part3 && context_ptr->nsq_table[2] != ol_part3 && context_ptr->nsq_table[3] != ol_part3 ? ol_part3
    7031           0 :             : context_ptr->nsq_table[0] != ol_part4 && context_ptr->nsq_table[1] != ol_part4 && context_ptr->nsq_table[2] != ol_part4 && context_ptr->nsq_table[3] != ol_part4 ? ol_part4
    7032             :             : ol_part5 != PART_N ? ol_part5 : context_ptr->nsq_table[4];
    7033           0 :         context_ptr->nsq_table[5] = context_ptr->nsq_table[0] != ol_part1 && context_ptr->nsq_table[1] != ol_part1 && context_ptr->nsq_table[2] != ol_part1 && context_ptr->nsq_table[3] != ol_part1 && context_ptr->nsq_table[4] != ol_part1 ? ol_part1
    7034           0 :             : context_ptr->nsq_table[0] != ol_part2 && context_ptr->nsq_table[1] != ol_part2 && context_ptr->nsq_table[2] != ol_part2 && context_ptr->nsq_table[3] != ol_part2 && context_ptr->nsq_table[4] != ol_part2 ? ol_part2
    7035           0 :             : context_ptr->nsq_table[0] != ol_part3 && context_ptr->nsq_table[1] != ol_part3 && context_ptr->nsq_table[2] != ol_part3 && context_ptr->nsq_table[3] != ol_part3 && context_ptr->nsq_table[4] != ol_part3 ? ol_part3
    7036           0 :             : context_ptr->nsq_table[0] != ol_part4 && context_ptr->nsq_table[1] != ol_part4 && context_ptr->nsq_table[2] != ol_part4 && context_ptr->nsq_table[3] != ol_part4 && context_ptr->nsq_table[4] != ol_part4 ? ol_part4
    7037           0 :             : context_ptr->nsq_table[0] != ol_part5 && context_ptr->nsq_table[1] != ol_part5 && context_ptr->nsq_table[2] != ol_part5 && context_ptr->nsq_table[3] != ol_part5 && context_ptr->nsq_table[4] != ol_part5 ? ol_part5
    7038             :             : ol_part6 != PART_N ? ol_part6 : context_ptr->nsq_table[5];
    7039           0 :         context_ptr->nsq_table[6] = context_ptr->nsq_table[0] != ol_part1 && context_ptr->nsq_table[1] != ol_part1 && context_ptr->nsq_table[2] != ol_part1 && context_ptr->nsq_table[3] != ol_part1 && context_ptr->nsq_table[4] != ol_part1 && context_ptr->nsq_table[5] != ol_part1 ? ol_part1
    7040           0 :             : context_ptr->nsq_table[0] != ol_part2 && context_ptr->nsq_table[1] != ol_part2 && context_ptr->nsq_table[2] != ol_part2 && context_ptr->nsq_table[3] != ol_part2 && context_ptr->nsq_table[4] != ol_part2 && context_ptr->nsq_table[5] != ol_part2 ? ol_part2
    7041           0 :             : context_ptr->nsq_table[0] != ol_part3 && context_ptr->nsq_table[1] != ol_part3 && context_ptr->nsq_table[2] != ol_part3 && context_ptr->nsq_table[3] != ol_part3 && context_ptr->nsq_table[4] != ol_part3 && context_ptr->nsq_table[5] != ol_part3 ? ol_part3
    7042           0 :             : context_ptr->nsq_table[0] != ol_part4 && context_ptr->nsq_table[1] != ol_part4 && context_ptr->nsq_table[2] != ol_part4 && context_ptr->nsq_table[3] != ol_part4 && context_ptr->nsq_table[4] != ol_part4 && context_ptr->nsq_table[5] != ol_part4 ? ol_part4
    7043           0 :             : context_ptr->nsq_table[0] != ol_part5 && context_ptr->nsq_table[1] != ol_part5 && context_ptr->nsq_table[2] != ol_part5 && context_ptr->nsq_table[3] != ol_part5 && context_ptr->nsq_table[4] != ol_part5 && context_ptr->nsq_table[5] != ol_part5 ? ol_part5
    7044           0 :             : context_ptr->nsq_table[0] != ol_part6 && context_ptr->nsq_table[1] != ol_part6 && context_ptr->nsq_table[2] != ol_part6 && context_ptr->nsq_table[3] != ol_part6 && context_ptr->nsq_table[4] != ol_part6 && context_ptr->nsq_table[5] != ol_part6 ? ol_part6
    7045             :             : ol_part7;
    7046             : 
    7047             :         // Replace PART_N by best MDC.
    7048           0 :         for (uint8_t idx = 0; idx < NSQ_TAB_SIZE; idx++) {
    7049           0 :             if (context_ptr->nsq_table[idx] == PART_N) {
    7050           0 :                 context_ptr->nsq_table[idx] = ol_part1 != PART_N ? ol_part1 :
    7051             :                     ol_part2 != PART_N ? ol_part2 :
    7052             :                     ol_part3 != PART_N ? ol_part3 :
    7053             :                     ol_part4 != PART_N ? ol_part4 :
    7054             :                     ol_part5 != PART_N ? ol_part5 :
    7055             :                     ol_part6 != PART_N ? ol_part6 :
    7056             :                     ol_part7 != PART_N ? ol_part7 : ol_part8;
    7057           0 :                 break;
    7058             :             }
    7059             :         }
    7060             :     }
    7061             :     // Remove duplicate candidates
    7062           0 :     for (int pidx = 0; pidx < NSQ_TAB_SIZE; pidx++)
    7063           0 :         cnt[context_ptr->nsq_table[pidx]]++;
    7064           0 :     cnt[context_ptr->nsq_table[0]] = 1;
    7065           0 :     for (int iter = 0; iter < NSQ_TAB_SIZE - 1; iter++) {
    7066           0 :         for (int idx = 1 + iter; idx < NSQ_TAB_SIZE; idx++) {
    7067           0 :             if (context_ptr->nsq_table[iter] != context_ptr->nsq_table[idx])
    7068           0 :                 continue;
    7069             :             else {
    7070           0 :                 for (int i = idx; i < NSQ_TAB_SIZE; i++) {
    7071           0 :                     if (idx < NSQ_TAB_SIZE - 1)
    7072           0 :                         context_ptr->nsq_table[idx] = context_ptr->nsq_table[idx + 1];
    7073           0 :                     else if (idx == NSQ_TAB_SIZE - 1) {
    7074           0 :                         for (int pidx = 1; pidx < PART_S; pidx++) {
    7075           0 :                             if (cnt[pidx] == 0)
    7076           0 :                                 context_ptr->nsq_table[idx] = (PART)pidx;
    7077             :                         }
    7078             :                     }
    7079             :                 }
    7080             :             }
    7081             :         }
    7082             :     }
    7083           0 : }
    7084             : #endif
    7085             : 
    7086             : /****************************************************
    7087             : * Reorder context_ptr->nsq_table in order to keep the most probable Shapes, as predicted from the ME results (me_nsq_0 / me_nsq_1) and the neighbor partition context,
    7088             : * in the lowest indices; a duplicate-removal pass then runs over the first NSQ_TAB_SIZE entries.
    7089             : ****************************************************/
    7090           0 : void  order_nsq_table(
    7091             :     PictureControlSet            *picture_control_set_ptr, // read via parent_pcs_ptr: frm_hdr, me_results, max_number_of_pus_per_sb
    7092             :     ModeDecisionContext          *context_ptr, // nsq_table is written; cu_origin_x/y and blk_geom are read
    7093             :     const SequenceControlSet     *sequence_control_set_ptr, // read: seq_header.sb_size, seq_header.max_frame_width, sb_sz
    7094             :     LargestCodingUnit            *sb_ptr, // only sb_ptr->index is read
    7095             :     NeighborArrayUnit            *leaf_partition_neighbor_array) { // top_array / left_array supply the above / left partition context
    7096           0 :     FrameHeader *frm_hdr = &picture_control_set_ptr->parent_pcs_ptr->frm_hdr;
    7097           0 :     const uint32_t             lcuAddr = sb_ptr->index;
    7098           0 :     EbBool isCompoundEnabled = (frm_hdr->reference_mode == SINGLE_REFERENCE) ? 0 : 1; // 0 only for SINGLE_REFERENCE
    7099             :     uint32_t me_sb_addr;
    7100             :     uint32_t me2Nx2NTableOffset;
    7101             :     uint32_t max_number_of_pus_per_sb;
    7102           0 :     uint32_t geom_offset_x = 0;
    7103           0 :     uint32_t geom_offset_y = 0;
    7104           0 :     uint8_t cnt[PART_S + 1] = { 0 }; // per-shape occurrence counts, used by the duplicate-removal pass at the end
    7105           0 :     if (sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128) { // 128x128 SB: address the ME results from the block origin in units of sb_sz
    7106           0 :         uint32_t me_sb_size = sequence_control_set_ptr->sb_sz;
    7107           0 :         uint32_t me_pic_width_in_sb = (sequence_control_set_ptr->seq_header.max_frame_width + sequence_control_set_ptr->sb_sz - 1) / me_sb_size; // ceiling division
    7108           0 :         uint32_t me_sb_x = (context_ptr->cu_origin_x / me_sb_size);
    7109           0 :         uint32_t me_sb_y = (context_ptr->cu_origin_y / me_sb_size);
    7110           0 :         me_sb_addr = me_sb_x + me_sb_y * me_pic_width_in_sb;
    7111           0 :         geom_offset_x = (me_sb_x & 0x1) * me_sb_size; // 0 or me_sb_size, depending on the parity of me_sb_x
    7112           0 :         geom_offset_y = (me_sb_y & 0x1) * me_sb_size; // 0 or me_sb_size, depending on the parity of me_sb_y
    7113             :     }
    7114             :     else
    7115           0 :         me_sb_addr = lcuAddr; // otherwise the ME results are addressed by the SB index
    7116           0 :     max_number_of_pus_per_sb = picture_control_set_ptr->parent_pcs_ptr->max_number_of_pus_per_sb;
    7117           0 :     me2Nx2NTableOffset = (context_ptr->blk_geom->bwidth == 4 || context_ptr->blk_geom->bheight == 4 || context_ptr->blk_geom->bwidth == 128 || context_ptr->blk_geom->bheight == 128) ? 0 : // offset 0 when a block dimension is 4 or 128
    7118             : 
    7119           0 :         get_me_info_index(
    7120             :             max_number_of_pus_per_sb,
    7121             :             context_ptr->blk_geom,
    7122             :             geom_offset_x,
    7123             :             geom_offset_y);
    7124             : 
    7125           0 :     const MeLcuResults *me_results = picture_control_set_ptr->parent_pcs_ptr->me_results[me_sb_addr];
    7126           0 :     uint8_t nsq0 = me_results->me_nsq_0[me2Nx2NTableOffset];
    7127           0 :     uint8_t nsq1 = me_results->me_nsq_1[me2Nx2NTableOffset];
    7128           0 :     uint8_t me_part_0 = nsq0 == 0 ? PART_N : nsq0 == 1 ? PART_H : nsq0 == 2 ? PART_V : nsq0 == 3 ? PART_H4 : nsq0 == 4 ? PART_V4 : nsq0 == 5 ? PART_S : 0; // ME code 0..5 -> PART_N, PART_H, PART_V, PART_H4, PART_V4, PART_S; any other code yields 0
    7129           0 :     uint8_t me_part_1 = nsq1 == 0 ? PART_N : nsq1 == 1 ? PART_H : nsq1 == 2 ? PART_V : nsq1 == 3 ? PART_H4 : nsq1 == 4 ? PART_V4 : nsq1 == 5 ? PART_S : 0; // same mapping for the second ME shape
    7130             : 
    7131             :     // Generate Partition context: look up the above / left entries of the leaf partition neighbor array
    7132           0 :     uint32_t partition_left_neighbor_index = get_neighbor_array_unit_left_index(
    7133             :         leaf_partition_neighbor_array,
    7134           0 :         context_ptr->cu_origin_y);
    7135           0 :     uint32_t partition_above_neighbor_index = get_neighbor_array_unit_top_index(
    7136             :         leaf_partition_neighbor_array,
    7137           0 :         context_ptr->cu_origin_x);
    7138           0 :     const PartitionContextType above_ctx = (((PartitionContext*)leaf_partition_neighbor_array->top_array)[partition_above_neighbor_index].above == (int8_t)INVALID_NEIGHBOR_DATA) ? // INVALID_NEIGHBOR_DATA is replaced by context 0
    7139           0 :         0 : ((PartitionContext*)leaf_partition_neighbor_array->top_array)[partition_above_neighbor_index].above;
    7140           0 :     const PartitionContextType left_ctx = (((PartitionContext*)leaf_partition_neighbor_array->left_array)[partition_left_neighbor_index].left == (int8_t)INVALID_NEIGHBOR_DATA) ? // INVALID_NEIGHBOR_DATA is replaced by context 0
    7141           0 :         0 : ((PartitionContext*)leaf_partition_neighbor_array->left_array)[partition_left_neighbor_index].left;
    7142             : 
    7143           0 :     PART neighbor_part = get_partition_shape( // shape returned for the above / left contexts and the block dimensions
    7144             :         above_ctx,
    7145             :         left_ctx,
    7146           0 :         context_ptr->blk_geom->bwidth,
    7147           0 :         context_ptr->blk_geom->bheight);
    7148             : 
    7149             :     // init table: default shape order written to indices 0..7
    7150           0 :     context_ptr->nsq_table[0] = PART_H;
    7151           0 :     context_ptr->nsq_table[1] = PART_V;
    7152           0 :     context_ptr->nsq_table[2] = PART_HA;
    7153           0 :     context_ptr->nsq_table[3] = PART_HB;
    7154           0 :     context_ptr->nsq_table[4] = PART_VA;
    7155           0 :     context_ptr->nsq_table[5] = PART_VB;
    7156           0 :     context_ptr->nsq_table[6] = PART_H4;
    7157           0 :     context_ptr->nsq_table[7] = PART_V4;
    7158             : 
    7159           0 :     if (isCompoundEnabled == 0) me_part_1 = me_part_0; // single reference: the second ME shape is replaced by the first, so the else path below is taken
    7160             : 
    7161             :     // Insert predicted Shapes based on ME information: slots 0..4 are (re)filled, slots 5..7 keep their init values here
    7162           0 :     if (me_part_0 != me_part_1) { // two distinct ME shapes take slots 0 and 1; slots 2..4 depend on me_part_0
    7163           0 :         context_ptr->nsq_table[0] = me_part_0;
    7164           0 :         context_ptr->nsq_table[1] = me_part_1;
    7165             : 
    7166           0 :         if (me_part_0 == PART_H) {
    7167           0 :             context_ptr->nsq_table[2] = PART_HA;
    7168           0 :             context_ptr->nsq_table[3] = PART_HB;
    7169           0 :             context_ptr->nsq_table[4] = me_part_1 != PART_H4 ? PART_H4 : PART_V; // avoid repeating me_part_1 in slot 4
    7170             :         }
    7171           0 :         else if (me_part_0 == PART_V) {
    7172           0 :             context_ptr->nsq_table[2] = PART_VA;
    7173           0 :             context_ptr->nsq_table[3] = PART_VB;
    7174           0 :             context_ptr->nsq_table[4] = me_part_1 != PART_V4 ? PART_V4 : PART_H;
    7175             :         }
    7176           0 :         else if (me_part_0 == PART_H4) {
    7177           0 :             context_ptr->nsq_table[2] = PART_HA;
    7178           0 :             context_ptr->nsq_table[3] = PART_HB;
    7179           0 :             context_ptr->nsq_table[4] = me_part_1 != PART_H ? PART_H : PART_V;
    7180             :         }
    7181           0 :         else if (me_part_0 == PART_V4) {
    7182           0 :             context_ptr->nsq_table[2] = PART_VA;
    7183           0 :             context_ptr->nsq_table[3] = PART_VB;
    7184           0 :             context_ptr->nsq_table[4] = me_part_1 != PART_V ? PART_V : PART_H;
    7185             :         }
    7186           0 :         else if (me_part_0 == PART_S) {
    7187           0 :             context_ptr->nsq_table[2] = PART_VA;
    7188           0 :             context_ptr->nsq_table[3] = PART_HB;
    7189           0 :             context_ptr->nsq_table[4] = me_part_1 != PART_V ? PART_V : PART_H;
    7190             :         } // no branch for me_part_0 == PART_N: slots 2..4 keep their init values
    7191             :     }
    7192             :     else { // both ME shapes agree: slot 0 gets the shape, slots 1..4 depend on it
    7193           0 :         context_ptr->nsq_table[0] = me_part_0;
    7194           0 :         if (me_part_0 == PART_H) {
    7195           0 :             context_ptr->nsq_table[1] = PART_HA;
    7196           0 :             context_ptr->nsq_table[2] = PART_HB;
    7197           0 :             context_ptr->nsq_table[3] = PART_H4;
    7198           0 :             context_ptr->nsq_table[4] = PART_V;
    7199             :         }
    7200           0 :         else if (me_part_0 == PART_V) {
    7201           0 :             context_ptr->nsq_table[1] = PART_VA;
    7202           0 :             context_ptr->nsq_table[2] = PART_VB;
    7203           0 :             context_ptr->nsq_table[3] = PART_V4;
    7204           0 :             context_ptr->nsq_table[4] = PART_H;
    7205             :         }
    7206           0 :         else if (me_part_0 == PART_H4) {
    7207           0 :             context_ptr->nsq_table[1] = PART_H;
    7208           0 :             context_ptr->nsq_table[2] = PART_HA;
    7209           0 :             context_ptr->nsq_table[3] = PART_HB;
    7210           0 :             context_ptr->nsq_table[4] = PART_V;
    7211             :         }
    7212           0 :         else if (me_part_0 == PART_V4) {
    7213           0 :             context_ptr->nsq_table[1] = PART_V;
    7214           0 :             context_ptr->nsq_table[2] = PART_VA;
    7215           0 :             context_ptr->nsq_table[3] = PART_VB;
    7216           0 :             context_ptr->nsq_table[4] = PART_H;
    7217             :         }
    7218           0 :         else if (me_part_0 == PART_S) {
    7219           0 :             context_ptr->nsq_table[1] = PART_HA;
    7220           0 :             context_ptr->nsq_table[2] = PART_VA;
    7221           0 :             context_ptr->nsq_table[3] = PART_HB;
    7222           0 :             context_ptr->nsq_table[4] = PART_VB;
    7223             :         } // no branch for me_part_0 == PART_N: slots 1..4 keep their init values
    7224             :     }
    7225             : 
    7226             :     // Insert predicted Shapes based on neighbor information
    7227           0 :     if (neighbor_part == PART_S && me_part_0 == PART_S && me_part_1 == PART_S) { // neighbor and both ME shapes are PART_S: overwrite slots 0..5 with a fixed order
    7228           0 :         context_ptr->nsq_table[0] = PART_HA;
    7229           0 :         context_ptr->nsq_table[1] = PART_VA;
    7230           0 :         context_ptr->nsq_table[2] = PART_HB;
    7231           0 :         context_ptr->nsq_table[3] = PART_VB;
    7232           0 :         context_ptr->nsq_table[4] = PART_H4;
    7233           0 :         context_ptr->nsq_table[5] = PART_V4;
    7234             :     }
    7235             :     else {
    7236           0 :         if (neighbor_part != PART_N && neighbor_part != PART_S && neighbor_part != me_part_0 && neighbor_part != me_part_1) { // neighbor shape differs from both ME shapes: move slots 0..4 to 1..5 and put it first
    7237           0 :             context_ptr->nsq_table[5] = context_ptr->nsq_table[4];
    7238           0 :             context_ptr->nsq_table[4] = context_ptr->nsq_table[3];
    7239           0 :             context_ptr->nsq_table[3] = context_ptr->nsq_table[2];
    7240           0 :             context_ptr->nsq_table[2] = context_ptr->nsq_table[1];
    7241           0 :             context_ptr->nsq_table[1] = context_ptr->nsq_table[0];
    7242           0 :             context_ptr->nsq_table[0] = neighbor_part;
    7243             :         }
    7244             :         else
    7245           0 :             context_ptr->nsq_table[5] = neighbor_part != PART_N && neighbor_part != PART_S ? neighbor_part : me_part_0; // slot 5 gets the neighbor shape, or me_part_0 when the neighbor is PART_N / PART_S
    7246             :     }
    7247             : 
    7248             :     // Remove duplicate candidates (see the NOTE(review) remarks below: duplicates can survive this pass as written)
    7249           0 :     for (int pidx = 0; pidx < NSQ_TAB_SIZE; pidx++) // count occurrences of each shape in the first NSQ_TAB_SIZE slots
    7250           0 :         cnt[context_ptr->nsq_table[pidx]]++;
    7251           0 :     cnt[context_ptr->nsq_table[0]] = 1; // the count of slot 0's shape is forced to 1
    7252           0 :     for (int iter = 0; iter < NSQ_TAB_SIZE - 1; iter++) {
    7253           0 :         for (int idx = 1 + iter; idx < NSQ_TAB_SIZE; idx++) { // compare slot iter against every later slot
    7254           0 :             if (context_ptr->nsq_table[iter] != context_ptr->nsq_table[idx])
    7255           0 :                 continue;
    7256             :             else {
    7257           0 :                 for (int i = idx; i < NSQ_TAB_SIZE; i++) { // NOTE(review): i is never used in the body, so each iteration repeats the same idx-based statement - confirm whether a shift of slots idx..end was intended
    7258           0 :                     if (idx < NSQ_TAB_SIZE - 1)
    7259           0 :                         context_ptr->nsq_table[idx] = context_ptr->nsq_table[idx + 1]; // overwrites the duplicate with its successor only; slots idx and idx + 1 are equal afterwards
    7260           0 :                     else if (idx == NSQ_TAB_SIZE - 1) {
    7261           0 :                         for (int pidx = 1; pidx < PART_S; pidx++) { // NOTE(review): no break, so the last pidx in [1, PART_S) with cnt == 0 wins, and cnt is not updated after the assignment
    7262           0 :                             if (cnt[pidx] == 0)
    7263           0 :                                 context_ptr->nsq_table[idx] = (PART)pidx;
    7264             :                         }
    7265             :                     }
    7266             :                 }
    7267             :             }
    7268             :         }
    7269             :     }
    7270           0 : }
    7271           0 : uint8_t check_skip_sub_blks( // returns 1 (after clearing the split flag at blk_geom->sqi_mds) when all conditions below hold, 0 otherwise
    7272             :     PictureControlSet              *picture_control_set_ptr, // read via parent_pcs_ptr: pic_depth_mode, sb_depth_mode_array
    7273             :     ModeDecisionContext            *context_ptr, // md_local_cu_unit and blk_geom are read; md_cu_arr_nsq[].split_flag may be written
    7274             :     CodingUnit                     *cu_ptr, // only cu_ptr->mds_idx is read
    7275             :     uint8_t                           is_complete_sb, // non-zero enables the neighbor-depth test
    7276             :     uint32_t                          sb_index) { // index into sb_depth_mode_array
    7277           0 :     uint8_t skip_sub_blocks = 0;
    7278           0 :     if (picture_control_set_ptr->parent_pcs_ptr->pic_depth_mode == PIC_OPEN_LOOP_DEPTH_MODE || (picture_control_set_ptr->parent_pcs_ptr->pic_depth_mode == PIC_SB_SWITCH_DEPTH_MODE && picture_control_set_ptr->parent_pcs_ptr->sb_depth_mode_array[sb_index] >= SB_OPEN_LOOP_DEPTH_MODE)) // picture-level open-loop depth mode, or SB-switch mode with this SB at SB_OPEN_LOOP_DEPTH_MODE or above
    7279           0 :         if (is_complete_sb)
    7280           0 :             if ((context_ptr->md_local_cu_unit[cu_ptr->mds_idx].top_neighbor_depth == context_ptr->blk_geom->bsize) &&  (context_ptr->md_local_cu_unit[cu_ptr->mds_idx].left_neighbor_depth == context_ptr->blk_geom->bsize)) { // top and left neighbor depths both equal the current block size
    7281           0 :                 skip_sub_blocks = 1;
    7282           0 :                 context_ptr->md_cu_arr_nsq[context_ptr->blk_geom->sqi_mds].split_flag = 0; // clear the split flag of the entry at sqi_mds
    7283             :             }
    7284           0 :     return skip_sub_blocks;
    7285             : }
    7286             : 
    7287             : // Hsan (chroma search) : av1_get_tx_type() to define as extern
    7288       82392 : void search_best_independent_uv_mode(
    7289             :     PictureControlSet     *picture_control_set_ptr,
    7290             :     EbPictureBufferDesc   *input_picture_ptr,
    7291             :     uint32_t                 inputCbOriginIndex,
    7292             :     uint32_t                 cuChromaOriginIndex,
    7293             :     ModeDecisionContext   *context_ptr)
    7294             : {
    7295       82392 :     FrameHeader *frm_hdr = &picture_control_set_ptr->parent_pcs_ptr->frm_hdr;
    7296             :     // Start uv search path
    7297       82392 :     context_ptr->uv_search_path = EB_TRUE;
    7298             : #if !PAETH_HBD
    7299             :     uint8_t is_16_bit = (sequence_control_set_ptr->static_config.encoder_bit_depth > EB_8BIT);
    7300             : #endif
    7301       82392 :     EbBool use_angle_delta = av1_use_angle_delta(context_ptr->blk_geom->bsize);
    7302             : 
    7303             :     UvPredictionMode uv_mode;
    7304             : 
    7305             :     int coeff_rate[UV_PAETH_PRED + 1][(MAX_ANGLE_DELTA << 1) + 1];
    7306             :     int distortion[UV_PAETH_PRED + 1][(MAX_ANGLE_DELTA << 1) + 1];
    7307             : 
    7308             :     // Use the 1st spot of the candidate buffer to hold cfl settings to use same kernel as MD for coef cost estimation
    7309       82392 :     ModeDecisionCandidateBuffer  *candidate_buffer = &(context_ptr->candidate_buffer_ptr_array[0][0]);
    7310       82392 :     candidate_buffer->candidate_ptr = &(context_ptr->fast_candidate_array[0]);
    7311       82392 :     candidate_buffer->candidate_ptr->type = INTRA_MODE;
    7312       82392 :     candidate_buffer->candidate_ptr->distortion_ready = 0;
    7313       82392 :     candidate_buffer->candidate_ptr->use_intrabc = 0;
    7314       82392 :     candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_UV] = 0;
    7315             : 
    7316       82392 :     uint8_t uv_mode_start = UV_DC_PRED;
    7317             : #if PAETH_HBD
    7318       82392 :     uint8_t uv_mode_end =  UV_PAETH_PRED;
    7319             : #else
    7320             :     uint8_t uv_mode_end = is_16_bit ? UV_SMOOTH_H_PRED : UV_PAETH_PRED;
    7321             : #endif
    7322     1153420 :     for (uv_mode = uv_mode_start; uv_mode <= uv_mode_end; uv_mode++) {
    7323     1071050 :         uint8_t uv_angleDeltaCandidateCount = (use_angle_delta && av1_is_directional_mode((PredictionMode)uv_mode)) ? 7 : 1;
    7324     1071050 :         uint8_t uv_angle_delta_shift = 1;
    7325             : 
    7326     5126890 :         for (uint8_t uv_angleDeltaCounter = 0; uv_angleDeltaCounter < uv_angleDeltaCandidateCount; ++uv_angleDeltaCounter) {
    7327     4055860 :             int32_t uv_angle_delta = CLIP(uv_angle_delta_shift * (uv_angleDeltaCandidateCount == 1 ? 0 : uv_angleDeltaCounter - (uv_angleDeltaCandidateCount >> 1)), -MAX_ANGLE_DELTA, MAX_ANGLE_DELTA);
    7328             : #if RDOQ_CHROMA
    7329     4055860 :             candidate_buffer->candidate_ptr->pred_mode = DC_PRED;
    7330             : #endif
    7331     4055860 :             candidate_buffer->candidate_ptr->intra_chroma_mode = uv_mode;
    7332     4055860 :             candidate_buffer->candidate_ptr->is_directional_chroma_mode_flag = (uint8_t)av1_is_directional_mode((PredictionMode)uv_mode);
    7333     4055610 :             candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_UV] = uv_angle_delta;
    7334     4055610 :             candidate_buffer->candidate_ptr->tx_depth = 0;
    7335             : 
    7336     8111270 :             candidate_buffer->candidate_ptr->transform_type_uv =
    7337     4055610 :                 av1_get_tx_type(
    7338     4055610 :                     context_ptr->blk_geom->bsize,
    7339             :                     0,
    7340             :                     (PredictionMode)NULL,
    7341             :                     (UvPredictionMode)uv_mode,
    7342             :                     PLANE_TYPE_UV,
    7343             :                     0,
    7344             :                     0,
    7345             :                     0,
    7346     4055610 :                     context_ptr->blk_geom->txsize_uv[0][0],
    7347     4055610 :                     frm_hdr->reduced_tx_set);
    7348             : 
    7349     4055670 :             uint16_t  cb_qp = context_ptr->qp;
    7350     4055670 :             uint16_t  cr_qp = context_ptr->qp;
    7351     4055670 :             uint64_t cb_coeff_bits = 0;
    7352     4055670 :             uint64_t cr_coeff_bits = 0;
    7353     4055670 :             uint64_t cbFullDistortion[DIST_CALC_TOTAL] = { 0, 0 };
    7354     4055670 :             uint64_t crFullDistortion[DIST_CALC_TOTAL] = { 0, 0 };
    7355             : 
    7356             :             uint32_t count_non_zero_coeffs[3][MAX_NUM_OF_TU_PER_CU];
    7357     4055670 :             context_ptr->md_staging_skip_inter_chroma_pred = EB_FALSE;
    7358     4055670 :             ProductPredictionFunTable[candidate_buffer->candidate_ptr->type](
    7359             :                 context_ptr,
    7360             :                 picture_control_set_ptr,
    7361             :                 candidate_buffer);
    7362             : 
    7363             :             //Cb Residual
    7364     4055980 :             residual_kernel(
    7365             :                 input_picture_ptr->buffer_cb,
    7366             :                 inputCbOriginIndex,
    7367     4055980 :                 input_picture_ptr->stride_cb,
    7368     4055980 :                 candidate_buffer->prediction_ptr->buffer_cb,
    7369             :                 cuChromaOriginIndex,
    7370     4055980 :                 candidate_buffer->prediction_ptr->stride_cb,
    7371     4055980 :                 (int16_t*)candidate_buffer->residual_ptr->buffer_cb,
    7372             :                 cuChromaOriginIndex,
    7373     4055980 :                 candidate_buffer->residual_ptr->stride_cb,
    7374     4055980 :                 context_ptr->hbd_mode_decision,
    7375     4055980 :                 context_ptr->blk_geom->bwidth_uv,
    7376     4055980 :                 context_ptr->blk_geom->bheight_uv);
    7377             : 
    7378             :             //Cr Residual
    7379     4055780 :             residual_kernel(
    7380             :                 input_picture_ptr->buffer_cr,
    7381             :                 inputCbOriginIndex,
    7382     4055780 :                 input_picture_ptr->stride_cr,
    7383     4055780 :                 candidate_buffer->prediction_ptr->buffer_cr,
    7384             :                 cuChromaOriginIndex,
    7385     4055780 :                 candidate_buffer->prediction_ptr->stride_cr,
    7386     4055780 :                 (int16_t*)candidate_buffer->residual_ptr->buffer_cr,
    7387             :                 cuChromaOriginIndex,
    7388     4055780 :                 candidate_buffer->residual_ptr->stride_cr,
    7389     4055780 :                 context_ptr->hbd_mode_decision,
    7390     4055780 :                 context_ptr->blk_geom->bwidth_uv,
    7391     4055780 :                 context_ptr->blk_geom->bheight_uv);
    7392             : 
    7393     4055780 :             full_loop_r(
    7394             :                 context_ptr->sb_ptr,
    7395             :                 candidate_buffer,
    7396             :                 context_ptr,
    7397             :                 input_picture_ptr,
    7398             :                 picture_control_set_ptr,
    7399             :                 PICTURE_BUFFER_DESC_CHROMA_MASK,
    7400             :                 cb_qp,
    7401             :                 cr_qp,
    7402             :                 &(*count_non_zero_coeffs[1]),
    7403             :                 &(*count_non_zero_coeffs[2]));
    7404             : 
    7405     4055780 :             cu_full_distortion_fast_tu_mode_r(
    7406             :                 context_ptr->sb_ptr,
    7407             :                 candidate_buffer,
    7408             :                 context_ptr,
    7409             :                 candidate_buffer->candidate_ptr,
    7410             :                 picture_control_set_ptr,
    7411             :                 input_picture_ptr,
    7412             :                 cbFullDistortion,
    7413             :                 crFullDistortion,
    7414             :                 count_non_zero_coeffs,
    7415             :                 COMPONENT_CHROMA,
    7416             :                 &cb_coeff_bits,
    7417             :                 &cr_coeff_bits,
    7418             :                 1);
    7419             : 
    7420     4055840 :             coeff_rate[uv_mode][MAX_ANGLE_DELTA + uv_angle_delta] = (int)(cb_coeff_bits + cr_coeff_bits);
    7421     4055840 :             distortion[uv_mode][MAX_ANGLE_DELTA + uv_angle_delta] = (int)(cbFullDistortion[DIST_CALC_RESIDUAL] + crFullDistortion[DIST_CALC_RESIDUAL]);
    7422             :         }
    7423             :     }
    7424             : 
    7425       82371 :     uint8_t intra_mode_start = DC_PRED;
    7426             : #if PAETH_HBD
    7427       82371 :     uint8_t intra_mode_end =  PAETH_PRED;
    7428             : #else
    7429             :     uint8_t intra_mode_end = is_16_bit ? SMOOTH_H_PRED : PAETH_PRED;
    7430             : #endif
    7431             :     // Loop over all intra modes, then over all uv modes, to derive the best uv mode for a given intra mode in terms of rate
    7432      818838 :     for (uint8_t intra_mode = intra_mode_start; intra_mode <= intra_mode_end; ++intra_mode) {
    7433     1071060 :         uint8_t angleDeltaCandidateCount = (use_angle_delta && av1_is_directional_mode((PredictionMode)intra_mode)) ? 7 : 1;
    7434     1071050 :         uint8_t angle_delta_shift = 1;
    7435             : 
    7436     4792210 :         for (uint8_t angleDeltaCounter = 0; angleDeltaCounter < angleDeltaCandidateCount; ++angleDeltaCounter) {
    7437     4055740 :             int32_t angle_delta = CLIP(angle_delta_shift * (angleDeltaCandidateCount == 1 ? 0 : angleDeltaCounter - (angleDeltaCandidateCount >> 1)), -MAX_ANGLE_DELTA, MAX_ANGLE_DELTA);
    7438             : 
    7439     4055740 :             candidate_buffer->candidate_ptr->type = INTRA_MODE;
    7440     4055740 :             candidate_buffer->candidate_ptr->intra_luma_mode = intra_mode;
    7441     4055740 :             candidate_buffer->candidate_ptr->distortion_ready = 0;
    7442     4055740 :             candidate_buffer->candidate_ptr->use_intrabc = 0;
    7443     4055740 :             candidate_buffer->candidate_ptr->is_directional_mode_flag = (uint8_t)av1_is_directional_mode((PredictionMode)intra_mode);
    7444     4055520 :             candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_Y] = angle_delta;
    7445     4055520 :             candidate_buffer->candidate_ptr->cfl_alpha_signs = 0;
    7446     4055520 :             candidate_buffer->candidate_ptr->cfl_alpha_idx = 0;
    7447             :             // This kernel assumes no atb
    7448     4055520 :             candidate_buffer->candidate_ptr->transform_type[0] = DCT_DCT;
    7449     4055520 :             candidate_buffer->candidate_ptr->ref_frame_type = INTRA_FRAME;
    7450     4055520 :             candidate_buffer->candidate_ptr->pred_mode = (PredictionMode)intra_mode;
    7451     4055520 :             candidate_buffer->candidate_ptr->motion_mode = SIMPLE_TRANSLATION;
    7452             : 
    7453             :             //int32_t  p_angle = mode_to_angle_map[(PredictionMode)openLoopIntraCandidate] + angle_delta * ANGLE_STEP;
    7454             :             //if (!disable_z2_prediction || (p_angle <= 90 || p_angle >= 180)) {
    7455             :             // uv mode loop
    7456     4055520 :             context_ptr->best_uv_cost[intra_mode][MAX_ANGLE_DELTA + angle_delta] = (uint64_t)~0;
    7457    56384800 :             for (uv_mode = uv_mode_start; uv_mode <= uv_mode_end; uv_mode++) {
    7458    52663700 :                 uint8_t uv_angleDeltaCandidateCount = (use_angle_delta && av1_is_directional_mode((PredictionMode)uv_mode)) ? 7 : 1;
    7459    52660900 :                 uint8_t uv_angle_delta_shift = 1;
    7460             : 
    7461   285010000 :                 for (uint8_t uv_angleDeltaCounter = 0; uv_angleDeltaCounter < uv_angleDeltaCandidateCount; ++uv_angleDeltaCounter) {
    7462   232680000 :                     int32_t uv_angle_delta = CLIP(uv_angle_delta_shift * (uv_angleDeltaCandidateCount == 1 ? 0 : uv_angleDeltaCounter - (uv_angleDeltaCandidateCount >> 1)), -MAX_ANGLE_DELTA, MAX_ANGLE_DELTA);
    7463             : 
    7464   232680000 :                     candidate_buffer->candidate_ptr->intra_chroma_mode = uv_mode;
    7465   232680000 :                     candidate_buffer->candidate_ptr->is_directional_chroma_mode_flag = (uint8_t)av1_is_directional_mode((PredictionMode)uv_mode);
    7466   232326000 :                     candidate_buffer->candidate_ptr->angle_delta[PLANE_TYPE_UV] = uv_angle_delta;
    7467             : 
    7468   464786000 :                     candidate_buffer->candidate_ptr->transform_type_uv =
    7469   232326000 :                         av1_get_tx_type(
    7470   232326000 :                             context_ptr->blk_geom->bsize,
    7471             :                             0,
    7472   232326000 :                             (PredictionMode)candidate_buffer->candidate_ptr->intra_luma_mode,
    7473   232326000 :                             (UvPredictionMode)candidate_buffer->candidate_ptr->intra_chroma_mode,
    7474             :                             PLANE_TYPE_UV,
    7475             :                             0,
    7476             :                             0,
    7477             :                             0,
    7478   232326000 :                             context_ptr->blk_geom->txsize_uv[0][0],
    7479   232326000 :                             frm_hdr->reduced_tx_set);
    7480             : 
    7481             :                     // Fast Cost
    7482   464809000 :                     *(candidate_buffer->fast_cost_ptr) = Av1ProductFastCostFuncTable[candidate_buffer->candidate_ptr->type](
    7483             :                         context_ptr->cu_ptr,
    7484   232460000 :                         candidate_buffer->candidate_ptr,
    7485   232460000 :                         context_ptr->qp,
    7486             :                         0,
    7487             :                         0,
    7488             :                         0,
    7489             :                         0,
    7490             :                         picture_control_set_ptr,
    7491   232460000 :                         &(context_ptr->md_local_cu_unit[context_ptr->blk_geom->blkidx_mds].ed_ref_mv_stack[candidate_buffer->candidate_ptr->ref_frame_type][0]),
    7492             :                         context_ptr->blk_geom,
    7493   232460000 :                         context_ptr->cu_origin_y >> MI_SIZE_LOG2,
    7494   232460000 :                         context_ptr->cu_origin_x >> MI_SIZE_LOG2,
    7495             :                         1,
    7496   232460000 :                         context_ptr->intra_luma_left_mode,
    7497   232460000 :                         context_ptr->intra_luma_top_mode);
    7498             : 
    7499   232349000 :                     uint64_t rate = coeff_rate[uv_mode][MAX_ANGLE_DELTA + uv_angle_delta] + candidate_buffer->candidate_ptr->fast_luma_rate + candidate_buffer->candidate_ptr->fast_chroma_rate;
    7500   232349000 :                     uint64_t uv_cost = RDCOST(context_ptr->full_lambda, rate, distortion[uv_mode][MAX_ANGLE_DELTA + uv_angle_delta]);
    7501             : 
    7502   232349000 :                     if (uv_cost < context_ptr->best_uv_cost[intra_mode][MAX_ANGLE_DELTA + angle_delta]) {
    7503    12342800 :                         context_ptr->best_uv_mode[intra_mode][MAX_ANGLE_DELTA + angle_delta] = uv_mode;
    7504    12342800 :                         context_ptr->best_uv_angle[intra_mode][MAX_ANGLE_DELTA + angle_delta] = uv_angle_delta;
    7505             : 
    7506    12342800 :                         context_ptr->best_uv_cost[intra_mode][MAX_ANGLE_DELTA + angle_delta] = uv_cost;
    7507    12342800 :                         context_ptr->fast_luma_rate[intra_mode][MAX_ANGLE_DELTA + angle_delta] = candidate_buffer->candidate_ptr->fast_luma_rate;
    7508    12342800 :                         context_ptr->fast_chroma_rate[intra_mode][MAX_ANGLE_DELTA + angle_delta] = candidate_buffer->candidate_ptr->fast_chroma_rate;
    7509             :                     }
    7510             :                 }
    7511             :             }
    7512             :         }
    7513             :     }
    7514             : 
    7515             :     // End uv search path
    7516           0 :     context_ptr->uv_search_path = EB_FALSE;
    7517           0 : }
    7518             : #if SPEED_OPT
    7519             : #if !REMOVE_MD_STAGE_1
    7520             : void inter_class_decision_count_1(
    7521             :     struct ModeDecisionContext   *context_ptr
    7522             : )
    7523             : {
    7524             :     ModeDecisionCandidateBuffer **buffer_ptr_array = context_ptr->candidate_buffer_ptr_array;
    7525             :     // Distortion-based NIC pruning not applied to INTRA clases: CLASS_0 and CLASS
    7526             :     for (CAND_CLASS cand_class_it = CAND_CLASS_1; cand_class_it <= CAND_CLASS_3; cand_class_it++) {
    7527             :         if (context_ptr->md_stage_0_count[cand_class_it] > 0 && context_ptr->md_stage_1_count[cand_class_it] > 0) {
    7528             :             uint32_t *cand_buff_indices = context_ptr->cand_buff_indices[cand_class_it];
    7529             :             if (*(buffer_ptr_array[cand_buff_indices[0]]->fast_cost_ptr) < *(buffer_ptr_array[context_ptr->cand_buff_indices[CAND_CLASS_0][0]]->fast_cost_ptr)) {
    7530             :                 uint32_t fast1_cand_count = 1;
    7531             :                 while (fast1_cand_count < context_ptr->md_stage_1_count[cand_class_it] && ((((*(buffer_ptr_array[cand_buff_indices[fast1_cand_count]]->fast_cost_ptr) - *(buffer_ptr_array[cand_buff_indices[0]]->fast_cost_ptr)) * 100) / (*(buffer_ptr_array[cand_buff_indices[0]]->fast_cost_ptr))) < context_ptr->dist_base_md_stage_0_count_th)) {
    7532             :                     fast1_cand_count++;
    7533             :                 }
    7534             :                 context_ptr->md_stage_1_count[cand_class_it] = fast1_cand_count;
    7535             :             }
    7536             :         }
    7537             :     }
    7538             : }
    7539             : #endif
    7540             : extern aom_variance_fn_ptr_t mefn_ptr[BlockSizeS_ALL];
    7541             : unsigned int eb_av1_get_sby_perpixel_variance(const aom_variance_fn_ptr_t *fn_ptr, const uint8_t *src, int stride, BlockSize bs);
    7542             : #endif
    7543             : 
    7544             : #if INTER_INTRA_CLASS_PRUNING
    7545             : 
    7546      811372 : void interintra_class_pruning_1(ModeDecisionContext *context_ptr, uint64_t best_md_stage_cost) {
    7547             :     
    7548     8113460 :     for (CAND_CLASS cand_class_it = CAND_CLASS_0; cand_class_it < CAND_CLASS_TOTAL; cand_class_it++) {
    7549     7302080 :         if (context_ptr->md_stage_0_count[cand_class_it] > 0 && context_ptr->md_stage_1_count[cand_class_it] > 0) {
    7550     3306330 :             uint32_t *cand_buff_indices = context_ptr->cand_buff_indices[cand_class_it];
    7551     3306330 :             uint64_t class_best_cost = *(context_ptr->candidate_buffer_ptr_array[cand_buff_indices[0]]->fast_cost_ptr);
    7552             : 
    7553             :             // inter class pruning
    7554     3306330 :             if ((((class_best_cost - best_md_stage_cost) * 100) / best_md_stage_cost) > context_ptr->md_stage_1_class_prune_th){
    7555           0 :                 context_ptr->md_stage_1_count[cand_class_it] = 0;
    7556           0 :                 continue;
    7557             :             }
    7558             :             // intra class pruning
    7559     3306330 :             uint32_t cand_count = 1;
    7560    33974000 :             while (cand_count < context_ptr->md_stage_1_count[cand_class_it] && ((((*(context_ptr->candidate_buffer_ptr_array[cand_buff_indices[cand_count]]->fast_cost_ptr) - class_best_cost) * 100) / class_best_cost) < context_ptr->md_stage_1_cand_prune_th)) {
    7561    30667700 :                 cand_count++;
    7562             :             }
    7563     3306330 :             context_ptr->md_stage_1_count[cand_class_it] = cand_count;
    7564             :         }
    7565     7302080 :         context_ptr->md_stage_1_total_count += context_ptr->md_stage_1_count[cand_class_it];
    7566             :     }
    7567      811372 : }
    7568             : 
    7569      811413 : void interintra_class_pruning_2(ModeDecisionContext *context_ptr, uint64_t best_md_stage_cost) {
    7570             :     
    7571     8113690 :     for (CAND_CLASS cand_class_it = CAND_CLASS_0; cand_class_it < CAND_CLASS_TOTAL; cand_class_it++) {
    7572     7302280 :         if (context_ptr->md_stage_1_count[cand_class_it] > 0 && context_ptr->md_stage_2_count[cand_class_it] > 0 && context_ptr->bypass_md_stage_1[cand_class_it] == EB_FALSE) {
    7573     3160810 :             uint32_t *cand_buff_indices = context_ptr->cand_buff_indices[cand_class_it];
    7574     3160810 :             uint64_t class_best_cost = *(context_ptr->candidate_buffer_ptr_array[cand_buff_indices[0]]->full_cost_ptr);
    7575             : 
    7576             :             // inter class pruning
    7577     3160810 :             if ((((class_best_cost - best_md_stage_cost) * 100) / best_md_stage_cost) > context_ptr->md_stage_2_class_prune_th) {
    7578     1650830 :                 context_ptr->md_stage_2_count[cand_class_it] = 0;
    7579     1650830 :                 continue;
    7580             :             }
    7581             : 
    7582             :             // intra class pruning
    7583     1509980 :             uint32_t cand_count = 1;
    7584     3776060 :             while (cand_count < context_ptr->md_stage_2_count[cand_class_it] && ((((*(context_ptr->candidate_buffer_ptr_array[cand_buff_indices[cand_count]]->full_cost_ptr) - class_best_cost) * 100) / class_best_cost) < context_ptr->md_stage_2_cand_prune_th)) {
    7585     2266080 :                 cand_count++;
    7586             :             }
    7587     1509980 :             context_ptr->md_stage_2_count[cand_class_it] = cand_count;
    7588             :         }
    7589     5651450 :         context_ptr->md_stage_2_total_count += context_ptr->md_stage_2_count[cand_class_it];
    7590             :     }
    7591      811413 : }
    7592             : 
    7593             : #endif
    7594             : 
    7595      811401 : void md_encode_block(
    7596             :     SequenceControlSet             *sequence_control_set_ptr,
    7597             :     PictureControlSet              *picture_control_set_ptr,
    7598             :     ModeDecisionContext            *context_ptr,
    7599             :     EbPictureBufferDesc            *input_picture_ptr,
    7600             :     SsMeContext                    *ss_mecontext,
    7601             :     uint8_t                        *skip_sub_blocks,
    7602             :     uint32_t                        lcuAddr,
    7603             :     ModeDecisionCandidateBuffer    *bestcandidate_buffers[5])
    7604             : {
    7605      811401 :     ModeDecisionCandidateBuffer  **candidate_buffer_ptr_array_base = context_ptr->candidate_buffer_ptr_array;
    7606             :     ModeDecisionCandidateBuffer  **candidate_buffer_ptr_array;
    7607      811401 :     const BlockGeom               *blk_geom = context_ptr->blk_geom;
    7608             :     ModeDecisionCandidateBuffer   *candidate_buffer;
    7609      811401 :     ModeDecisionCandidate         *fast_candidate_array = context_ptr->fast_candidate_array;
    7610             :     uint32_t                       candidate_index;
    7611             :     uint32_t                       fast_candidate_total_count;
    7612      811401 :     uint32_t                       best_intra_mode = EB_INTRA_MODE_INVALID;
    7613      811401 :     const uint32_t                 inputOriginIndex = (context_ptr->cu_origin_y + input_picture_ptr->origin_y) * input_picture_ptr->stride_y + (context_ptr->cu_origin_x + input_picture_ptr->origin_x);
    7614             : 
    7615      811401 :     const uint32_t inputCbOriginIndex = ((context_ptr->round_origin_y >> 1) + (input_picture_ptr->origin_y >> 1)) * input_picture_ptr->stride_cb + ((context_ptr->round_origin_x >> 1) + (input_picture_ptr->origin_x >> 1));
    7616      811401 :     const uint32_t cuOriginIndex = blk_geom->origin_x + blk_geom->origin_y * SB_STRIDE_Y;
    7617      811401 :     const uint32_t cuChromaOriginIndex = ROUND_UV(blk_geom->origin_x) / 2 + ROUND_UV(blk_geom->origin_y) / 2 * SB_STRIDE_UV;
    7618      811401 :     CodingUnit *  cu_ptr = context_ptr->cu_ptr;
    7619      811401 :     candidate_buffer_ptr_array = &(candidate_buffer_ptr_array_base[0]);
    7620    25150500 :     for (uint8_t ref_idx = 0; ref_idx < MAX_REF_TYPE_CAND; ref_idx++)
    7621    24339100 :         context_ptr->ref_best_cost_sq_table[ref_idx] = MAX_CU_COST;
    7622             : 
    7623             : #if PREDICT_NSQ_SHAPE
    7624     2326580 :     EbBool is_nsq_table_used = (picture_control_set_ptr->slice_type == !I_SLICE &&
    7625      703778 :         picture_control_set_ptr->parent_pcs_ptr->pic_depth_mode <= PIC_ALL_C_DEPTH_MODE &&
    7626      637579 :         picture_control_set_ptr->parent_pcs_ptr->nsq_search_level >= NSQ_SEARCH_LEVEL1 &&
    7627     1515180 :         picture_control_set_ptr->parent_pcs_ptr->nsq_search_level < NSQ_SEARCH_FULL) ? EB_TRUE : EB_FALSE;
    7628             : 
    7629      811401 :     is_nsq_table_used = picture_control_set_ptr->parent_pcs_ptr->sc_content_detected || picture_control_set_ptr->enc_mode == ENC_M0 ? EB_FALSE : is_nsq_table_used;
    7630             : #if ADJUST_NSQ_RANK_BASED_ON_NEIGH
    7631      811401 :     if (is_nsq_table_used) {
    7632           0 :         if (context_ptr->blk_geom->shape == PART_N) {
    7633             : #if MDC_ADAPTIVE_LEVEL
    7634           0 :             if (picture_control_set_ptr->parent_pcs_ptr->enable_adaptive_ol_partitioning) {
    7635             : #else
    7636             :             if (picture_control_set_ptr->parent_pcs_ptr->mdc_depth_level < MAX_MDC_LEVEL) {
    7637             : #endif
    7638           0 :                 adjust_nsq_rank(
    7639             :                     picture_control_set_ptr,
    7640             :                     context_ptr,
    7641             :                     sequence_control_set_ptr,
    7642             :                     context_ptr->sb_ptr,
    7643             :                     context_ptr->leaf_partition_neighbor_array);
    7644             :             }
    7645             :             else {
    7646           0 :                 order_nsq_table(
    7647             :                     picture_control_set_ptr,
    7648             :                     context_ptr,
    7649             :                     sequence_control_set_ptr,
    7650             :                     context_ptr->sb_ptr,
    7651             :                     context_ptr->leaf_partition_neighbor_array);
    7652             :             }
    7653             :         }
    7654             :     }
    7655             : #endif
    7656             : #else
    7657             :     EbBool is_nsq_table_used = (picture_control_set_ptr->slice_type == !I_SLICE &&
    7658             :         picture_control_set_ptr->parent_pcs_ptr->pic_depth_mode <= PIC_ALL_C_DEPTH_MODE &&
    7659             :         picture_control_set_ptr->parent_pcs_ptr->nsq_search_level >= NSQ_SEARCH_LEVEL1 &&
    7660             :         picture_control_set_ptr->parent_pcs_ptr->nsq_search_level < NSQ_SEARCH_FULL) ? EB_TRUE : EB_FALSE;
    7661             : 
    7662             :     is_nsq_table_used = picture_control_set_ptr->enc_mode == ENC_M0 ?  EB_FALSE : is_nsq_table_used;
    7663             :     if (is_nsq_table_used) {
    7664             :         if (context_ptr->blk_geom->shape == PART_N) {
    7665             :             order_nsq_table(
    7666             :                 picture_control_set_ptr,
    7667             :                 context_ptr,
    7668             :                 sequence_control_set_ptr,
    7669             :                 context_ptr->sb_ptr,
    7670             :                 context_ptr->leaf_partition_neighbor_array);
    7671             :         }
    7672             :     }
    7673             : #endif
    7674             : 
    7675      811401 :     uint8_t                            is_complete_sb = sequence_control_set_ptr->sb_geom[lcuAddr].is_complete_sb;
    7676             : 
    7677      811414 :     if (allowed_ns_cu(
    7678             : #if COMBINE_MDC_NSQ_TABLE
    7679             : #if MDC_ADAPTIVE_LEVEL
    7680      811401 :         picture_control_set_ptr->parent_pcs_ptr->enable_adaptive_ol_partitioning,
    7681             : #else
    7682             :         picture_control_set_ptr->parent_pcs_ptr->mdc_depth_level,
    7683             : #endif
    7684             : #endif
    7685      811401 :         is_nsq_table_used, picture_control_set_ptr->parent_pcs_ptr->nsq_max_shapes_md, context_ptr, is_complete_sb))
    7686             :     {
    7687             : 
    7688             : #if SPEED_OPT
    7689      811414 :         const aom_variance_fn_ptr_t *fn_ptr = &mefn_ptr[context_ptr->blk_geom->bsize];
    7690      811414 :         context_ptr->source_variance = eb_av1_get_sby_perpixel_variance(fn_ptr, (input_picture_ptr->buffer_y + inputOriginIndex), input_picture_ptr->stride_y, context_ptr->blk_geom->bsize);
    7691             : #endif
    7692             : 
    7693      811422 :         cu_ptr->av1xd->tile.mi_col_start = context_ptr->sb_ptr->tile_info.mi_col_start;
    7694      811422 :         cu_ptr->av1xd->tile.mi_col_end = context_ptr->sb_ptr->tile_info.mi_col_end;
    7695      811422 :         cu_ptr->av1xd->tile.mi_row_start = context_ptr->sb_ptr->tile_info.mi_row_start;
    7696      811422 :         cu_ptr->av1xd->tile.mi_row_end = context_ptr->sb_ptr->tile_info.mi_row_end;
    7697             : 
    7698      811422 :         ProductCodingLoopInitFastLoop(
    7699             :             context_ptr,
    7700             :             context_ptr->skip_coeff_neighbor_array,
    7701             :             context_ptr->inter_pred_dir_neighbor_array,
    7702             :             context_ptr->ref_frame_type_neighbor_array,
    7703             :             context_ptr->intra_luma_mode_neighbor_array,
    7704             :             context_ptr->skip_flag_neighbor_array,
    7705             :             context_ptr->mode_type_neighbor_array,
    7706             :             context_ptr->leaf_depth_neighbor_array,
    7707             :             context_ptr->leaf_partition_neighbor_array);
    7708             :          // Skip sub blocks if the current block has the same depth as the left block and above block
    7709      811418 :         if (picture_control_set_ptr->parent_pcs_ptr->skip_sub_blks)
    7710           0 :             *skip_sub_blocks =check_skip_sub_blks(picture_control_set_ptr,
    7711             :                                                   context_ptr,
    7712             :                                                   cu_ptr,
    7713             :                                                   is_complete_sb,
    7714             :                                                   lcuAddr);
    7715             : 
    7716             :         // Initialize uv_search_path
    7717      811418 :         context_ptr->uv_search_path = EB_FALSE;
    7718             :         // Search the best independent intra chroma mode
    7719      811418 :         if (context_ptr->chroma_level == CHROMA_MODE_0) {
    7720      124200 :             if (context_ptr->blk_geom->sq_size < 128) {
    7721      124200 :                 if (context_ptr->blk_geom->has_uv) {
    7722       82392 :                     search_best_independent_uv_mode(
    7723             :                         picture_control_set_ptr,
    7724             :                         input_picture_ptr,
    7725             :                         inputCbOriginIndex,
    7726             :                         cuChromaOriginIndex,
    7727             :                         context_ptr);
    7728             :                 }
    7729             :             }
    7730             :         }
    7731             : 
    7732      811419 :         FrameHeader *frm_hdr = &picture_control_set_ptr->parent_pcs_ptr->frm_hdr;
    7733      811419 :         context_ptr->geom_offset_x = 0;
    7734      811419 :         context_ptr->geom_offset_y = 0;
    7735             : 
    7736      811419 :         if (sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128) {
    7737           0 :             uint32_t me_sb_size = sequence_control_set_ptr->sb_sz;
    7738           0 :             uint32_t me_pic_width_in_sb = (sequence_control_set_ptr->seq_header.max_frame_width + sequence_control_set_ptr->sb_sz - 1) / me_sb_size;
    7739           0 :             uint32_t me_sb_x = (context_ptr->cu_origin_x / me_sb_size);
    7740           0 :             uint32_t me_sb_y = (context_ptr->cu_origin_y / me_sb_size);
    7741           0 :             context_ptr->me_sb_addr = me_sb_x + me_sb_y * me_pic_width_in_sb;
    7742           0 :             context_ptr->geom_offset_x = (me_sb_x & 0x1) * me_sb_size;
    7743           0 :             context_ptr->geom_offset_y = (me_sb_y & 0x1) * me_sb_size;
    7744             :         }
    7745             :         else
    7746      811419 :             context_ptr->me_sb_addr = lcuAddr;
    7747             : 
    7748      811402 :         context_ptr->me_block_offset =
    7749      640590 :             (context_ptr->blk_geom->bwidth == 4 || context_ptr->blk_geom->bheight == 4 || context_ptr->blk_geom->bwidth == 128 || context_ptr->blk_geom->bheight == 128) ?
    7750      950339 :             0 :
    7751      501670 :             get_me_info_index(picture_control_set_ptr->parent_pcs_ptr->max_number_of_pus_per_sb, context_ptr->blk_geom, context_ptr->geom_offset_x, context_ptr->geom_offset_y);
    7752             : 
    7753             :         // Generate MVP(s)
    7754      811402 :         if (frm_hdr->allow_intrabc) // picture_control_set_ptr->slice_type == I_SLICE
    7755           0 :             generate_av1_mvp_table(
    7756           0 :                 &context_ptr->sb_ptr->tile_info,
    7757             :                 context_ptr,
    7758             :                 context_ptr->cu_ptr,
    7759             :                 context_ptr->blk_geom,
    7760           0 :                 context_ptr->cu_origin_x,
    7761           0 :                 context_ptr->cu_origin_y,
    7762           0 :                 picture_control_set_ptr->parent_pcs_ptr->ref_frame_type_arr,
    7763             :                 1,
    7764             :                 picture_control_set_ptr);
    7765      811402 :         else if (picture_control_set_ptr->slice_type != I_SLICE)
    7766      745162 :             generate_av1_mvp_table(
    7767      745162 :                 &context_ptr->sb_ptr->tile_info,
    7768             :                 context_ptr,
    7769             :                 context_ptr->cu_ptr,
    7770             :                 context_ptr->blk_geom,
    7771      745162 :                 context_ptr->cu_origin_x,
    7772      745162 :                 context_ptr->cu_origin_y,
    7773      745162 :                 picture_control_set_ptr->parent_pcs_ptr->ref_frame_type_arr,
    7774      745162 :                 picture_control_set_ptr->parent_pcs_ptr->tot_ref_frame_types,
    7775             :                 picture_control_set_ptr);
    7776             : 
    7777             :         // Perform ME search around the best MVP
    7778      811363 :         if (context_ptr->predictive_me_level)
    7779      675250 :             predictive_me_search(
    7780             :                 picture_control_set_ptr,
    7781             :                 context_ptr,
    7782             :                 input_picture_ptr,
    7783             :                 inputOriginIndex,
    7784             :                 cuOriginIndex);
    7785             : 
    7786             : #if II_COMP_FLAG
    7787             :         //for every CU, perform Luma DC/V/H/S intra prediction to be used later in inter-intra search
    7788      811366 :         int allow_ii = is_interintra_allowed_bsize(context_ptr->blk_geom->bsize);
    7789      811360 :         if (picture_control_set_ptr->parent_pcs_ptr->enable_inter_intra && allow_ii)
    7790      295933 :             precompute_intra_pred_for_inter_intra(
    7791             :                 picture_control_set_ptr,
    7792             :                 context_ptr);
    7793             : #endif
    7794             : 
    7795      811357 :         generate_md_stage_0_cand(
    7796             :             context_ptr->sb_ptr,
    7797             :             context_ptr,
    7798             :             ss_mecontext,
    7799             :             &fast_candidate_total_count,
    7800             :             picture_control_set_ptr);
    7801             : 
    7802             :         //MD Stages
    7803             :         //The first stage(old fast loop) and the last stage(old full loop) should remain at their locations, new stages could be created between those two.
    7804             :         //a bypass mechanism should be added to skip one or all of the intermediate stages, in a way to to be able to fall back to org design (FastLoop->FullLoop)
    7805      811355 :         set_md_stage_counts(
    7806             :             picture_control_set_ptr,
    7807             :             context_ptr,
    7808             :             fast_candidate_total_count);
    7809             : 
    7810             :         CAND_CLASS  cand_class_it;
    7811      811052 :         uint32_t buffer_start_idx = 0;
    7812             :         uint32_t buffer_count_for_curr_class;
    7813      811052 :         uint32_t buffer_total_count = 0;
    7814             : #if REMOVE_MD_STAGE_1
    7815      811052 :         context_ptr->md_stage_1_total_count = 0;
    7816      811052 :         context_ptr->md_stage_2_total_count = 0;
    7817             : #else
    7818             :         context_ptr->md_stage_2_total_count = 0;
    7819             :         context_ptr->md_stage_3_total_count = 0;
    7820             : 
    7821             :         context_ptr->md_stage = MD_STAGE_0;
    7822             : #endif
    7823             : #if INTER_INTRA_CLASS_PRUNING
    7824      811052 :         uint64_t best_md_stage_cost = (uint64_t)~0;
    7825             : #endif
    7826     8112060 :         for (cand_class_it = CAND_CLASS_0; cand_class_it < CAND_CLASS_TOTAL; cand_class_it++) {
    7827             : 
    7828             :             //number of next level candidates could not exceed number of curr level candidates
    7829     7300640 :             context_ptr->md_stage_1_count[cand_class_it] = MIN(context_ptr->md_stage_0_count[cand_class_it], context_ptr->md_stage_1_count[cand_class_it]);
    7830             : 
    7831     7300640 :             if (context_ptr->md_stage_0_count[cand_class_it] > 0 && context_ptr->md_stage_1_count[cand_class_it] > 0) {
    7832             : 
    7833     3305860 :                 buffer_count_for_curr_class = context_ptr->md_stage_0_count[cand_class_it] > context_ptr->md_stage_1_count[cand_class_it] ? (context_ptr->md_stage_1_count[cand_class_it] + 1) : context_ptr->md_stage_1_count[cand_class_it];
    7834             : 
    7835     3305860 :                 buffer_total_count += buffer_count_for_curr_class;
    7836     3305860 :                 assert(buffer_total_count <= MAX_NFL_BUFF && "not enough cand buffers");
    7837             : 
    7838             :                 //Input: md_stage_0_count[cand_class_it]  Output:  md_stage_1_count[cand_class_it]
    7839     3305860 :                 context_ptr->target_class = cand_class_it;
    7840             : 
    7841     3305860 :                 md_stage_0(
    7842             :                     picture_control_set_ptr,
    7843             :                     context_ptr,
    7844             :                     candidate_buffer_ptr_array_base,
    7845             :                     fast_candidate_array,
    7846             :                     0,
    7847     3305860 :                     fast_candidate_total_count - 1,
    7848             :                     input_picture_ptr,
    7849             :                     inputOriginIndex,
    7850             :                     inputCbOriginIndex,
    7851             :                     inputCbOriginIndex,
    7852             :                     cu_ptr,
    7853             :                     cuOriginIndex,
    7854             :                     cuChromaOriginIndex,
    7855             :                     buffer_start_idx,
    7856             :                     buffer_count_for_curr_class,
    7857     3305860 :                     context_ptr->md_stage_0_count[cand_class_it] > context_ptr->md_stage_1_count[cand_class_it],  //is there need to max the temp buffer
    7858             :                     0);
    7859             : 
    7860             :                 //Sort:  md_stage_1_count[cand_class_it]
    7861     3305790 :                 memset(context_ptr->cand_buff_indices[cand_class_it], 0xFFFFFFFF, MAX_NFL_BUFF * sizeof(uint32_t));
    7862     3305790 :                 sort_stage0_fast_candidates(
    7863             :                     context_ptr,
    7864             :                     buffer_start_idx,
    7865             :                     buffer_count_for_curr_class, //how many cand buffers to sort. one of the buffers can have max cost.
    7866     3305790 :                     context_ptr->cand_buff_indices[cand_class_it]);
    7867             : 
    7868             : #if INTER_INTRA_CLASS_PRUNING
    7869     3306220 :                 uint32_t *cand_buff_indices = context_ptr->cand_buff_indices[cand_class_it];
    7870     3306220 :                 best_md_stage_cost = MIN((*(context_ptr->candidate_buffer_ptr_array[cand_buff_indices[0]]->fast_cost_ptr)), best_md_stage_cost);
    7871             : #else
    7872             : #if REMOVE_MD_STAGE_1
    7873             :                 // Distortion-based NIC pruning to CLASS_1, CLASS_2, CLASS_3
    7874             :                 if (cand_class_it == CAND_CLASS_1 || cand_class_it == CAND_CLASS_2 || cand_class_it == CAND_CLASS_3) {
    7875             :                     uint32_t *cand_buff_indices = context_ptr->cand_buff_indices[cand_class_it];
    7876             :                     assert(context_ptr->md_stage_0_count[CAND_CLASS_0] > 0);
    7877             :                     if (context_ptr->md_stage_0_count[CAND_CLASS_0] > 0 && *(context_ptr->candidate_buffer_ptr_array[cand_buff_indices[0]]->fast_cost_ptr) <
    7878             :                         *(context_ptr->candidate_buffer_ptr_array[context_ptr->cand_buff_indices[CAND_CLASS_0][0]]->fast_cost_ptr)) {
    7879             :                         uint32_t fast1_cand_count = 1;
    7880             :                         while (fast1_cand_count < context_ptr->md_stage_1_count[cand_class_it] && ((((*(context_ptr->candidate_buffer_ptr_array[cand_buff_indices[fast1_cand_count]]->fast_cost_ptr) - *(context_ptr->candidate_buffer_ptr_array[cand_buff_indices[0]]->fast_cost_ptr)) * 100) / (*(context_ptr->candidate_buffer_ptr_array[cand_buff_indices[0]]->fast_cost_ptr))) < context_ptr->dist_base_md_stage_0_count_th)) {
    7881             :                             fast1_cand_count++;
    7882             :                         }
    7883             :                         context_ptr->md_stage_1_count[cand_class_it] = fast1_cand_count;
    7884             :                     }
    7885             :                 }
    7886             : #endif
    7887             : #endif
    7888             : 
    7889     3306220 :                 buffer_start_idx += buffer_count_for_curr_class;//for next iteration.
    7890             : 
    7891             :             }
    7892             : 
    7893             : #if !INTER_INTRA_CLASS_PRUNING
    7894             : #if REMOVE_MD_STAGE_1
    7895             :             context_ptr->md_stage_1_total_count += context_ptr->md_stage_1_count[cand_class_it];
    7896             : #endif
    7897             : #endif
    7898             :         }
    7899             : 
    7900             : #if INTER_INTRA_CLASS_PRUNING
    7901      811412 :         interintra_class_pruning_1(context_ptr,best_md_stage_cost);
    7902             : #endif
    7903             : 
    7904             : #if !REMOVE_MD_STAGE_1
    7905             : #if SPEED_OPT
    7906             :         //after completing stage0, we might shorten cand count for some classes.
    7907             :         inter_class_decision_count_1(context_ptr);
    7908             : #endif
    7909             : 
    7910             :         context_ptr->md_stage = MD_STAGE_1;
    7911             :         for (cand_class_it = CAND_CLASS_0; cand_class_it < CAND_CLASS_TOTAL; cand_class_it++) {
    7912             : 
    7913             :             //number of next level candidates could not exceed number of curr level candidates
    7914             :             context_ptr->md_stage_2_count[cand_class_it] = MIN(context_ptr->md_stage_1_count[cand_class_it], context_ptr->md_stage_2_count[cand_class_it]);
    7915             :             context_ptr->md_stage_2_total_count += context_ptr->md_stage_2_count[cand_class_it];
    7916             : 
    7917             :             if (context_ptr->bypass_stage1[cand_class_it] == 0 && context_ptr->md_stage_1_count[cand_class_it] > 0 && context_ptr->md_stage_2_count[cand_class_it] > 0) {
    7918             :                 //Input: md_stage_1_count[cand_class_it]  Output:  full_cand_count[cand_class_it]
    7919             :                 context_ptr->target_class = cand_class_it;
    7920             :                 md_stage_1(
    7921             :                     picture_control_set_ptr,
    7922             :                     context_ptr,
    7923             :                     candidate_buffer_ptr_array_base,
    7924             :                     context_ptr->md_stage_1_count[cand_class_it],
    7925             :                     input_picture_ptr,
    7926             :                     inputOriginIndex,
    7927             :                     inputCbOriginIndex,
    7928             :                     inputCbOriginIndex,
    7929             :                     cu_ptr,
    7930             :                     cuOriginIndex,
    7931             :                     cuChromaOriginIndex,
    7932             :                     0);
    7933             : 
    7934             :                 //sort the new set of candidates
    7935             :                 sort_stage1_fast_candidates(
    7936             :                     context_ptr,
    7937             :                     context_ptr->md_stage_1_count[cand_class_it],
    7938             :                     context_ptr->cand_buff_indices[cand_class_it]);
    7939             :             }
    7940             :         }
    7941             : #endif
    7942      811402 :         memset(context_ptr->best_candidate_index_array, 0xFFFFFFFF, MAX_NFL_BUFF * sizeof(uint32_t));
    7943      811402 :         memset(context_ptr->sorted_candidate_index_array, 0xFFFFFFFF, MAX_NFL * sizeof(uint32_t));
    7944             : 
    7945      811402 :         uint64_t ref_fast_cost = MAX_MODE_COST;
    7946             : #if REMOVE_MD_STAGE_1
    7947      811402 :         construct_best_sorted_arrays_md_stage_1(
    7948             : #else
    7949             :         construct_best_sorted_arrays_md_stage_2(
    7950             : #endif
    7951             :             context_ptr,
    7952             :             candidate_buffer_ptr_array,
    7953      811402 :             context_ptr->best_candidate_index_array,
    7954      811402 :             context_ptr->sorted_candidate_index_array,
    7955             :             &ref_fast_cost);
    7956             : 
    7957             : 
    7958             :         // 1st Full-Loop
    7959             : #if INTER_INTRA_CLASS_PRUNING
    7960      811456 :         best_md_stage_cost = (uint64_t)~0;
    7961             : #endif
    7962             : #if REMOVE_MD_STAGE_1
    7963     8113500 :         for (cand_class_it = CAND_CLASS_0; cand_class_it < CAND_CLASS_TOTAL; cand_class_it++) {
    7964             :             //number of next level candidates could not exceed number of curr level candidates
    7965     7302080 :             context_ptr->md_stage_2_count[cand_class_it] = MIN(context_ptr->md_stage_1_count[cand_class_it], context_ptr->md_stage_2_count[cand_class_it]);
    7966             : #if !INTER_INTRA_CLASS_PRUNING
    7967             :             context_ptr->md_stage_2_total_count += context_ptr->md_stage_2_count[cand_class_it];
    7968             : #endif
    7969     7302080 :             if (context_ptr->bypass_md_stage_1[cand_class_it] == EB_FALSE && context_ptr->md_stage_1_count[cand_class_it] > 0 && context_ptr->md_stage_2_count[cand_class_it] > 0) {
    7970             : #else
    7971             :         context_ptr->md_stage = MD_STAGE_2;
    7972             :         for (cand_class_it = CAND_CLASS_0; cand_class_it < CAND_CLASS_TOTAL; cand_class_it++) {
    7973             :             //number of next level candidates could not exceed number of curr level candidates
    7974             :             context_ptr->md_stage_3_count[cand_class_it] = MIN(context_ptr->md_stage_2_count[cand_class_it], context_ptr->md_stage_3_count[cand_class_it]);
    7975             :             context_ptr->md_stage_3_total_count += context_ptr->md_stage_3_count[cand_class_it];
    7976             : 
    7977             :             if (context_ptr->bypass_stage2[cand_class_it] == EB_FALSE && context_ptr->md_stage_2_count[cand_class_it] > 0 && context_ptr->md_stage_3_count[cand_class_it] > 0) {
    7978             : #endif
    7979     3160780 :                 context_ptr->target_class = cand_class_it;
    7980             : #if REMOVE_MD_STAGE_1
    7981     3160780 :                 md_stage_1(
    7982             : #else
    7983             :                 md_stage_2(
    7984             : #endif
    7985             :                     picture_control_set_ptr,
    7986             :                     context_ptr->sb_ptr,
    7987             :                     cu_ptr,
    7988             :                     context_ptr,
    7989             :                     input_picture_ptr,
    7990             :                     inputOriginIndex,
    7991             :                     inputCbOriginIndex,
    7992             :                     cuOriginIndex,
    7993             :                     cuChromaOriginIndex,
    7994             :                     ref_fast_cost);
    7995             : 
    7996             :                 // Sort the candidates of the target class based on the 1st full loop cost
    7997             : 
    7998             :                 //sort the new set of candidates
    7999             : #if REMOVE_MD_STAGE_1
    8000     3160750 :                 if (context_ptr->md_stage_1_count[cand_class_it])
    8001     3160780 :                     sort_stage1_candidates(
    8002             :                         context_ptr,
    8003             :                         context_ptr->md_stage_1_count[cand_class_it],
    8004     3160780 :                         context_ptr->cand_buff_indices[cand_class_it]);
    8005             : #else
    8006             :                 if (context_ptr->md_stage_2_count[cand_class_it])
    8007             :                     sort_stage2_candidates(
    8008             :                         context_ptr,
    8009             :                         context_ptr->md_stage_2_count[cand_class_it],
    8010             :                         context_ptr->cand_buff_indices[cand_class_it]);
    8011             : #endif
    8012             : 
    8013             : #if INTER_INTRA_CLASS_PRUNING
    8014     3160740 :                 uint32_t *cand_buff_indices = context_ptr->cand_buff_indices[cand_class_it];
    8015     3160740 :                 best_md_stage_cost = MIN((*(context_ptr->candidate_buffer_ptr_array[cand_buff_indices[0]]->full_cost_ptr)), best_md_stage_cost);
    8016             : #endif
    8017             :             }
    8018             :         }
    8019             : #if INTER_INTRA_CLASS_PRUNING
    8020      811424 :         interintra_class_pruning_2(context_ptr, best_md_stage_cost);
    8021             : #endif
    8022             : 
    8023             : #if REMOVE_MD_STAGE_1
    8024      811401 :         assert(context_ptr->md_stage_2_total_count <= MAX_NFL);
    8025      811402 :         assert(context_ptr->md_stage_2_total_count > 0);
    8026      811402 :         construct_best_sorted_arrays_md_stage_2(
    8027             : #else
    8028             :         assert(context_ptr->md_stage_3_total_count <= MAX_NFL);
    8029             :         assert(context_ptr->md_stage_3_total_count > 0);
    8030             :         construct_best_sorted_arrays_md_stage_3(
    8031             : #endif
    8032             :             context_ptr,
    8033             :             candidate_buffer_ptr_array,
    8034      811402 :             context_ptr->best_candidate_index_array,
    8035      811402 :             context_ptr->sorted_candidate_index_array);
    8036             : 
    8037             :         // 2nd Full-Loop
    8038             : #if REMOVE_MD_STAGE_1
    8039      811417 :         md_stage_2(
    8040             : #else
    8041             :         context_ptr->md_stage = MD_STAGE_3;
    8042             :         md_stage_3(
    8043             : #endif
    8044             :             picture_control_set_ptr,
    8045             :             context_ptr->sb_ptr,
    8046             :             cu_ptr,
    8047             :             context_ptr,
    8048             :             input_picture_ptr,
    8049             :             inputOriginIndex,
    8050             :             inputCbOriginIndex,
    8051             :             cuOriginIndex,
    8052             :             cuChromaOriginIndex,
    8053             : #if REMOVE_MD_STAGE_1
    8054             :             context_ptr->md_stage_2_total_count,
    8055             : #else
    8056             :             context_ptr->md_stage_3_total_count,
    8057             : #endif
    8058             :             ref_fast_cost); // fullCandidateTotalCount to number of buffers to process
    8059             : 
    8060             :         // Full Mode Decision (choose the best mode)
    8061      811396 :         candidate_index = product_full_mode_decision(
    8062             :             context_ptr,
    8063             :             cu_ptr,
    8064             :             candidate_buffer_ptr_array,
    8065             : #if REMOVE_MD_STAGE_1
    8066             :             context_ptr->md_stage_2_total_count,
    8067             : #else
    8068             :             context_ptr->md_stage_3_total_count,
    8069             : #endif
    8070      811396 :             (context_ptr->full_loop_escape == 2) ? context_ptr->sorted_candidate_index_array : context_ptr->best_candidate_index_array,
    8071      811396 :             context_ptr->prune_ref_frame_for_rec_partitions,
    8072             :             &best_intra_mode);
    8073      811430 :         candidate_buffer = candidate_buffer_ptr_array[candidate_index];
    8074             : 
    8075      811430 :         bestcandidate_buffers[0] = candidate_buffer;
    8076             : 
    8077             : 
    8078      811430 :         if (picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level == IT_SEARCH_INTER_DEPTH) {
    8079           0 :             if (candidate_buffer->candidate_ptr->type != INTRA_MODE && candidate_buffer->candidate_ptr->motion_mode == SIMPLE_TRANSLATION) {
    8080             : 
    8081           0 :                 context_ptr->md_staging_skip_interpolation_search = EB_FALSE;
    8082           0 :                 context_ptr->md_staging_skip_inter_chroma_pred = EB_FALSE;
    8083           0 :                 ProductPredictionFunTable[candidate_buffer->candidate_ptr->type](
    8084             :                     context_ptr,
    8085             :                     picture_control_set_ptr,
    8086             :                     candidate_buffer);
    8087           0 :                 cu_ptr->interp_filters = candidate_buffer->candidate_ptr->interp_filters;
    8088             :             }
    8089             :         }
    8090      811430 :         inter_depth_tx_search(
    8091             :             picture_control_set_ptr,
    8092             :             candidate_buffer,
    8093             :             cu_ptr,
    8094             :             context_ptr,
    8095             :             input_picture_ptr,
    8096             :             ref_fast_cost);
    8097             : 
    8098      811374 :         uint8_t sq_index = LOG2F(context_ptr->blk_geom->sq_size) - 2;
    8099      811386 :         if (context_ptr->blk_geom->shape == PART_N) {
    8100      164402 :             context_ptr->parent_sq_type[sq_index] = candidate_buffer->candidate_ptr->type;
    8101             : 
    8102      457182 :             context_ptr->parent_sq_has_coeff[sq_index] = (candidate_buffer->candidate_ptr->y_has_coeff ||
    8103      128378 :                 candidate_buffer->candidate_ptr->u_has_coeff ||
    8104      292780 :                 candidate_buffer->candidate_ptr->v_has_coeff) ? 1 : 0;
    8105             : 
    8106      164402 :             context_ptr->parent_sq_pred_mode[sq_index] = candidate_buffer->candidate_ptr->pred_mode;
    8107             :         }
    8108             : 
    8109      811386 :         AV1PerformInverseTransformRecon(
    8110             :             picture_control_set_ptr,
    8111             :             context_ptr,
    8112             :             candidate_buffer,
    8113             :             cu_ptr,
    8114             :             context_ptr->blk_geom);
    8115             : 
    8116      811377 :         if (!context_ptr->blk_geom->has_uv) {
    8117             :             // Store the luma data for 4x* and *x4 blocks to be used for CFL
    8118      184727 :             EbPictureBufferDesc  *recon_ptr = candidate_buffer->recon_ptr;
    8119      184727 :             uint32_t rec_luma_offset = context_ptr->blk_geom->origin_x + context_ptr->blk_geom->origin_y * recon_ptr->stride_y;
    8120      184727 :             if (context_ptr->hbd_mode_decision) {
    8121           0 :                for (uint32_t j = 0; j < context_ptr->blk_geom->bheight; ++j)
    8122           0 :                     memcpy(context_ptr->cfl_temp_luma_recon16bit + rec_luma_offset + j* recon_ptr->stride_y, ((uint16_t *)recon_ptr->buffer_y) + (rec_luma_offset + j * recon_ptr->stride_y), sizeof(uint16_t) * context_ptr->blk_geom->bwidth);
    8123             :             } else {
    8124     1578480 :                 for (uint32_t j = 0; j < context_ptr->blk_geom->bheight; ++j)
    8125     1393750 :                     memcpy(&context_ptr->cfl_temp_luma_recon[rec_luma_offset + j* recon_ptr->stride_y], recon_ptr->buffer_y + rec_luma_offset + j * recon_ptr->stride_y, context_ptr->blk_geom->bwidth);
    8126             :             }
    8127             :         }
    8128             :         //copy neigh recon data in cu_ptr
    8129             :         {
    8130             :             uint32_t j;
    8131      811377 :             EbPictureBufferDesc  *recon_ptr = candidate_buffer->recon_ptr;
    8132      811377 :             uint32_t recLumaOffset = context_ptr->blk_geom->origin_x + context_ptr->blk_geom->origin_y * recon_ptr->stride_y;
    8133             : 
    8134      811377 :             uint32_t recCbOffset = ((((context_ptr->blk_geom->origin_x >> 3) << 3) + ((context_ptr->blk_geom->origin_y >> 3) << 3) * candidate_buffer->recon_ptr->stride_cb) >> 1);
    8135      811377 :             uint32_t recCrOffset = ((((context_ptr->blk_geom->origin_x >> 3) << 3) + ((context_ptr->blk_geom->origin_y >> 3) << 3) * candidate_buffer->recon_ptr->stride_cr) >> 1);
    8136             : 
    8137      811377 :             if (!context_ptr->hbd_mode_decision) {
    8138      811377 :                 memcpy(cu_ptr->neigh_top_recon[0], recon_ptr->buffer_y + recLumaOffset + (context_ptr->blk_geom->bheight - 1)*recon_ptr->stride_y, context_ptr->blk_geom->bwidth);
    8139      811377 :                 if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
    8140      551039 :                     memcpy(cu_ptr->neigh_top_recon[1], recon_ptr->buffer_cb + recCbOffset + (context_ptr->blk_geom->bheight_uv - 1)*recon_ptr->stride_cb, context_ptr->blk_geom->bwidth_uv);
    8141      551039 :                     memcpy(cu_ptr->neigh_top_recon[2], recon_ptr->buffer_cr + recCrOffset + (context_ptr->blk_geom->bheight_uv - 1)*recon_ptr->stride_cr, context_ptr->blk_geom->bwidth_uv);
    8142             :                 }
    8143             : 
    8144    12128200 :                 for (j = 0; j < context_ptr->blk_geom->bheight; ++j)
    8145    11316900 :                     cu_ptr->neigh_left_recon[0][j] = recon_ptr->buffer_y[recLumaOffset + context_ptr->blk_geom->bwidth - 1 + j * recon_ptr->stride_y];
    8146             : 
    8147      811377 :                 if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
    8148     5074060 :                     for (j = 0; j < context_ptr->blk_geom->bheight_uv; ++j) {
    8149     4523020 :                         cu_ptr->neigh_left_recon[1][j] = recon_ptr->buffer_cb[recCbOffset + context_ptr->blk_geom->bwidth_uv - 1 + j * recon_ptr->stride_cb];
    8150     4523020 :                         cu_ptr->neigh_left_recon[2][j] = recon_ptr->buffer_cr[recCrOffset + context_ptr->blk_geom->bwidth_uv - 1 + j * recon_ptr->stride_cr];
    8151             :                     }
    8152             :                 }
    8153             :             } else {
    8154           0 :                 uint16_t sz = sizeof(uint16_t);
    8155           0 :                 memcpy(cu_ptr->neigh_top_recon_16bit[0], recon_ptr->buffer_y + sz * (recLumaOffset + (context_ptr->blk_geom->bheight - 1)*recon_ptr->stride_y), sz * context_ptr->blk_geom->bwidth);
    8156           0 :                 if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
    8157           0 :                     memcpy(cu_ptr->neigh_top_recon_16bit[1], recon_ptr->buffer_cb + sz * (recCbOffset + (context_ptr->blk_geom->bheight_uv - 1)*recon_ptr->stride_cb), sz * context_ptr->blk_geom->bwidth_uv);
    8158           0 :                     memcpy(cu_ptr->neigh_top_recon_16bit[2], recon_ptr->buffer_cr + sz * (recCrOffset + (context_ptr->blk_geom->bheight_uv - 1)*recon_ptr->stride_cr), sz * context_ptr->blk_geom->bwidth_uv);
    8159             :                 }
    8160             : 
    8161           0 :                 for (j = 0; j < context_ptr->blk_geom->bheight; ++j)
    8162           0 :                     cu_ptr->neigh_left_recon_16bit[0][j] =  ((uint16_t *) recon_ptr->buffer_y)[recLumaOffset + context_ptr->blk_geom->bwidth - 1 + j * recon_ptr->stride_y];
    8163             : 
    8164           0 :                 if (context_ptr->blk_geom->has_uv && context_ptr->chroma_level <= CHROMA_MODE_1) {
    8165           0 :                     for (j = 0; j < context_ptr->blk_geom->bheight_uv; ++j) {
    8166           0 :                         cu_ptr->neigh_left_recon_16bit[1][j] = ((uint16_t *) recon_ptr->buffer_cb)[recCbOffset + context_ptr->blk_geom->bwidth_uv - 1 + j * recon_ptr->stride_cb];
    8167           0 :                         cu_ptr->neigh_left_recon_16bit[2][j] = ((uint16_t *) recon_ptr->buffer_cr)[recCrOffset + context_ptr->blk_geom->bwidth_uv - 1 + j * recon_ptr->stride_cr];
    8168             :                     }
    8169             :                 }
    8170             :             }
    8171             :         }
    8172             : 
    8173             : #if NO_ENCDEC
    8174             :         //copy recon
    8175             :         uint32_t  tu_origin_index = context_ptr->blk_geom->origin_x + (context_ptr->blk_geom->origin_y * 128);
    8176             :         uint32_t  bwidth = context_ptr->blk_geom->bwidth;
    8177             :         uint32_t  bheight = context_ptr->blk_geom->bheight;
    8178             : 
    8179             :         if (!context_ptr->hbd_mode_decision) {
    8180             :             uint8_t* src_ptr = &(((uint8_t*)candidate_buffer->recon_ptr->buffer_y)[tu_origin_index]);
    8181             :             uint8_t* dst_ptr = &(((uint8_t*)context_ptr->cu_ptr->recon_tmp->buffer_y)[0]);
    8182             : 
    8183             :             uint32_t j;
    8184             :             for (j = 0; j < bheight; j++)
    8185             :                 memcpy(dst_ptr + j * 128, src_ptr + j * 128, bwidth * sizeof(uint8_t));
    8186             : 
    8187             :             if (context_ptr->blk_geom->has_uv)
    8188             :             {
    8189             :                 uint32_t tu_origin_index = ((((context_ptr->blk_geom->origin_x >> 3) << 3) + ((context_ptr->blk_geom->origin_y >> 3) << 3) * candidate_buffer->recon_ptr->stride_cb) >> 1);
    8190             :                 bwidth = context_ptr->blk_geom->bwidth_uv;
    8191             :                 bheight = context_ptr->blk_geom->bheight_uv;
    8192             : 
    8193             :                 // Cb
    8194             :                 src_ptr = &(((uint8_t*)candidate_buffer->recon_ptr->buffer_cb)[tu_origin_index]);
    8195             :                 dst_ptr = &(((uint8_t*)context_ptr->cu_ptr->recon_tmp->buffer_cb)[0]);
    8196             : 
    8197             :                 for (j = 0; j < bheight; j++)
    8198             :                     memcpy(dst_ptr + j * 64, src_ptr + j * 64, bwidth * sizeof(uint8_t));
    8199             : 
    8200             :                 // Cr
    8201             :                 src_ptr = &(((uint8_t*)candidate_buffer->recon_ptr->buffer_cr)[tu_origin_index]);
    8202             :                 dst_ptr = &(((uint8_t*)context_ptr->cu_ptr->recon_tmp->buffer_cr)[0]);
    8203             : 
    8204             :                 for (j = 0; j < bheight; j++)
    8205             :                     memcpy(dst_ptr + j * 64, src_ptr + j * 64, bwidth * sizeof(uint8_t));
    8206             :             }
    8207             :         } else {
    8208             :             uint16_t* src_ptr = ((uint16_t*) candidate_buffer->recon_ptr->buffer_y) + tu_origin_index;
    8209             :             uint16_t* dst_ptr = (uint16_t*) context_ptr->cu_ptr->recon_tmp->buffer_y;
    8210             :             for (uint32_t j = 0; j < bheight; j++)
    8211             :                 memcpy(dst_ptr + j * 128, src_ptr + j * 128, bwidth * sizeof(uint16_t));
    8212             : 
    8213             :             if (context_ptr->blk_geom->has_uv) {
    8214             :                 tu_origin_index = ((((context_ptr->blk_geom->origin_x >> 3) << 3) + ((context_ptr->blk_geom->origin_y >> 3) << 3) * candidate_buffer->recon_ptr->stride_cb) >> 1);
    8215             :                 bwidth = context_ptr->blk_geom->bwidth_uv;
    8216             :                 bheight = context_ptr->blk_geom->bheight_uv;
    8217             : 
    8218             :                  // Cb
    8219             :                 src_ptr = ((uint16_t*) candidate_buffer->recon_ptr->buffer_cb) + tu_origin_index;
    8220             :                 dst_ptr = (uint16_t*) context_ptr->cu_ptr->recon_tmp->buffer_cb;
    8221             :                 for (uint32_t j = 0; j < bheight; j++)
    8222             :                     memcpy(dst_ptr + j * 64, src_ptr + j * 64, bwidth * sizeof(uint16_t));
    8223             : 
    8224             :                 // Cr
    8225             :                 src_ptr = ((uint16_t*) candidate_buffer->recon_ptr->buffer_cr) + tu_origin_index;
    8226             :                 dst_ptr = (uint16_t*) context_ptr->cu_ptr->recon_tmp->buffer_cr;
    8227             :                 for (uint32_t j = 0; j < bheight; j++)
    8228             :                     memcpy(dst_ptr + j * 64, src_ptr + j * 64, bwidth * sizeof(uint16_t));
    8229             :             }
    8230             :         }
    8231             : #endif
    8232             : 
    8233      811377 :         context_ptr->md_local_cu_unit[cu_ptr->mds_idx].avail_blk_flag = EB_TRUE;
    8234             :     }
    8235             :     else
    8236             :     {
    8237           0 :         context_ptr->md_local_cu_unit[cu_ptr->mds_idx].cost = MAX_MODE_COST;
    8238           0 :         cu_ptr->prediction_unit_array->ref_frame_type = 0;
    8239             :     }
    8240      811377 : }
    8241             : 
    8242             : #if LESS_RECTANGULAR_CHECK_LEVEL
    8243      987920 : void update_skip_next_nsq_for_a_b_shapes(
    8244             :     ModeDecisionContext *context_ptr,
    8245             :     uint64_t *sq_cost, uint64_t *h_cost,
    8246             :     uint64_t *v_cost, int *skip_next_nsq) {
    8247             :     // Folds the current block's cost (md_local_cu_unit[cu_ptr->mds_idx].cost) into *sq_cost / *h_cost / *v_cost according to blk_geom->d1i, and sets *skip_next_nsq to 1 when the tested sum exceeds (*sq_cost * sq_weight) / 100.
    8248      987920 :     switch (context_ptr->blk_geom->d1i)
    8249             :     {
    8250             :     // *skip_next_nsq is only ever raised to 1 here, never cleared; a d1i value with no case below leaves all outputs untouched.
    8251             :     // NS (d1i 0): store this block's cost in *sq_cost and reset the H and V sums
    8252       89363 :     case 0:
    8253       89363 :         *sq_cost = context_ptr->md_local_cu_unit[context_ptr->cu_ptr->mds_idx].cost;
    8254       89363 :         *h_cost = 0;
    8255       89363 :         *v_cost = 0;
    8256       89363 :         break;
    8257             : 
    8258             :     // H (d1i 1, 2): d1i 1 initializes *h_cost with the block cost, d1i 2 adds its cost to it
    8259       47750 :     case 1:
    8260       47750 :         *h_cost = context_ptr->md_local_cu_unit[context_ptr->cu_ptr->mds_idx].cost;
    8261       47750 :         break;
    8262       47750 :     case 2:
    8263       47750 :         *h_cost += context_ptr->md_local_cu_unit[context_ptr->cu_ptr->mds_idx].cost;
    8264       47750 :         break;
    8265             : 
    8266             :     // V (d1i 3, 4): same accumulation into *v_cost; d1i 4 then tests the H sum (NOTE(review): presumably because the HA/HB shapes are evaluated next - confirm)
    8267       47749 :     case 3:
    8268       47749 :         *v_cost = context_ptr->md_local_cu_unit[context_ptr->cu_ptr->mds_idx].cost;
    8269       47749 :         break;
    8270       47750 :     case 4:
    8271       47750 :         *v_cost += context_ptr->md_local_cu_unit[context_ptr->cu_ptr->mds_idx].cost;
    8272       47750 :         *skip_next_nsq = (*h_cost > ((*sq_cost * context_ptr->sq_weight) / 100)) ? 1 : *skip_next_nsq;
    8273       47750 :         break;
    8274             : 
    8275             :     // HA (d1i 5, 6, 7): falls through to the H-sum test shared with d1i 8 and 9
    8276      145966 :     case 5:
    8277             :     case 6:
    8278             :     case 7:
    8279             : 
    8280             :     // HB (d1i 8, 9): raise the flag when *h_cost exceeds sq_weight percent of *sq_cost
    8281             :     case 8:
    8282             :     case 9:
    8283      145966 :         *skip_next_nsq = (*h_cost > ((*sq_cost * context_ptr->sq_weight) / 100)) ? 1 : *skip_next_nsq;
    8284      145966 :         break;
    8285      162788 :     case 10:
    8286             :     // d1i 10 is grouped with the V-sum test below, not the H-sum test above (NOTE(review): looks like the last HB block deciding for the VA shapes that follow - confirm)
    8287             :     // VA (d1i 11, 12, 13): falls through to the V-sum test shared with d1i 14 and 15
    8288             :     case 11:
    8289             :     case 12:
    8290             :     case 13:
    8291             : 
    8292             :     // VB (d1i 14, 15): raise the flag when *v_cost exceeds sq_weight percent of *sq_cost
    8293             :     case 14:
    8294             :     case 15:
    8295      162788 :         *skip_next_nsq = (*v_cost > ((*sq_cost * context_ptr->sq_weight) / 100)) ? 1 : *skip_next_nsq;
    8296      162788 :         break;
    8297             :     }
    8298      987920 : }
    8299             : #endif
    8300             : 
    8301        7193 : EB_EXTERN EbErrorType mode_decision_sb(
    8302             :     SequenceControlSet                *sequence_control_set_ptr,
    8303             :     PictureControlSet                 *picture_control_set_ptr,
    8304             :     const MdcLcuData * const           mdcResultTbPtr,
    8305             :     LargestCodingUnit                 *sb_ptr,
    8306             :     uint16_t                             sb_origin_x,
    8307             :     uint16_t                             sb_origin_y,
    8308             :     uint32_t                             lcuAddr,
    8309             :     SsMeContext                       *ss_mecontext,
    8310             :     ModeDecisionContext               *context_ptr)
    8311             : {
    8312        7193 :     EbErrorType                          return_error = EB_ErrorNone;
    8313             : 
    8314             :     uint32_t                             cuIdx;
    8315             :     ModeDecisionCandidateBuffer       *bestcandidate_buffers[5];
    8316             :     // Pre Intra Search
    8317        7193 :     uint32_t                               leaf_count = mdcResultTbPtr->leaf_count;
    8318        7193 :     const EbMdcLeafData *const           leaf_data_array = mdcResultTbPtr->leaf_data_array;
    8319        7193 :     context_ptr->sb_ptr = sb_ptr;
    8320             : #if FIX_COEF_BASED_ATB_SKIP
    8321        7193 :     context_ptr->coeff_based_skip_atb = 0;
    8322             : #endif
    8323        7193 :     EbBool all_cu_init = (picture_control_set_ptr->parent_pcs_ptr->pic_depth_mode <= PIC_SQ_DEPTH_MODE);
    8324        7193 :     if (all_cu_init) {
    8325        3600 :         init_sq_nsq_block(
    8326             :             sequence_control_set_ptr,
    8327             :             context_ptr);
    8328             :     }
    8329             :     else {
    8330        3593 :         init_sq_non4_block(
    8331             :             sequence_control_set_ptr,
    8332             :             context_ptr);
    8333             :     }
    8334             :     // Mode Decision Neighbor Arrays
    8335        7191 :     context_ptr->intra_luma_mode_neighbor_array = picture_control_set_ptr->md_intra_luma_mode_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8336        7191 :     context_ptr->intra_chroma_mode_neighbor_array = picture_control_set_ptr->md_intra_chroma_mode_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8337        7191 :     context_ptr->mv_neighbor_array = picture_control_set_ptr->md_mv_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8338        7191 :     context_ptr->skip_flag_neighbor_array = picture_control_set_ptr->md_skip_flag_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8339        7191 :     context_ptr->mode_type_neighbor_array = picture_control_set_ptr->md_mode_type_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8340        7191 :     context_ptr->leaf_depth_neighbor_array = picture_control_set_ptr->md_leaf_depth_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8341        7191 :     context_ptr->leaf_partition_neighbor_array = picture_control_set_ptr->mdleaf_partition_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8342             : 
    8343        7191 :     if (!context_ptr->hbd_mode_decision) {
    8344        7191 :         context_ptr->luma_recon_neighbor_array = picture_control_set_ptr->md_luma_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8345        7191 :         context_ptr->cb_recon_neighbor_array = picture_control_set_ptr->md_cb_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8346        7191 :         context_ptr->cr_recon_neighbor_array = picture_control_set_ptr->md_cr_recon_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8347             :     } else {
    8348           0 :         context_ptr->luma_recon_neighbor_array16bit = picture_control_set_ptr->md_luma_recon_neighbor_array16bit[MD_NEIGHBOR_ARRAY_INDEX];
    8349           0 :         context_ptr->cb_recon_neighbor_array16bit = picture_control_set_ptr->md_cb_recon_neighbor_array16bit[MD_NEIGHBOR_ARRAY_INDEX];
    8350           0 :         context_ptr->cr_recon_neighbor_array16bit = picture_control_set_ptr->md_cr_recon_neighbor_array16bit[MD_NEIGHBOR_ARRAY_INDEX];
    8351             :     }
    8352        7191 :     context_ptr->skip_coeff_neighbor_array = picture_control_set_ptr->md_skip_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8353        7191 :     context_ptr->luma_dc_sign_level_coeff_neighbor_array = picture_control_set_ptr->md_luma_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8354        7191 :     context_ptr->cb_dc_sign_level_coeff_neighbor_array = picture_control_set_ptr->md_cb_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8355        7191 :     context_ptr->cr_dc_sign_level_coeff_neighbor_array = picture_control_set_ptr->md_cr_dc_sign_level_coeff_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8356        7191 :     context_ptr->txfm_context_array = picture_control_set_ptr->md_txfm_context_array[MD_NEIGHBOR_ARRAY_INDEX];
    8357        7191 :     context_ptr->inter_pred_dir_neighbor_array = picture_control_set_ptr->md_inter_pred_dir_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8358        7191 :     context_ptr->ref_frame_type_neighbor_array = picture_control_set_ptr->md_ref_frame_type_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8359        7191 :     context_ptr->interpolation_type_neighbor_array = picture_control_set_ptr->md_interpolation_type_neighbor_array[MD_NEIGHBOR_ARRAY_INDEX];
    8360             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8361        7191 :     uint32_t  d1_block_itr = 0;
    8362        7191 :     uint32_t  d1_first_block = 1;
    8363             : #endif
    8364             : 
    8365        7191 :     EbPictureBufferDesc *input_picture_ptr = picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
    8366        7191 :     if (context_ptr->hbd_mode_decision) {
    8367           0 :         const uint32_t input_luma_offset = ((sb_origin_y + input_picture_ptr->origin_y) * input_picture_ptr->stride_y) + (sb_origin_x + input_picture_ptr->origin_x);
    8368           0 :         const uint32_t input_bit_inc_luma_offset = ((sb_origin_y + input_picture_ptr->origin_y) * input_picture_ptr->stride_bit_inc_y) + (sb_origin_x + input_picture_ptr->origin_x);
    8369           0 :         const uint32_t input_cb_offset = (((sb_origin_y + input_picture_ptr->origin_y) >> 1)  * input_picture_ptr->stride_cb) + ((sb_origin_x + input_picture_ptr->origin_x) >> 1);
    8370           0 :         const uint32_t input_bit_inc_cb_offset = (((sb_origin_y + input_picture_ptr->origin_y) >> 1)  * input_picture_ptr->stride_bit_inc_cb) + ((sb_origin_x + input_picture_ptr->origin_x) >> 1);
    8371           0 :         const uint32_t input_cr_offset = (((sb_origin_y + input_picture_ptr->origin_y) >> 1)  * input_picture_ptr->stride_cr) + ((sb_origin_x + input_picture_ptr->origin_x) >> 1);
    8372           0 :         const uint32_t input_bit_inc_cr_offset = (((sb_origin_y + input_picture_ptr->origin_y) >> 1)  * input_picture_ptr->stride_bit_inc_cr) + ((sb_origin_x + input_picture_ptr->origin_x) >> 1);
    8373             : 
    8374           0 :         uint32_t sb_width  = MIN(sequence_control_set_ptr->sb_size_pix, sequence_control_set_ptr->seq_header.max_frame_width - sb_origin_x);
    8375           0 :         uint32_t sb_height = MIN(sequence_control_set_ptr->sb_size_pix, sequence_control_set_ptr->seq_header.max_frame_height - sb_origin_y);
    8376             : 
    8377           0 :         pack2d_src(
    8378           0 :             input_picture_ptr->buffer_y + input_luma_offset,
    8379           0 :             input_picture_ptr->stride_y,
    8380           0 :             input_picture_ptr->buffer_bit_inc_y + input_bit_inc_luma_offset,
    8381           0 :             input_picture_ptr->stride_bit_inc_y,
    8382           0 :             (uint16_t *)context_ptr->input_sample16bit_buffer->buffer_y,
    8383           0 :             context_ptr->input_sample16bit_buffer->stride_y,
    8384             :             sb_width,
    8385             :             sb_height);
    8386             : 
    8387           0 :         pack2d_src(
    8388           0 :             input_picture_ptr->buffer_cb + input_cb_offset,
    8389           0 :             input_picture_ptr->stride_cb,
    8390           0 :             input_picture_ptr->buffer_bit_inc_cb + input_bit_inc_cb_offset,
    8391           0 :             input_picture_ptr->stride_bit_inc_cb,
    8392           0 :             (uint16_t *)context_ptr->input_sample16bit_buffer->buffer_cb,
    8393           0 :             context_ptr->input_sample16bit_buffer->stride_cb,
    8394             :             sb_width >> 1,
    8395             :             sb_height >> 1);
    8396             : 
    8397           0 :         pack2d_src(
    8398           0 :             input_picture_ptr->buffer_cr + input_cr_offset,
    8399           0 :             input_picture_ptr->stride_cr,
    8400           0 :             input_picture_ptr->buffer_bit_inc_cr + input_bit_inc_cr_offset,
    8401           0 :             input_picture_ptr->stride_bit_inc_cr,
    8402           0 :             (uint16_t *)context_ptr->input_sample16bit_buffer->buffer_cr,
    8403           0 :             context_ptr->input_sample16bit_buffer->stride_cr,
    8404             :             sb_width >> 1,
    8405             :             sb_height >> 1);
    8406             : 
    8407           0 :         Store16bitInputSrc(context_ptr->input_sample16bit_buffer, picture_control_set_ptr, sb_origin_x, sb_origin_y, sb_width, sb_height);
    8408             :         //input_picture_ptr = context_ptr->input_sample16bit_buffer;
    8409           0 :         input_picture_ptr = picture_control_set_ptr->input_frame16bit;
    8410             :     }
    8411             : 
    8412             :     //CU Loop
    8413        7135 :     cuIdx = 0;  //index over mdc array
    8414             : 
    8415             : #if LESS_RECTANGULAR_CHECK_LEVEL
    8416        7135 :     uint64_t sq_cost = 0;
    8417             :     uint64_t h_cost;
    8418             :     uint64_t v_cost;
    8419             : #endif
    8420             : 
    8421        7135 :     uint32_t blk_idx_mds = 0;
    8422        7135 :     uint32_t  d1_blocks_accumlated = 0;
    8423        7135 :     int skip_next_nsq = 0;
    8424        7135 :     int skip_next_sq = 0;
    8425        7135 :     uint32_t next_non_skip_blk_idx_mds = 0;
    8426             :     uint8_t skip_sub_blocks;
    8427             :     do {
    8428     1822010 :         skip_sub_blocks = 0;
    8429     1822010 :         blk_idx_mds = leaf_data_array[cuIdx].mds_idx;
    8430             : 
    8431     1822010 :         const BlockGeom * blk_geom = context_ptr->blk_geom = get_blk_geom_mds(blk_idx_mds);
    8432     1822020 :         CodingUnit *  cu_ptr = context_ptr->cu_ptr = &context_ptr->md_cu_arr_nsq[blk_idx_mds];
    8433             : 
    8434     1822020 :         context_ptr->cu_size_log2 = blk_geom->bwidth_log2;
    8435     1822020 :         context_ptr->cu_origin_x = sb_origin_x + blk_geom->origin_x;
    8436     1822020 :         context_ptr->cu_origin_y = sb_origin_y + blk_geom->origin_y;
    8437             : 
    8438     1822020 :         const EbMdcLeafData * const leafDataPtr = &mdcResultTbPtr->leaf_data_array[cuIdx];
    8439     1822020 :         context_ptr->sb_sz = BLOCK_SIZE_64;
    8440     1822020 :         context_ptr->round_origin_x = ((context_ptr->cu_origin_x >> 3) << 3);
    8441     1822020 :         context_ptr->round_origin_y = ((context_ptr->cu_origin_y >> 3) << 3);
    8442     1822020 :         context_ptr->sb_origin_x = sb_origin_x;
    8443     1822020 :         context_ptr->sb_origin_y = sb_origin_y;
    8444     1822020 :         context_ptr->md_local_cu_unit[blk_idx_mds].tested_cu_flag = EB_TRUE;
    8445     1822020 :         context_ptr->md_ep_pipe_sb[blk_idx_mds].merge_cost = 0;
    8446     1822020 :         context_ptr->md_ep_pipe_sb[blk_idx_mds].skip_cost = 0;
    8447             : 
    8448             : #if OBMC_FLAG
    8449     1822020 :         cu_ptr->av1xd->sb_type = blk_geom->bsize;
    8450             : #endif
    8451     1822020 :         cu_ptr->mds_idx = blk_idx_mds;
    8452     1822020 :         context_ptr->md_cu_arr_nsq[blk_idx_mds].mdc_split_flag = (uint16_t)leafDataPtr->split_flag;
    8453             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8454     1822020 :         context_ptr->md_cu_arr_nsq[blk_geom->sqi_mds].split_flag = (uint16_t)leafDataPtr->split_flag;
    8455             : #endif
    8456     1822020 :         cu_ptr->split_flag = (uint16_t)leafDataPtr->split_flag; //mdc indicates smallest or non valid CUs with split flag=
    8457     1822020 :         cu_ptr->qp = context_ptr->qp;
    8458     1822020 :         cu_ptr->best_d1_blk = blk_idx_mds;
    8459             : #if COMBINE_MDC_NSQ_TABLE
    8460     1822020 :         context_ptr->best_nsq_sahpe1 = leafDataPtr->ol_best_nsq_shape1;
    8461     1822020 :         context_ptr->best_nsq_sahpe2 = leafDataPtr->ol_best_nsq_shape2;
    8462     1822020 :         context_ptr->best_nsq_sahpe3 = leafDataPtr->ol_best_nsq_shape3;
    8463     1822020 :         context_ptr->best_nsq_sahpe4 = leafDataPtr->ol_best_nsq_shape4;
    8464     1822020 :         context_ptr->best_nsq_sahpe5 = leafDataPtr->ol_best_nsq_shape5;
    8465     1822020 :         context_ptr->best_nsq_sahpe6 = leafDataPtr->ol_best_nsq_shape6;
    8466     1822020 :         context_ptr->best_nsq_sahpe7 = leafDataPtr->ol_best_nsq_shape7;
    8467     1822020 :         context_ptr->best_nsq_sahpe8 = leafDataPtr->ol_best_nsq_shape8;
    8468             : #endif
    8469     1822020 :             if (leafDataPtr->tot_d1_blocks != 1)
    8470             :             {
    8471             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8472             :                 // We need to get the index of the sq_block for each NSQ branch
    8473     1527050 :                 if (d1_first_block) {
    8474             : #else
    8475             :                 if (blk_geom->shape == PART_N)
    8476             : #endif
    8477      114418 :                     copy_neighbour_arrays(      //save a clean neigh in [1], encode uses [0], reload the clean in [0] after done last ns block in a partition
    8478             :                         picture_control_set_ptr,
    8479             :                         context_ptr,
    8480             :                         0, 1,
    8481             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8482      114418 :                         blk_geom->sqi_mds,
    8483             : #else
    8484             :                         blk_idx_mds,
    8485             : #endif
    8486             :                         sb_origin_x,
    8487             :                         sb_origin_y);
    8488             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8489             :                 }
    8490             : #endif
    8491             :             }
    8492             : 
    8493     1822010 :             int32_t mi_row = context_ptr->cu_origin_y >> MI_SIZE_LOG2;
    8494     1822010 :             int32_t mi_col = context_ptr->cu_origin_x >> MI_SIZE_LOG2;
    8495     1822010 :             int mi_stride = picture_control_set_ptr->parent_pcs_ptr->av1_cm->mi_stride;
    8496     1822010 :             const int32_t offset = mi_row * mi_stride + mi_col;
    8497     1822010 :             cu_ptr->av1xd->mi = picture_control_set_ptr->parent_pcs_ptr->av1_cm->pcs_ptr->mi_grid_base + offset;
    8498     1822010 :             ModeInfo *mi_ptr = *cu_ptr->av1xd->mi;
    8499     1822010 :             cu_ptr->av1xd->up_available = (mi_row > sb_ptr->tile_info.mi_row_start);
    8500     1822010 :             cu_ptr->av1xd->left_available = (mi_col > sb_ptr->tile_info.mi_col_start);
    8501     1822010 :             if (cu_ptr->av1xd->up_available)
    8502     1762400 :                 cu_ptr->av1xd->above_mbmi = &mi_ptr[-mi_stride].mbmi;
    8503             :             else
    8504       59611 :                 cu_ptr->av1xd->above_mbmi = NULL;
    8505     1822010 :             if (cu_ptr->av1xd->left_available)
    8506     1773510 :                 cu_ptr->av1xd->left_mbmi = &mi_ptr[-1].mbmi;
    8507             :             else
    8508       48501 :                 cu_ptr->av1xd->left_mbmi = NULL;
    8509             : 
    8510     1822010 :         uint8_t redundant_blk_avail = 0;
    8511             :         uint16_t redundant_blk_mds;
    8512     1822010 :         if (all_cu_init)
    8513     1712880 :             check_redundant_block(blk_geom, context_ptr, &redundant_blk_avail, &redundant_blk_mds);
    8514             : 
    8515     1822080 :         if (redundant_blk_avail && context_ptr->redundant_blk)
    8516      101020 :         {
    8517             :             // Copy results
    8518      101022 :             CodingUnit *src_cu = &context_ptr->md_cu_arr_nsq[redundant_blk_mds];
    8519      101022 :             CodingUnit *dst_cu = cu_ptr;
    8520             : #if PAL_SUP
    8521             : 
    8522      101022 :             move_cu_data_redund(picture_control_set_ptr, context_ptr,src_cu, dst_cu);
    8523             : #else
    8524             :             move_cu_data_redund(src_cu, dst_cu);
    8525             : #endif
    8526      101020 :             memcpy(&context_ptr->md_local_cu_unit[cu_ptr->mds_idx], &context_ptr->md_local_cu_unit[redundant_blk_mds], sizeof(MdCodingUnit));
    8527             : 
    8528      101020 :             if (!context_ptr->hbd_mode_decision) {
    8529      101022 :                 memcpy(dst_cu->neigh_left_recon[0], src_cu->neigh_left_recon[0], 128);
    8530      101022 :                 memcpy(dst_cu->neigh_left_recon[1], src_cu->neigh_left_recon[1], 128);
    8531      101022 :                 memcpy(dst_cu->neigh_left_recon[2], src_cu->neigh_left_recon[2], 128);
    8532      101022 :                 memcpy(dst_cu->neigh_top_recon[0], src_cu->neigh_top_recon[0], 128);
    8533      101022 :                 memcpy(dst_cu->neigh_top_recon[1], src_cu->neigh_top_recon[1], 128);
    8534      101022 :                 memcpy(dst_cu->neigh_top_recon[2], src_cu->neigh_top_recon[2], 128);
    8535             :             } else {
    8536           0 :                 uint16_t sz = sizeof(uint16_t);
    8537           0 :                 memcpy(dst_cu->neigh_left_recon_16bit[0], src_cu->neigh_left_recon_16bit[0], 128 * sz);
    8538           0 :                 memcpy(dst_cu->neigh_left_recon_16bit[1], src_cu->neigh_left_recon_16bit[1], 128 * sz);
    8539           0 :                 memcpy(dst_cu->neigh_left_recon_16bit[2], src_cu->neigh_left_recon_16bit[2], 128 * sz);
    8540           0 :                 memcpy(dst_cu->neigh_top_recon_16bit[0], src_cu->neigh_top_recon_16bit[0], 128 * sz);
    8541           0 :                 memcpy(dst_cu->neigh_top_recon_16bit[1], src_cu->neigh_top_recon_16bit[1], 128 * sz);
    8542           0 :                 memcpy(dst_cu->neigh_top_recon_16bit[2], src_cu->neigh_top_recon_16bit[2], 128 * sz);
    8543             :             }
    8544             : 
    8545      101020 :             memcpy(&context_ptr->md_ep_pipe_sb[cu_ptr->mds_idx], &context_ptr->md_ep_pipe_sb[redundant_blk_mds], sizeof(MdEncPassCuData));
    8546             : 
    8547      101020 :             if (context_ptr->blk_geom->shape == PART_N) {
    8548       15708 :                 uint8_t sq_index = LOG2F(context_ptr->blk_geom->sq_size) - 2;
    8549       15708 :                 context_ptr->parent_sq_type[sq_index] = src_cu->prediction_mode_flag;
    8550       15708 :                 context_ptr->parent_sq_has_coeff[sq_index] = src_cu->block_has_coeff;
    8551       15708 :                 context_ptr->parent_sq_pred_mode[sq_index] = src_cu->pred_mode;
    8552             :             }
    8553             :         }
    8554             :         else
    8555             : #if FIX_SKIP_REDUNDANT_BLOCK
    8556             :         {
    8557             : #endif
    8558             :             // Initialize tx_depth
    8559     1721060 :             cu_ptr->tx_depth = 0;
    8560             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8561     1721060 :             if (blk_geom->quadi > 0 && d1_block_itr == 0) {
    8562             : #else
    8563             :             if (blk_geom->quadi > 0 && blk_geom->shape == PART_N) {
    8564             : #endif
    8565             : 
    8566      299903 :                 uint32_t blk_mds = context_ptr->blk_geom->sqi_mds;
    8567      299903 :                 uint64_t parent_depth_cost = 0, current_depth_cost = 0;
    8568      299903 :                 SequenceControlSet *sequence_control_set_ptr = (SequenceControlSet*)picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr;
    8569      299903 :                 uint32_t parent_depth_idx_mds = blk_mds;
    8570             : 
    8571             :                 // from a given child index, derive the index of the parent
    8572      299903 :                 parent_depth_idx_mds = (context_ptr->blk_geom->sqi_mds - (context_ptr->blk_geom->quadi - 3) * ns_depth_offset[sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128][context_ptr->blk_geom->depth]) -
    8573      299903 :                     parent_depth_offset[sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128][blk_geom->depth];
    8574             : 
    8575      299903 :                 if (picture_control_set_ptr->slice_type == I_SLICE && parent_depth_idx_mds == 0 && sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128)
    8576           0 :                     parent_depth_cost = MAX_MODE_COST;
    8577             :                 else
    8578      299903 :                     compute_depth_costs_md_skip(
    8579             :                         context_ptr,
    8580             :                         sequence_control_set_ptr,
    8581             :                         parent_depth_idx_mds,
    8582      299903 :                         ns_depth_offset[sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128][context_ptr->blk_geom->depth], &parent_depth_cost, &current_depth_cost);
    8583             : 
    8584      299994 :                 if (!sequence_control_set_ptr->sb_geom[lcuAddr].block_is_allowed[parent_depth_idx_mds])
    8585       61833 :                     parent_depth_cost = MAX_MODE_COST;
    8586             : 
    8587             :                 // compare the cost of the parent to the cost of the already encoded child + an estimated cost for the remaining child @ the current depth
    8588             :                 // if the total child cost is higher than the parent cost then skip the remaining  child @ the current depth
    8589             :                 // when md_exit_th=0 the estimated cost for the remaining child is not taken into account and the action will be lossless compared to no exit
    8590             :                 // MD_EXIT_THSL could be tuned toward a faster encoder but lossy
    8591             : #if SPEED_OPT
    8592      299994 :                 if (parent_depth_cost <= current_depth_cost + (current_depth_cost* (4 - context_ptr->blk_geom->quadi)* context_ptr->md_exit_th / context_ptr->blk_geom->quadi / 100)) {
    8593             : #else
    8594             :                 if (parent_depth_cost <= current_depth_cost + (current_depth_cost* (4 - context_ptr->blk_geom->quadi)* MD_EXIT_THSL / context_ptr->blk_geom->quadi / 100)) {
    8595             : #endif
    8596      134299 :                     skip_next_sq = 1;
    8597      134299 :                     next_non_skip_blk_idx_mds = parent_depth_idx_mds + ns_depth_offset[sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128][context_ptr->blk_geom->depth - 1];
    8598             :                 }
    8599             :                 else
    8600      165695 :                     skip_next_sq = 0;
    8601             :             }
    8602             :             // skip until we reach the next block @ the parent block depth
    8603     1721150 :             if (cu_ptr->mds_idx >= next_non_skip_blk_idx_mds && skip_next_sq == 1)
    8604         782 :                 skip_next_sq = 0;
    8605             : 
    8606     1721150 :             if (picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->sb_geom[lcuAddr].block_is_allowed[cu_ptr->mds_idx] && !skip_next_nsq && !skip_next_sq) {
    8607      811420 :                 md_encode_block(
    8608             :                     sequence_control_set_ptr,
    8609             :                     picture_control_set_ptr,
    8610             :                     context_ptr,
    8611             :                     input_picture_ptr,
    8612             :                     ss_mecontext,
    8613             :                     &skip_sub_blocks,
    8614             :                     lcuAddr,
    8615             :                     bestcandidate_buffers);
    8616             : 
    8617             :             }
    8618      909727 :             else if (skip_next_sq) {
    8619      409818 :                 context_ptr->md_local_cu_unit[context_ptr->cu_ptr->mds_idx].cost = (MAX_MODE_COST >> 10);
    8620             :             }
    8621             :             else {
    8622             :                 // If the block is out of the boundaries, md is not performed.
    8623             :                 // - For square blocks, since the blocks can be further splitted, they are considered in d2_inter_depth_block_decision with cost of zero.
    8624             :                 // - For non square blocks, since they can not be splitted further the cost is set to a large value (MAX_MODE_COST >> 4) to make sure they are not selected.
    8625             :                 //   The value is set to MAX_MODE_COST >> 4 to make sure there is not overflow when adding costs.
    8626      499909 :                 if (context_ptr->blk_geom->shape != PART_N)
    8627      425391 :                     context_ptr->md_local_cu_unit[context_ptr->cu_ptr->mds_idx].cost = (MAX_MODE_COST >> 4);
    8628             :                 else
    8629       74518 :                     context_ptr->md_local_cu_unit[context_ptr->cu_ptr->mds_idx].cost = 0;
    8630             :             }
    8631             : #if FIX_SKIP_REDUNDANT_BLOCK
    8632             :         }
    8633             : #endif
    8634     1822060 :         skip_next_nsq = 0;
    8635             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8636     1822060 :         if (blk_geom->nsi + 1 == blk_geom->totns) {
    8637      924744 :             d1_non_square_block_decision(context_ptr, d1_block_itr);
    8638      924753 :             d1_block_itr++;
    8639             :         }
    8640             : #else
    8641             :         if (blk_geom->nsi + 1 == blk_geom->totns)
    8642             :             d1_non_square_block_decision(context_ptr);
    8643             : #endif
    8644             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8645      897316 :         else if (d1_block_itr) {
    8646             : #else
    8647             :         else {
    8648             : #endif
    8649      897317 :             uint64_t tot_cost = 0;
    8650      897317 :             uint32_t first_blk_idx = context_ptr->cu_ptr->mds_idx - (blk_geom->nsi);//index of first block in this partition
    8651     2272130 :             for (int blk_it = 0; blk_it < blk_geom->nsi + 1; blk_it++)
    8652     1374810 :                 tot_cost += context_ptr->md_local_cu_unit[first_blk_idx + blk_it].cost;
    8653             : #if SPEED_OPT
    8654      897317 :             if ((tot_cost + tot_cost * (blk_geom->totns - (blk_geom->nsi + 1))* context_ptr->md_exit_th / (blk_geom->nsi + 1) / 100) > context_ptr->md_local_cu_unit[context_ptr->blk_geom->sqi_mds].cost)
    8655             : #else
    8656             :             if ((tot_cost + tot_cost * (blk_geom->totns - (blk_geom->nsi + 1))* MD_EXIT_THSL / (blk_geom->nsi + 1) / 100) > context_ptr->md_local_cu_unit[context_ptr->blk_geom->sqi_mds].cost)
    8657             : #endif
    8658      447529 :                 skip_next_nsq = 1;
    8659             :         }
    8660             : 
    8661             : #if LESS_RECTANGULAR_CHECK_LEVEL
    8662     1822070 :         if (context_ptr->sq_weight != (uint32_t)~0 && blk_geom->bsize > BLOCK_8X8)
    8663      987922 :             update_skip_next_nsq_for_a_b_shapes(context_ptr, &sq_cost, &h_cost, &v_cost, &skip_next_nsq);
    8664             : #endif
    8665             : 
    8666     1822060 :         if (blk_geom->shape != PART_N) {
    8667     1412650 :             if (blk_geom->nsi + 1 < blk_geom->totns)
    8668      897332 :                 md_update_all_neighbour_arrays(
    8669             :                     picture_control_set_ptr,
    8670             :                     context_ptr,
    8671             :                     blk_idx_mds,
    8672             :                     sb_origin_x,
    8673             :                     sb_origin_y);
    8674             :             else
    8675      515314 :                 copy_neighbour_arrays(      //restore [1] in [0] after done last ns block
    8676             :                     picture_control_set_ptr,
    8677             :                     context_ptr,
    8678             :                     1, 0,
    8679      515314 :                     blk_geom->sqi_mds,
    8680             :                     sb_origin_x,
    8681             :                     sb_origin_y);
    8682             :         }
    8683             : 
    8684             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8685     1822050 :         d1_blocks_accumlated = d1_first_block == 1 ? 1 : d1_blocks_accumlated + 1;
    8686             : #else
    8687             :         d1_blocks_accumlated = blk_geom->shape == PART_N ? 1 : d1_blocks_accumlated + 1;
    8688             : #endif
    8689             : 
    8690     1822050 :         if (d1_blocks_accumlated == leafDataPtr->tot_d1_blocks)
    8691             :         {
    8692      409427 :             uint32_t  lastCuIndex_mds = d2_inter_depth_block_decision(
    8693             :                 context_ptr,
    8694      409427 :                 blk_geom->sqi_mds,//input is parent square
    8695             :                 sb_ptr,
    8696             :                 lcuAddr,
    8697             :                 sb_origin_x,
    8698             :                 sb_origin_y,
    8699      409427 :                 context_ptr->full_lambda,
    8700             :                 context_ptr->md_rate_estimation_ptr,
    8701             :                 picture_control_set_ptr);
    8702             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8703      409386 :             d1_block_itr = 0;
    8704      409386 :             d1_first_block = 1;
    8705             : #endif
    8706     1228180 :             context_ptr->coeff_based_skip_atb = picture_control_set_ptr->parent_pcs_ptr->coeff_based_skip_atb &&
    8707      409405 :                 context_ptr->md_local_cu_unit[lastCuIndex_mds].avail_blk_flag &&
    8708      818791 :                 context_ptr->md_cu_arr_nsq[lastCuIndex_mds].block_has_coeff == 0 ? 1 : 0;
    8709             : 
    8710      409386 :             if (context_ptr->md_cu_arr_nsq[lastCuIndex_mds].split_flag == EB_FALSE)
    8711             :             {
    8712      313066 :                 md_update_all_neighbour_arrays_multiple(
    8713             :                     picture_control_set_ptr,
    8714             :                     context_ptr,
    8715      313066 :                     context_ptr->md_cu_arr_nsq[lastCuIndex_mds].best_d1_blk,
    8716             :                     sb_origin_x,
    8717             :                     sb_origin_y);
    8718             :             }
    8719             :         }
    8720             : #if ADD_SUPPORT_TO_SKIP_PART_N
    8721     1412630 :         else if (d1_first_block)
    8722      114422 :             d1_first_block = 0;
    8723             : #endif
    8724             : 
    8725     1822070 :         if (skip_sub_blocks && leaf_data_array[cuIdx].split_flag) {
    8726           0 :             cuIdx++;
    8727           0 :             while (cuIdx < leaf_count) {
    8728           0 :                 const BlockGeom * next_blk_geom = get_blk_geom_mds(leaf_data_array[cuIdx].mds_idx);
    8729           0 :                 if ((next_blk_geom->origin_x < blk_geom->origin_x + blk_geom->bwidth) && (next_blk_geom->origin_y < blk_geom->origin_y + blk_geom->bheight))
    8730           0 :                     cuIdx++;
    8731             :                 else
    8732             :                     break;
    8733             :             }
    8734             :         }
    8735             :         else
    8736     1822070 :             cuIdx++;
    8737     1822070 :     } while (cuIdx < leaf_count);// End of CU loop
    8738             : 
    8739        7200 :     return return_error;
    8740             : }
    8741             : 
    8742             : static uint32_t tab4x4[256] = { /* 256-entry lookup listed as 16 rows of 16; holds each value 0..255 once. Entry [row * 16 + col] interleaves the bits of col (into even bit positions) with the bits of row (into odd bit positions), i.e. a Z-order style index. NOTE(review): the name suggests a raster-to-Z-order map for 4x4 units of a 64x64 area -- confirm against the users of this table. */
    8743             :     0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, 85, /* row 0 */
    8744             :     2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83, 86, 87, /* row 1 */
    8745             :     8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88, 89, 92, 93, /* row 2 */
    8746             :     10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79, 90, 91, 94, 95, /* row 3 */
    8747             :     32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100, 101, 112, 113, 116, 117, /* row 4 */
    8748             :     34, 35, 38, 39, 50, 51, 54, 55, 98, 99, 102, 103, 114, 115, 118, 119, /* row 5 */
    8749             :     40, 41, 44, 45, 56, 57, 60, 61, 104, 105, 108, 109, 120, 121, 124, 125, /* row 6 */
    8750             :     42, 43, 46, 47, 58, 59, 62, 63, 106, 107, 110, 111, 122, 123, 126, 127, /* row 7 */
    8751             :     128, 129, 132, 133, 144, 145, 148, 149, 192, 193, 196, 197, 208, 209, 212, 213, /* row 8 */
    8752             :     130, 131, 134, 135, 146, 147, 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, /* row 9 */
    8753             :     136, 137, 140, 141, 152, 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, /* row 10 */
    8754             :     138, 139, 142, 143, 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, /* row 11 */
    8755             :     160, 161, 164, 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, /* row 12 */
    8756             :     162, 163, 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, /* row 13 */
    8757             :     168, 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253, /* row 14 */
    8758             :     170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254, 255, /* row 15 */
    8759             : };
    8760             : 
    8761             : static uint32_t tab8x4[128] = { /* 128-entry index table, written as 16 rows of 8 values.
                     :  * Every value 0..127 occurs exactly once. Moving one column to the right
                     :  * adds 2, 8 or 32 (bits 1,3,5); moving one row down adds 1, 4, 16 or 64
                     :  * (bits 0,2,4,6).
                     :  * NOTE(review): presumably maps the raster position of an 8x4 block to the
                     :  * order in which the 8x4 results are stored; usage is not visible in this
                     :  * part of the file - confirm against the callers. */
    8762             :     0, 2, 8, 10, 32, 34, 40, 42,
    8763             :     1, 3, 9, 11, 33, 35, 41, 43,
    8764             :     4, 6, 12, 14, 36, 38, 44, 46,
    8765             :     5, 7, 13, 15, 37, 39, 45, 47,
    8766             :     16, 18, 24, 26, 48, 50, 56, 58,
    8767             :     17, 19, 25, 27, 49, 51, 57, 59,
    8768             :     20, 22, 28, 30, 52, 54, 60, 62,
    8769             :     21, 23, 29, 31, 53, 55, 61, 63,
    8770             :     64, 66, 72, 74, 96, 98, 104, 106,
    8771             :     65, 67, 73, 75, 97, 99, 105, 107,
    8772             :     68, 70, 76, 78, 100, 102, 108, 110,
    8773             :     69, 71, 77, 79, 101, 103, 109, 111,
    8774             :     80, 82, 88, 90, 112, 114, 120, 122,
    8775             :     81, 83, 89, 91, 113, 115, 121, 123,
    8776             :     84, 86, 92, 94, 116, 118, 124, 126,
    8777             :     85, 87, 93, 95, 117, 119, 125, 127
    8778             : };
    8779             : 
    8780             : static uint32_t tab4x8[128] = { /* 128-entry index table, written as 8 rows of 16 values.
                     :  * Every value 0..127 occurs exactly once. Within a row the values run
                     :  * in groups of four consecutive numbers (0-3, 8-11, 32-35, 40-43 in the
                     :  * first row); successive rows add 4, 16, 20, 64, 68, 80 and 84 to the
                     :  * first row.
                     :  * NOTE(review): presumably maps the raster position of a 4x8 block to the
                     :  * order in which the 4x8 results are stored; usage is not visible in this
                     :  * part of the file - confirm against the callers. */
    8781             :     0, 1, 2, 3, 8, 9, 10, 11, 32, 33, 34, 35, 40, 41, 42, 43,
    8782             :     4, 5, 6, 7, 12, 13, 14, 15, 36, 37, 38, 39, 44, 45, 46, 47,
    8783             :     16, 17, 18, 19, 24, 25, 26, 27, 48, 49, 50, 51, 56, 57, 58, 59,
    8784             :     20, 21, 22, 23, 28, 29, 30, 31, 52, 53, 54, 55, 60, 61, 62, 63,
    8785             :     64, 65, 66, 67, 72, 73, 74, 75, 96, 97, 98, 99, 104, 105, 106, 107,
    8786             :     68, 69, 70, 71, 76, 77, 78, 79, 100, 101, 102, 103, 108, 109, 110, 111,
    8787             :     80, 81, 82, 83, 88, 89, 90, 91, 112, 113, 114, 115, 120, 121, 122, 123,
    8788             :     84, 85, 86, 87, 92, 93, 94, 95, 116, 117, 118, 119, 124, 125, 126, 127
    8789             : };
    8790             : 
    8791             : static uint32_t tab16x4[64] = { /* 64-entry index table, written as 16 rows of 4 values.
                     :  * Every value 0..63 occurs exactly once. Columns of the first row are
                     :  * 0, 4, 16, 20; rows come in groups of four consecutive offsets
                     :  * (+0..3, +8..11, +32..35, +40..43).
                     :  * NOTE(review): presumably maps the raster position of a 16x4 block to the
                     :  * order in which the 16x4 results are stored; usage is not visible in
                     :  * this part of the file - confirm against the callers. */
    8792             :      0    ,        4,        16,            20,
    8793             :      1    ,        5,        17,            21,
    8794             :      2    ,        6,        18,            22,
    8795             :      3    ,        7,        19,            23,
    8796             :      8    ,        12,        24,            28,
    8797             :      9    ,        13,        25,            29,
    8798             :      10,        14,        26,            30,
    8799             :      11,        15,        27,            31,
    8800             :      32,        36,        48,            52,
    8801             :      33,        37,        49,            53,
    8802             :      34,        38,        50,            54,
    8803             :      35,        39,        51,            55,
    8804             :      40,        44,        56,            60,
    8805             :      41,        45,        57,            61,
    8806             :      42,        46,        58,            62,
    8807             :      43,        47,        59,            63
    8808             : };
    8809             : static uint32_t tab4x16[64] = { /* 64-entry index table, written as 4 rows of 16 values.
                     :  * Every value 0..63 occurs exactly once. The first row holds the groups
                     :  * 0-3, 8-11, 16-19, 24-27; the following rows add 4, 32 and 36 to it.
                     :  * NOTE(review): presumably maps the raster position of a 4x16 block to the
                     :  * order in which the 4x16 results are stored; usage is not visible in
                     :  * this part of the file - confirm against the callers. */
    8810             :     0,    1,    2,    3,    8,    9,    10,    11,    16,    17,    18,    19,    24,    25,    26,    27,
    8811             :     4,    5,    6,    7,    12,    13,    14,    15,    20,    21,    22,    23,    28,    29,    30,    31,
    8812             :     32,    33,    34,    35,    40,    41,    42,    43,    48,    49,    50,    51,    56,    57,    58,    59,
    8813             :     36,    37,    38,    39,    44,    45,    46,    47,    52,    53,    54,    55,    60,    61,    62,    63
    8814             : };
    8815             : 
    8816             : static uint32_t tab64x16[4] = { // 4-entry identity table: entry i holds i (no reordering).
    8817             :     0,    1,    2,    3,
    8818             : };
    8819             : 
    8820             : static uint32_t tab16x64[4] = { // 4-entry identity table: entry i holds i (no reordering).
    8821             :     0,    1,    2,    3,
    8822             : };
    8823             : 
    8824             : /***************************************************************
    8825             : * in_loop_me_8xN_Nx8_distortion_update
    8826             : *  Compute the distortion at a given position and update
    8827             : *  the best for the supported 8xN and Nx8 blocks
                     : *
                     : *  Combines four consecutive dist_4x4 entries into two 8x4 sums,
                     : *  two 4x8 sums and one 8x8 sum. Each sum is stored in its dist_*
                     : *  array and, when strictly smaller than the stored best, replaces
                     : *  best_dist_* and records curr_mv in best_mv_*.
                     : *
                     : *  Inputs:
                     : *   curr_mv          value written to best_mv_* on a new minimum
                     : *   block_4x4_index  index of the LAST of the four dist_4x4 entries
                     : *                    used; entries [index - 3 .. index] are read.
                     : *                    Unsigned: a value below 3 would wrap and is
                     : *                    not checked here.
                     : *   dist_4x4         distortion values that are summed
                     : *  Outputs (for each of 8x4, 4x8, 8x8):
                     : *   best_mv_*        updated only on a strictly smaller sum
                     : *   best_dist_*      running minimum, read and updated
                     : *   dist_*           sum for the current call, always written
                     : *
                     : *  NOTE(review): the pairing ([-3]+[-2] for 8x4, [-3]+[-1] for 4x8)
                     : *  suggests the four entries are ordered top-left, top-right,
                     : *  bottom-left, bottom-right - confirm against the caller.
    8828             : ***************************************************************/
    8829           0 : static void in_loop_me_8xN_Nx8_distortion_update(
    8830             :     //Inputs
    8831             :     uint32_t  curr_mv,
    8832             :     uint32_t    block_4x4_index,
    8833             :     uint32_t    *dist_4x4,
    8834             :     //Outputs
    8835             :     uint32_t    *best_mv_8x4,
    8836             :     uint32_t    *best_dist_8x4,
    8837             :     uint32_t    *dist_8x4,
    8838             :     uint32_t    *best_mv_4x8,
    8839             :     uint32_t    *best_dist_4x8,
    8840             :     uint32_t    *dist_4x8,
    8841             :     uint32_t    *best_mv_8x8,
    8842             :     uint32_t    *best_dist_8x8,
    8843             :     uint32_t    *dist_8x8)
    8844             : {
    8845             :     uint32_t square_block_index;
    8846             :     uint32_t first_rec_block_index;
    8847             :     uint32_t second_rec_block_index;
    8848             : 
    8849             :     //8x4
    8850           0 :     first_rec_block_index = (block_4x4_index - 3) / 2; // two rectangular entries per group of four 4x4 entries
    8851           0 :     second_rec_block_index = first_rec_block_index + 1;
    8852             : 
    8853           0 :     dist_8x4[first_rec_block_index] = dist_4x4[block_4x4_index - 3] + dist_4x4[block_4x4_index - 2]; // entries 0 + 1 of the group
    8854             : 
    8855           0 :     if (dist_8x4[first_rec_block_index] < best_dist_8x4[first_rec_block_index]) { // strict '<': on a tie the earlier best is kept
    8856           0 :         best_mv_8x4[first_rec_block_index] = curr_mv;
    8857           0 :         best_dist_8x4[first_rec_block_index] = dist_8x4[first_rec_block_index];
    8858             :     }
    8859             : 
    8860           0 :     dist_8x4[second_rec_block_index] = dist_4x4[block_4x4_index - 1] + dist_4x4[block_4x4_index]; // entries 2 + 3 of the group
    8861             : 
    8862           0 :     if (dist_8x4[second_rec_block_index] < best_dist_8x4[second_rec_block_index]) {
    8863           0 :         best_mv_8x4[second_rec_block_index] = curr_mv;
    8864           0 :         best_dist_8x4[second_rec_block_index] = dist_8x4[second_rec_block_index];
    8865             :     }
    8866             : 
    8867             :     //4x8
    8868           0 :     dist_4x8[first_rec_block_index] = dist_4x4[block_4x4_index - 3] + dist_4x4[block_4x4_index - 1]; // entries 0 + 2 of the group
    8869             : 
    8870           0 :     if (dist_4x8[first_rec_block_index] < best_dist_4x8[first_rec_block_index]) {
    8871           0 :         best_mv_4x8[first_rec_block_index] = curr_mv;
    8872           0 :         best_dist_4x8[first_rec_block_index] = dist_4x8[first_rec_block_index];
    8873             :     }
    8874             : 
    8875           0 :     dist_4x8[second_rec_block_index] = dist_4x4[block_4x4_index - 2] + dist_4x4[block_4x4_index]; // entries 1 + 3 of the group
    8876             : 
    8877           0 :     if (dist_4x8[second_rec_block_index] < best_dist_4x8[second_rec_block_index]) {
    8878           0 :         best_mv_4x8[second_rec_block_index] = curr_mv;
    8879           0 :         best_dist_4x8[second_rec_block_index] = dist_4x8[second_rec_block_index];
    8880             :     }
    8881             : 
    8882             :     //8x8
    8883           0 :     square_block_index = (block_4x4_index - 3) / 4; // one square entry per group of four 4x4 entries
    8884             : 
    8885           0 :     dist_8x8[square_block_index] = dist_4x8[first_rec_block_index] + dist_4x8[second_rec_block_index]; // both 4x8 sums = all four 4x4 values
    8886             : 
    8887           0 :     if (dist_8x8[square_block_index] < best_dist_8x8[square_block_index]) {
    8888           0 :         best_mv_8x8[square_block_index] = curr_mv;
    8889           0 :         best_dist_8x8[square_block_index] = dist_8x8[square_block_index];
    8890             :     }
    8891           0 : }
    8892             : /***************************************************************
    8893             : * in_loop_me_16xN_Nx16_distortion_update
    8894             : *  Compute the distortion at a given position and update
    8895             : *  the best for the supported 16xN and Nx16 blocks
                     : *
                     : *  Builds, from already computed 8x4, 4x8 and 8x8 sums:
                     : *   - four 16x4 sums  (pairs of dist_8x4 entries)
                     : *   - four 4x16 sums  (pairs of dist_4x8 entries)
                     : *   - two 16x8 and two 8x16 sums (pairs of dist_8x8 entries)
                     : *   - one 16x16 sum   (both 16x8 sums)
                     : *  Each sum is stored in its dist_* array and, when strictly
                     : *  smaller than the stored best, replaces best_dist_* and records
                     : *  curr_mv in best_mv_*.
                     : *
                     : *  Inputs:
                     : *   curr_mv          value written to best_mv_* on a new minimum
                     : *   block_8x8_index  index of the LAST of the four dist_8x8 entries
                     : *                    used; entries [index - 3 .. index] are read
                     : *   block_4x4_index  only used as (index - 15) >> 1, the first of
                     : *                    eight consecutive dist_8x4 / dist_4x8 entries
                     : *   dist_8x4, dist_4x8, dist_8x8  sums of the smaller shapes (read only)
                     : *  Outputs (for each of 16x4, 16x8, 4x16, 8x16, 16x16):
                     : *   best_mv_*, best_dist_*  running best, updated on a strictly smaller sum
                     : *   dist_*                  sum for the current call, always written
                     : *
                     : *  Both index arguments are unsigned; values below 3 (8x8) or 15 (4x4)
                     : *  would wrap in the subtractions and are not checked here.
    8896             : ***************************************************************/
    8897           0 : static void in_loop_me_16xN_Nx16_distortion_update(
    8898             :     //Inputs
    8899             :     uint32_t  curr_mv,
    8900             :     uint32_t  block_8x8_index,
    8901             :     uint32_t    block_4x4_index,
    8902             :     uint32_t    *dist_8x4,
    8903             :     uint32_t    *dist_4x8,
    8904             :     uint32_t    *dist_8x8,
    8905             :     //Outputs
    8906             :     uint32_t    *best_mv_16x4,
    8907             :     uint32_t    *best_dist_16x4,
    8908             :     uint32_t    *dist_16x4,
    8909             :     uint32_t    *best_mv_16x8,
    8910             :     uint32_t    *best_dist_16x8,
    8911             :     uint32_t    *dist_16x8,
    8912             :     uint32_t    *best_mv_4x16,
    8913             :     uint32_t    *best_dist_4x16,
    8914             :     uint32_t    *dist_4x16,
    8915             :     uint32_t    *best_mv_8x16,
    8916             :     uint32_t    *best_dist_8x16,
    8917             :     uint32_t    *dist_8x16,
    8918             :     uint32_t    *best_mv_16x16,
    8919             :     uint32_t    *best_dist_16x16,
    8920             :     uint32_t    *dist_16x16
    8921             : )
    8922             : {
    8923             :     uint32_t square_block_index;
    8924             :     uint32_t first_rec_block_index;
    8925             :     uint32_t second_rec_block_index;
    8926             :     uint32_t third_rec_block_index;
    8927             :     uint32_t fourth_rec_block_index;
    8928             :     uint32_t start_index;
    8929             :     //16x4
    8930           0 :     first_rec_block_index = (block_8x8_index - 3); // four 16x4 / 4x16 outputs occupy [index - 3 .. index]
    8931           0 :     second_rec_block_index = first_rec_block_index + 1;
    8932           0 :     third_rec_block_index = second_rec_block_index + 1;
    8933           0 :     fourth_rec_block_index = third_rec_block_index + 1;
    8934             : 
    8935           0 :     start_index = (block_4x4_index - 15) >> 1; // first of the eight dist_8x4 / dist_4x8 entries read below
    8936             : 
    8937           0 :     dist_16x4[first_rec_block_index] = dist_8x4[start_index] + dist_8x4[start_index + 2]; // 8x4 entries 0 + 2
    8938             : 
    8939           0 :     if (dist_16x4[first_rec_block_index] < best_dist_16x4[first_rec_block_index]) { // strict '<': on a tie the earlier best is kept
    8940           0 :         best_mv_16x4[first_rec_block_index] = curr_mv;
    8941           0 :         best_dist_16x4[first_rec_block_index] = dist_16x4[first_rec_block_index];
    8942             :     }
    8943             : 
    8944           0 :     dist_16x4[second_rec_block_index] = dist_8x4[start_index + 1] + dist_8x4[start_index + 3]; // 8x4 entries 1 + 3
    8945             : 
    8946           0 :     if (dist_16x4[second_rec_block_index] < best_dist_16x4[second_rec_block_index]) {
    8947           0 :         best_mv_16x4[second_rec_block_index] = curr_mv;
    8948           0 :         best_dist_16x4[second_rec_block_index] = dist_16x4[second_rec_block_index];
    8949             :     }
    8950             : 
    8951           0 :     dist_16x4[third_rec_block_index] = dist_8x4[start_index + 4] + dist_8x4[start_index + 6]; // 8x4 entries 4 + 6
    8952             : 
    8953           0 :     if (dist_16x4[third_rec_block_index] < best_dist_16x4[third_rec_block_index]) {
    8954           0 :         best_mv_16x4[third_rec_block_index] = curr_mv;
    8955           0 :         best_dist_16x4[third_rec_block_index] = dist_16x4[third_rec_block_index];
    8956             :     }
    8957             : 
    8958           0 :     dist_16x4[fourth_rec_block_index] = dist_8x4[start_index + 5] + dist_8x4[start_index + 7]; // 8x4 entries 5 + 7
    8959             : 
    8960           0 :     if (dist_16x4[fourth_rec_block_index] < best_dist_16x4[fourth_rec_block_index]) {
    8961           0 :         best_mv_16x4[fourth_rec_block_index] = curr_mv;
    8962           0 :         best_dist_16x4[fourth_rec_block_index] = dist_16x4[fourth_rec_block_index];
    8963             :     }
    8964             : 
    8965             :     //4x16
    8966             : 
    8967           0 :     dist_4x16[first_rec_block_index] = dist_4x8[start_index] + dist_4x8[start_index + 4]; // 4x8 entries 0 + 4
    8968             : 
    8969           0 :     if (dist_4x16[first_rec_block_index] < best_dist_4x16[first_rec_block_index]) {
    8970           0 :         best_mv_4x16[first_rec_block_index] = curr_mv;
    8971           0 :         best_dist_4x16[first_rec_block_index] = dist_4x16[first_rec_block_index];
    8972             :     }
    8973             : 
    8974           0 :     dist_4x16[second_rec_block_index] = dist_4x8[start_index + 1] + dist_4x8[start_index + 5]; // 4x8 entries 1 + 5
    8975             : 
    8976           0 :     if (dist_4x16[second_rec_block_index] < best_dist_4x16[second_rec_block_index]) {
    8977           0 :         best_mv_4x16[second_rec_block_index] = curr_mv;
    8978           0 :         best_dist_4x16[second_rec_block_index] = dist_4x16[second_rec_block_index];
    8979             :     }
    8980             : 
    8981           0 :     dist_4x16[third_rec_block_index] = dist_4x8[start_index + 2] + dist_4x8[start_index + 6]; // 4x8 entries 2 + 6
    8982             : 
    8983           0 :     if (dist_4x16[third_rec_block_index] < best_dist_4x16[third_rec_block_index]) {
    8984           0 :         best_mv_4x16[third_rec_block_index] = curr_mv;
    8985           0 :         best_dist_4x16[third_rec_block_index] = dist_4x16[third_rec_block_index];
    8986             :     }
    8987             : 
    8988           0 :     dist_4x16[fourth_rec_block_index] = dist_4x8[start_index + 3] + dist_4x8[start_index + 7]; // 4x8 entries 3 + 7
    8989             : 
    8990           0 :     if (dist_4x16[fourth_rec_block_index] < best_dist_4x16[fourth_rec_block_index]) {
    8991           0 :         best_mv_4x16[fourth_rec_block_index] = curr_mv;
    8992           0 :         best_dist_4x16[fourth_rec_block_index] = dist_4x16[fourth_rec_block_index];
    8993             :     }
    8994             : 
    8995             :     //16x8
    8996           0 :     first_rec_block_index = (block_8x8_index - 3) / 2; // re-used: now two rectangular entries per group of four 8x8 entries
    8997           0 :     second_rec_block_index = first_rec_block_index + 1;
    8998             : 
    8999           0 :     dist_16x8[first_rec_block_index] = dist_8x8[block_8x8_index - 3] + dist_8x8[block_8x8_index - 2]; // 8x8 entries 0 + 1 of the group
    9000             : 
    9001           0 :     if (dist_16x8[first_rec_block_index] < best_dist_16x8[first_rec_block_index]) {
    9002           0 :         best_mv_16x8[first_rec_block_index] = curr_mv;
    9003           0 :         best_dist_16x8[first_rec_block_index] = dist_16x8[first_rec_block_index];
    9004             :     }
    9005             : 
    9006           0 :     dist_16x8[second_rec_block_index] = dist_8x8[block_8x8_index - 1] + dist_8x8[block_8x8_index]; // 8x8 entries 2 + 3 of the group
    9007             : 
    9008           0 :     if (dist_16x8[second_rec_block_index] < best_dist_16x8[second_rec_block_index]) {
    9009           0 :         best_mv_16x8[second_rec_block_index] = curr_mv;
    9010           0 :         best_dist_16x8[second_rec_block_index] = dist_16x8[second_rec_block_index];
    9011             :     }
    9012             : 
    9013             :     //8x16
    9014           0 :     dist_8x16[first_rec_block_index] = dist_8x8[block_8x8_index - 3] + dist_8x8[block_8x8_index - 1]; // 8x8 entries 0 + 2 of the group
    9015             : 
    9016           0 :     if (dist_8x16[first_rec_block_index] < best_dist_8x16[first_rec_block_index]) {
    9017           0 :         best_mv_8x16[first_rec_block_index] = curr_mv;
    9018           0 :         best_dist_8x16[first_rec_block_index] = dist_8x16[first_rec_block_index];
    9019             :     }
    9020             : 
    9021           0 :     dist_8x16[second_rec_block_index] = dist_8x8[block_8x8_index - 2] + dist_8x8[block_8x8_index]; // 8x8 entries 1 + 3 of the group
    9022             : 
    9023           0 :     if (dist_8x16[second_rec_block_index] < best_dist_8x16[second_rec_block_index]) {
    9024           0 :         best_mv_8x16[second_rec_block_index] = curr_mv;
    9025           0 :         best_dist_8x16[second_rec_block_index] = dist_8x16[second_rec_block_index];
    9026             :     }
    9027             : 
    9028             :     //16x16
    9029           0 :     square_block_index = (block_8x8_index - 3) / 4; // one square entry per group of four 8x8 entries
    9030             : 
    9031           0 :     dist_16x16[square_block_index] = dist_16x8[first_rec_block_index] + dist_16x8[second_rec_block_index]; // both 16x8 sums = all four 8x8 values
    9032             : 
    9033           0 :     if (dist_16x16[square_block_index] < best_dist_16x16[square_block_index]) {
    9034           0 :         best_mv_16x16[square_block_index] = curr_mv;
    9035           0 :         best_dist_16x16[square_block_index] = dist_16x16[square_block_index];
    9036             :     }
    9037           0 : }
    9038             : /***************************************************************
    9039             : * in_loop_me_32xN_Nx32_distortion_update
    9040             : *  Compute the distortion at a given position and update
    9041             : *  the best for the supported 32xN and Nx32 blocks
                     : *
                     : *  Builds, from already computed 16x8, 8x16 and 16x16 sums:
                     : *   - four 32x8 sums  (pairs of dist_16x8 entries)
                     : *   - four 8x32 sums  (pairs of dist_8x16 entries)
                     : *   - two 32x16 and two 16x32 sums (pairs of dist_16x16 entries)
                     : *   - one 32x32 sum   (both 32x16 sums)
                     : *  Each sum is stored in its dist_* array and, when strictly
                     : *  smaller than the stored best, replaces best_dist_* and records
                     : *  curr_mv in best_mv_*.
                     : *
                     : *  Inputs:
                     : *   curr_mv            value written to best_mv_* on a new minimum
                     : *   block_16x16_index  index of the LAST of the four dist_16x16
                     : *                      entries used; [index - 3 .. index] are read
                     : *   block_8x8_index    only used as (index - 15) >> 1, the first of
                     : *                      eight consecutive dist_16x8 / dist_8x16 entries
                     : *   dist_16x8, dist_8x16, dist_16x16  sums of the smaller shapes (read only)
                     : *  Outputs (for each of 32x8, 32x16, 8x32, 16x32, 32x32):
                     : *   best_mv_*, best_dist_*  running best, updated on a strictly smaller sum
                     : *   dist_*                  sum for the current call, always written
                     : *
                     : *  Both index arguments are unsigned; values below 3 (16x16) or
                     : *  15 (8x8) would wrap in the subtractions and are not checked here.
    9042             : ***************************************************************/
    9043           0 : static void in_loop_me_32xN_Nx32_distortion_update(
    9044             :     //Inputs
    9045             :     uint32_t  curr_mv,
    9046             :     uint32_t  block_16x16_index,
    9047             :     uint32_t    block_8x8_index,
    9048             :     uint32_t    *dist_16x8,
    9049             :     uint32_t    *dist_8x16,
    9050             :     uint32_t    *dist_16x16,
    9051             :     //Outputs
    9052             :     uint32_t    *best_mv_32x8,
    9053             :     uint32_t    *best_dist_32x8,
    9054             :     uint32_t    *dist_32x8,
    9055             :     uint32_t    *best_mv_32x16,
    9056             :     uint32_t    *best_dist_32x16,
    9057             :     uint32_t    *dist_32x16,
    9058             :     uint32_t    *best_mv_8x32,
    9059             :     uint32_t    *best_dist_8x32,
    9060             :     uint32_t    *dist_8x32,
    9061             :     uint32_t    *best_mv_16x32,
    9062             :     uint32_t    *best_dist_16x32,
    9063             :     uint32_t    *dist_16x32,
    9064             :     uint32_t    *best_mv_32x32,
    9065             :     uint32_t    *best_dist_32x32,
    9066             :     uint32_t    *dist_32x32
    9067             : )
    9068             : {
    9069             :     uint32_t square_block_index;
    9070             :     uint32_t first_rec_block_index;
    9071             :     uint32_t second_rec_block_index;
    9072             :     uint32_t third_rec_block_index;
    9073             :     uint32_t fourth_rec_block_index;
    9074             :     uint32_t start_index;
    9075             : 
    9076             :     //32x8
    9077           0 :     first_rec_block_index = (block_16x16_index - 3); // four 32x8 / 8x32 outputs occupy [index - 3 .. index]
    9078           0 :     second_rec_block_index = first_rec_block_index + 1;
    9079           0 :     third_rec_block_index = second_rec_block_index + 1;
    9080           0 :     fourth_rec_block_index = third_rec_block_index + 1;
    9081             : 
    9082           0 :     start_index = (block_8x8_index - 15) >> 1; // first of the eight dist_16x8 / dist_8x16 entries read below
    9083             : 
    9084           0 :     dist_32x8[first_rec_block_index] = dist_16x8[start_index] + dist_16x8[start_index + 2]; // 16x8 entries 0 + 2
    9085             : 
    9086           0 :     if (dist_32x8[first_rec_block_index] < best_dist_32x8[first_rec_block_index]) { // strict '<': on a tie the earlier best is kept
    9087           0 :         best_mv_32x8[first_rec_block_index] = curr_mv;
    9088           0 :         best_dist_32x8[first_rec_block_index] = dist_32x8[first_rec_block_index];
    9089             :     }
    9090             : 
    9091           0 :     dist_32x8[second_rec_block_index] = dist_16x8[start_index + 1] + dist_16x8[start_index + 3]; // 16x8 entries 1 + 3
    9092             : 
    9093           0 :     if (dist_32x8[second_rec_block_index] < best_dist_32x8[second_rec_block_index]) {
    9094           0 :         best_mv_32x8[second_rec_block_index] = curr_mv;
    9095           0 :         best_dist_32x8[second_rec_block_index] = dist_32x8[second_rec_block_index];
    9096             :     }
    9097             : 
    9098           0 :     dist_32x8[third_rec_block_index] = dist_16x8[start_index + 4] + dist_16x8[start_index + 6]; // 16x8 entries 4 + 6
    9099             : 
    9100           0 :     if (dist_32x8[third_rec_block_index] < best_dist_32x8[third_rec_block_index]) {
    9101           0 :         best_mv_32x8[third_rec_block_index] = curr_mv;
    9102           0 :         best_dist_32x8[third_rec_block_index] = dist_32x8[third_rec_block_index];
    9103             :     }
    9104             : 
    9105           0 :     dist_32x8[fourth_rec_block_index] = dist_16x8[start_index + 5] + dist_16x8[start_index + 7]; // 16x8 entries 5 + 7
    9106             : 
    9107           0 :     if (dist_32x8[fourth_rec_block_index] < best_dist_32x8[fourth_rec_block_index]) {
    9108           0 :         best_mv_32x8[fourth_rec_block_index] = curr_mv;
    9109           0 :         best_dist_32x8[fourth_rec_block_index] = dist_32x8[fourth_rec_block_index];
    9110             :     }
    9111             : 
    9112             :     //8x32
    9113             : 
    9114           0 :     dist_8x32[first_rec_block_index] = dist_8x16[start_index] + dist_8x16[start_index + 4]; // 8x16 entries 0 + 4
    9115             : 
    9116           0 :     if (dist_8x32[first_rec_block_index] < best_dist_8x32[first_rec_block_index]) {
    9117           0 :         best_mv_8x32[first_rec_block_index] = curr_mv;
    9118           0 :         best_dist_8x32[first_rec_block_index] = dist_8x32[first_rec_block_index];
    9119             :     }
    9120             : 
    9121           0 :     dist_8x32[second_rec_block_index] = dist_8x16[start_index + 1] + dist_8x16[start_index + 5]; // 8x16 entries 1 + 5
    9122             : 
    9123           0 :     if (dist_8x32[second_rec_block_index] < best_dist_8x32[second_rec_block_index]) {
    9124           0 :         best_mv_8x32[second_rec_block_index] = curr_mv;
    9125           0 :         best_dist_8x32[second_rec_block_index] = dist_8x32[second_rec_block_index];
    9126             :     }
    9127             : 
    9128           0 :     dist_8x32[third_rec_block_index] = dist_8x16[start_index + 2] + dist_8x16[start_index + 6]; // 8x16 entries 2 + 6
    9129             : 
    9130           0 :     if (dist_8x32[third_rec_block_index] < best_dist_8x32[third_rec_block_index]) {
    9131           0 :         best_mv_8x32[third_rec_block_index] = curr_mv;
    9132           0 :         best_dist_8x32[third_rec_block_index] = dist_8x32[third_rec_block_index];
    9133             :     }
    9134             : 
    9135           0 :     dist_8x32[fourth_rec_block_index] = dist_8x16[start_index + 3] + dist_8x16[start_index + 7]; // 8x16 entries 3 + 7
    9136             : 
    9137           0 :     if (dist_8x32[fourth_rec_block_index] < best_dist_8x32[fourth_rec_block_index]) {
    9138           0 :         best_mv_8x32[fourth_rec_block_index] = curr_mv;
    9139           0 :         best_dist_8x32[fourth_rec_block_index] = dist_8x32[fourth_rec_block_index];
    9140             :     }
    9141             : 
    9142             :     //32x16
    9143           0 :     first_rec_block_index = (block_16x16_index - 3) / 2; // re-used: now two rectangular entries per group of four 16x16 entries
    9144           0 :     second_rec_block_index = first_rec_block_index + 1;
    9145             : 
    9146           0 :     dist_32x16[first_rec_block_index] = dist_16x16[block_16x16_index - 3] + dist_16x16[block_16x16_index - 2]; // 16x16 entries 0 + 1 of the group
    9147             : 
    9148           0 :     if (dist_32x16[first_rec_block_index] < best_dist_32x16[first_rec_block_index]) {
    9149           0 :         best_mv_32x16[first_rec_block_index] = curr_mv;
    9150           0 :         best_dist_32x16[first_rec_block_index] = dist_32x16[first_rec_block_index];
    9151             :     }
    9152             : 
    9153           0 :     dist_32x16[second_rec_block_index] = dist_16x16[block_16x16_index - 1] + dist_16x16[block_16x16_index]; // 16x16 entries 2 + 3 of the group
    9154             : 
    9155           0 :     if (dist_32x16[second_rec_block_index] < best_dist_32x16[second_rec_block_index]) {
    9156           0 :         best_mv_32x16[second_rec_block_index] = curr_mv;
    9157           0 :         best_dist_32x16[second_rec_block_index] = dist_32x16[second_rec_block_index];
    9158             :     }
    9159             : 
    9160             :     //16x32
    9161           0 :     dist_16x32[first_rec_block_index] = dist_16x16[block_16x16_index - 3] + dist_16x16[block_16x16_index - 1]; // 16x16 entries 0 + 2 of the group
    9162             : 
    9163           0 :     if (dist_16x32[first_rec_block_index] < best_dist_16x32[first_rec_block_index]) {
    9164           0 :         best_mv_16x32[first_rec_block_index] = curr_mv;
    9165           0 :         best_dist_16x32[first_rec_block_index] = dist_16x32[first_rec_block_index];
    9166             :     }
    9167             : 
    9168           0 :     dist_16x32[second_rec_block_index] = dist_16x16[block_16x16_index - 2] + dist_16x16[block_16x16_index]; // 16x16 entries 1 + 3 of the group
    9169             : 
    9170           0 :     if (dist_16x32[second_rec_block_index] < best_dist_16x32[second_rec_block_index]) {
    9171           0 :         best_mv_16x32[second_rec_block_index] = curr_mv;
    9172           0 :         best_dist_16x32[second_rec_block_index] = dist_16x32[second_rec_block_index];
    9173             :     }
    9174             : 
    9175             :     //32x32
    9176           0 :     square_block_index = (block_16x16_index - 3) / 4; // one square entry per group of four 16x16 entries
    9177             : 
    9178           0 :     dist_32x32[square_block_index] = dist_32x16[first_rec_block_index] + dist_32x16[second_rec_block_index]; // both 32x16 sums = all four 16x16 values
    9179             : 
    9180           0 :     if (dist_32x32[square_block_index] < best_dist_32x32[square_block_index]) {
    9181           0 :         best_mv_32x32[square_block_index] = curr_mv;
    9182           0 :         best_dist_32x32[square_block_index] = dist_32x32[square_block_index];
    9183             :     }
    9184           0 : }
    9185             : /***************************************************************
    9186             : * in_loop_me_64xN_Nx64_distortion_update
    9187             : *  Compute the distortion at a given position and update
    9188             : *  the best for the supported 64xN and Nx64 blocks
    9189             : ***************************************************************/
    9190           0 : static void in_loop_me_64xN_Nx64_distortion_update(
    9191             :     uint32_t     curr_mv,
    9192             :     uint32_t     block_32x32_index,
    9193             :     uint32_t     block_16x16_index,
    9194             :     uint32_t    *dist_32x16,
    9195             :     uint32_t    *dist_16x32,
    9196             :     uint32_t    *dist_32x32,
    9197             :     uint32_t    *best_mv_64x16,
    9198             :     uint32_t    *best_dist_64x16,
    9199             :     uint32_t    *dist_64x16,
    9200             :     uint32_t    *best_mv_64x32,
    9201             :     uint32_t    *best_dist_64x32,
    9202             :     uint32_t    *dist_64x32,
    9203             :     uint32_t    *best_mv_16x64,
    9204             :     uint32_t    *best_dist_16x64,
    9205             :     uint32_t    *dist_16x64,
    9206             :     uint32_t    *best_mv_32x64,
    9207             :     uint32_t    *best_dist_32x64,
    9208             :     uint32_t    *dist_32x64,
    9209             :     uint32_t    *best_mv_64x64,
    9210             :     uint32_t    *best_dist_64x64,
    9211             :     uint32_t    *dist_64x64)
    9212             : {
    9213             :     uint32_t square_block_index;
    9214             :     uint32_t first_rec_block_index;
    9215             :     uint32_t second_rec_block_index;
    9216             :     uint32_t third_rec_block_index;
    9217             :     uint32_t fourth_rec_block_index;
    9218             :     uint32_t start_index;
    9219             :     UNUSED(dist_64x32);
    9220             :     UNUSED(dist_32x64);
    9221             :     //64x16
    9222           0 :     first_rec_block_index = (block_32x32_index - 3);
    9223           0 :     second_rec_block_index = first_rec_block_index + 1;
    9224           0 :     third_rec_block_index = second_rec_block_index + 1;
    9225           0 :     fourth_rec_block_index = third_rec_block_index + 1;
    9226             : 
    9227           0 :     start_index = (block_16x16_index - 15) >> 1;
    9228             : 
    9229           0 :     dist_64x16[first_rec_block_index] = dist_32x16[start_index] + dist_32x16[start_index + 2];
    9230             : 
    9231           0 :     if (dist_64x16[first_rec_block_index] < best_dist_64x16[first_rec_block_index]) {
    9232           0 :         best_mv_64x16[first_rec_block_index] = curr_mv;
    9233           0 :         best_dist_64x16[first_rec_block_index] = dist_64x16[first_rec_block_index];
    9234             :     }
    9235             : 
    9236           0 :     dist_64x16[second_rec_block_index] = dist_32x16[start_index + 1] + dist_32x16[start_index + 3];
    9237             : 
    9238           0 :     if (dist_64x16[second_rec_block_index] < best_dist_64x16[second_rec_block_index]) {
    9239           0 :         best_mv_64x16[second_rec_block_index] = curr_mv;
    9240           0 :         best_dist_64x16[second_rec_block_index] = dist_64x16[second_rec_block_index];
    9241             :     }
    9242             : 
    9243           0 :     dist_64x16[third_rec_block_index] = dist_32x16[start_index + 4] + dist_32x16[start_index + 6];
    9244             : 
    9245           0 :     if (dist_64x16[third_rec_block_index] < best_dist_64x16[third_rec_block_index]) {
    9246           0 :         best_mv_64x16[third_rec_block_index] = curr_mv;
    9247           0 :         best_dist_64x16[third_rec_block_index] = dist_64x16[third_rec_block_index];
    9248             :     }
    9249             : 
    9250           0 :     dist_64x16[fourth_rec_block_index] = dist_32x16[start_index + 5] + dist_32x16[start_index + 7];
    9251             : 
    9252           0 :     if (dist_64x16[fourth_rec_block_index] < best_dist_64x16[fourth_rec_block_index]) {
    9253           0 :         best_mv_64x16[fourth_rec_block_index] = curr_mv;
    9254           0 :         best_dist_64x16[fourth_rec_block_index] = dist_64x16[fourth_rec_block_index];
    9255             :     }
    9256             : 
    9257             :     //16x64
    9258             : 
    9259           0 :     dist_16x64[first_rec_block_index] = dist_16x32[start_index] + dist_16x32[start_index + 4];
    9260             : 
    9261           0 :     if (dist_16x64[first_rec_block_index] < best_dist_16x64[first_rec_block_index]) {
    9262           0 :         best_mv_16x64[first_rec_block_index] = curr_mv;
    9263           0 :         best_dist_16x64[first_rec_block_index] = dist_16x64[first_rec_block_index];
    9264             :     }
    9265             : 
    9266           0 :     dist_16x64[second_rec_block_index] = dist_16x32[start_index + 1] + dist_16x32[start_index + 5];
    9267             : 
    9268           0 :     if (dist_16x64[second_rec_block_index] < best_dist_16x64[second_rec_block_index]) {
    9269           0 :         best_mv_16x64[second_rec_block_index] = curr_mv;
    9270           0 :         best_dist_16x64[second_rec_block_index] = dist_16x64[second_rec_block_index];
    9271             :     }
    9272             : 
    9273           0 :     dist_16x64[third_rec_block_index] = dist_16x32[start_index + 2] + dist_16x32[start_index + 6];
    9274             : 
    9275           0 :     if (dist_16x64[third_rec_block_index] < best_dist_16x64[third_rec_block_index]) {
    9276           0 :         best_mv_16x64[third_rec_block_index] = curr_mv;
    9277           0 :         best_dist_16x64[third_rec_block_index] = dist_16x64[third_rec_block_index];
    9278             :     }
    9279             : 
    9280           0 :     dist_16x64[fourth_rec_block_index] = dist_16x32[start_index + 3] + dist_16x32[start_index + 7];
    9281             : 
    9282           0 :     if (dist_16x64[fourth_rec_block_index] < best_dist_16x64[fourth_rec_block_index]) {
    9283           0 :         best_mv_16x64[fourth_rec_block_index] = curr_mv;
    9284           0 :         best_dist_16x64[fourth_rec_block_index] = dist_16x64[fourth_rec_block_index];
    9285             :     }
    9286             : 
    9287             :     //64x32
    9288           0 :     first_rec_block_index = (block_32x32_index - 3) / 2;
    9289           0 :     second_rec_block_index = first_rec_block_index + 1;
    9290             : 
    9291           0 :     dist_64x32[first_rec_block_index] = dist_32x32[block_32x32_index - 3] + dist_32x32[block_32x32_index - 2];
    9292             : 
    9293           0 :     if (dist_64x32[first_rec_block_index] < best_dist_64x32[first_rec_block_index]) {
    9294           0 :         best_mv_64x32[first_rec_block_index] = curr_mv;
    9295           0 :         best_dist_64x32[first_rec_block_index] = dist_64x32[first_rec_block_index];
    9296             :     }
    9297             : 
    9298           0 :     dist_64x32[second_rec_block_index] = dist_32x32[block_32x32_index - 1] + dist_32x32[block_32x32_index];
    9299             : 
    9300           0 :     if (dist_64x32[second_rec_block_index] < best_dist_64x32[second_rec_block_index]) {
    9301           0 :         best_mv_64x32[second_rec_block_index] = curr_mv;
    9302           0 :         best_dist_64x32[second_rec_block_index] = dist_64x32[second_rec_block_index];
    9303             :     }
    9304             : 
    9305             :     //32x64
    9306           0 :     dist_32x64[first_rec_block_index] = dist_32x32[block_32x32_index - 3] + dist_32x32[block_32x32_index - 1];
    9307             : 
    9308           0 :     if (dist_32x64[first_rec_block_index] < best_dist_32x64[first_rec_block_index]) {
    9309           0 :         best_mv_32x64[first_rec_block_index] = curr_mv;
    9310           0 :         best_dist_32x64[first_rec_block_index] = dist_32x64[first_rec_block_index];
    9311             :     }
    9312             : 
    9313           0 :     dist_32x64[second_rec_block_index] = dist_32x32[block_32x32_index - 2] + dist_32x32[block_32x32_index];
    9314             : 
    9315           0 :     if (dist_32x64[second_rec_block_index] < best_dist_32x64[second_rec_block_index]) {
    9316           0 :         best_mv_32x64[second_rec_block_index] = curr_mv;
    9317           0 :         best_dist_32x64[second_rec_block_index] = dist_32x64[second_rec_block_index];
    9318             :     }
    9319             : 
    9320             :     //64x64
    9321           0 :     square_block_index = (block_32x32_index - 3) / 4;
    9322             : 
    9323           0 :     dist_64x64[square_block_index] = dist_64x32[first_rec_block_index] + dist_64x32[second_rec_block_index];
    9324             : 
    9325           0 :     if (dist_64x64[square_block_index] < best_dist_64x64[square_block_index]) {
    9326           0 :         best_mv_64x64[square_block_index] = curr_mv;
    9327           0 :         best_dist_64x64[square_block_index] = dist_64x64[square_block_index];
    9328             :     }
    9329           0 : }
    9330             : 
    9331             : /***************************************************************
    9332             : * in_loop_me_128xN_Nx128_distortion_update
    9333             : *  Combine the four 64x64 distortions stored at dist_64x64[block_64x64_index - 3 .. block_64x64_index] into the 128x64, 64x128 and 128x128 distortions of the candidate curr_mv,
    9334             : *  and store curr_mv and the new distortion in the matching best_mv_* / best_dist_* entry whenever the new distortion is strictly smaller than the stored best
    9335             : ***************************************************************/
    9336           0 : static void in_loop_me_128xN_Nx128_distortion_update(
    9337             :     uint32_t     curr_mv, // input: candidate value copied into best_mv_* on improvement
    9338             :     uint32_t     block_64x64_index, // input: index of the last of the four dist_64x64 entries that are combined
    9339             :     uint32_t     block_32x32_index, // unused
    9340             :     uint32_t    *dist_64x32, // unused
    9341             :     uint32_t    *dist_32x64, // unused
    9342             :     uint32_t    *dist_64x64, // input: 64x64 distortions, read only
    9343             :     uint32_t    *best_mv_128x64, // in/out: best candidate per 128x64 block
    9344             :     uint32_t    *best_dist_128x64, // in/out: best distortion per 128x64 block
    9345             :     uint32_t    *dist_128x64, // output: distortion of the current candidate per 128x64 block
    9346             :     uint32_t    *best_mv_64x128, // in/out: best candidate per 64x128 block
    9347             :     uint32_t    *best_dist_64x128, // in/out: best distortion per 64x128 block
    9348             :     uint32_t    *dist_64x128, // output: distortion of the current candidate per 64x128 block
    9349             :     uint32_t    *best_mv_128x128, // in/out: best candidate for the 128x128 block
    9350             :     uint32_t    *best_dist_128x128, // in/out: best distortion for the 128x128 block
    9351             :     uint32_t    *dist_128x128 // output: single value, distortion of the current candidate for the 128x128 block
    9352             : )
    9353             : {
    9354             :     uint32_t square_block_index;
    9355             :     uint32_t first_rec_block_index;
    9356             :     uint32_t second_rec_block_index;
    9357             :     UNUSED(block_32x32_index);
    9358             :     UNUSED(dist_64x32);
    9359             :     UNUSED(dist_32x64);
    9360             :     //128x64
    9361           0 :     first_rec_block_index = (block_64x64_index - 3) / 4; // unsigned arithmetic: wraps if block_64x64_index < 3; the caller in this file passes 3, giving 0
    9362           0 :     second_rec_block_index = first_rec_block_index + 1;
    9363             : 
    9364           0 :     dist_128x64[first_rec_block_index] = dist_64x64[block_64x64_index - 3] + dist_64x64[block_64x64_index - 2]; // first pair of 64x64 blocks (with the caller's x + 2*y indexing: the y = 0 row)
    9365             : 
    9366           0 :     if (dist_128x64[first_rec_block_index] < best_dist_128x64[first_rec_block_index]) {
    9367           0 :         best_mv_128x64[first_rec_block_index] = curr_mv;
    9368           0 :         best_dist_128x64[first_rec_block_index] = dist_128x64[first_rec_block_index];
    9369             :     }
    9370             : 
    9371           0 :     dist_128x64[second_rec_block_index] = dist_64x64[block_64x64_index - 1] + dist_64x64[block_64x64_index]; // second pair of 64x64 blocks (the y = 1 row)
    9372             : 
    9373           0 :     if (dist_128x64[second_rec_block_index] < best_dist_128x64[second_rec_block_index]) {
    9374           0 :         best_mv_128x64[second_rec_block_index] = curr_mv;
    9375           0 :         best_dist_128x64[second_rec_block_index] = dist_128x64[second_rec_block_index];
    9376             :     }
    9377             : 
    9378             :     //64x128 - reuses the two rectangle indices computed for 128x64
    9379           0 :     dist_64x128[first_rec_block_index] = dist_64x64[block_64x64_index - 3] + dist_64x64[block_64x64_index - 1]; // entries two apart (with the caller's indexing: the x = 0 column)
    9380             : 
    9381           0 :     if (dist_64x128[first_rec_block_index] < best_dist_64x128[first_rec_block_index]) {
    9382           0 :         best_mv_64x128[first_rec_block_index] = curr_mv;
    9383           0 :         best_dist_64x128[first_rec_block_index] = dist_64x128[first_rec_block_index];
    9384             :     }
    9385             : 
    9386           0 :     dist_64x128[second_rec_block_index] = dist_64x64[block_64x64_index - 2] + dist_64x64[block_64x64_index]; // the x = 1 column
    9387             : 
    9388           0 :     if (dist_64x128[second_rec_block_index] < best_dist_64x128[second_rec_block_index]) {
    9389           0 :         best_mv_64x128[second_rec_block_index] = curr_mv;
    9390           0 :         best_dist_64x128[second_rec_block_index] = dist_64x128[second_rec_block_index];
    9391             :     }
    9392             : 
    9393             :     //128x128
    9394           0 :     square_block_index = (block_64x64_index - 3) / 4; // same value as first_rec_block_index
    9395             : 
    9396           0 :     *dist_128x128 = dist_128x64[first_rec_block_index] + dist_128x64[second_rec_block_index]; // sum of the two 128x64 halves, i.e. of all four 64x64 entries
    9397             : 
    9398           0 :     if (*dist_128x128 < best_dist_128x128[square_block_index]) {
    9399           0 :         best_mv_128x128[square_block_index] = curr_mv;
    9400           0 :         best_dist_128x128[square_block_index] = *dist_128x128;
    9401             :     }
    9402           0 : }
    9403             : /***************************************************************
    9404             : * in_loop_me_get_search_point_results_block
    9405             : *  Compute the distortion at a given position: every 4x4 block is measured with eb_sad_kernel4x4, then the *_distortion_update helpers are called per 8x8, 16x16, 32x32 and 64x64 block (and once for 128x128 when number_of_sb_quad > 1) to fill the larger-block distortions and the best_* arrays of context_ptr
    9406             : ***************************************************************/
    9407             : 
    9408           0 : static void in_loop_me_get_search_point_results_block(
    9409             :     SsMeContext            *context_ptr,                    // input parameter, ME context Ptr, used to get SB Ptr
    9410             :     uint32_t                   list_index,                      // input parameter, reference list index
    9411             :     uint32_t                   ref_index, // input parameter, offset added to every block address inside the reference buffer
    9412             :     int32_t                   x_search_index,                  // input parameter, search region position in the horizontal direction, used to derive xMV
    9413             :     int32_t                   y_search_index,                  // input parameter, search region position in the vertical direction, used to derive yMV
    9414             :     uint32_t                   number_of_sb_quad) // input parameter, > 1 scans a 2x2 arrangement of 64x64 blocks and runs the 128xN update, otherwise a single 64x64 block is scanned
    9415             : {
    9416           0 :     uint8_t  *src_ptr = context_ptr->sb_buffer;
    9417             : 
    9418             :     // Reference pointer: skips (ME_FILTER_TAP >> 1) samples and (ME_FILTER_TAP >> 1) rows into integer_buffer_ptr[list_index][0]
    9419           0 :     uint8_t   *ref_ptr = context_ptr->integer_buffer_ptr[list_index][0] + (ME_FILTER_TAP >> 1) + ((ME_FILTER_TAP >> 1) * context_ptr->interpolated_full_stride[list_index][0]);
    9420           0 :     uint32_t   ref_luma_stride = context_ptr->interpolated_full_stride[list_index][0];
    9421           0 :     uint32_t   curr_mv_1 = (((uint16_t)y_search_index) << 18); // NOTE(review): the uint16_t operand is promoted to int before the shift; with a 32-bit int, values >= 0x2000 (e.g. a negative y_search_index) overflow the signed shift - confirm whether a uint32_t cast is wanted
    9422           0 :     uint16_t   curr_mv_2 = (((uint16_t)x_search_index << 2)); // x shifted left by 2 and truncated to 16 bits
    9423           0 :     uint32_t   curr_mv = curr_mv_1 | curr_mv_2; // packed candidate: y part in the upper bits, x part in the lower 16 bits
    9424           0 :     uint32_t  *best_dist_4x4 = context_ptr->p_best_sad4x4; // below: local aliases for the per-block-size best distortion, best MV and current distortion arrays of the context
    9425           0 :     uint32_t  *best_mv_4x4 = context_ptr->p_best_mv4x4;
    9426           0 :     uint32_t  *dist_4x4 = context_ptr->p_sad4x4;
    9427           0 :     uint32_t  *best_dist_8x4 = context_ptr->p_best_sad8x4;
    9428           0 :     uint32_t  *best_mv_8x4 = context_ptr->p_best_mv8x4;
    9429           0 :     uint32_t  *dist_8x4 = context_ptr->p_sad8x4;
    9430           0 :     uint32_t  *best_dist_4x8 = context_ptr->p_best_sad4x8;
    9431           0 :     uint32_t  *best_mv_4x8 = context_ptr->p_best_mv4x8;
    9432           0 :     uint32_t  *dist_4x8 = context_ptr->p_sad4x8;
    9433           0 :     uint32_t  *best_dist_8x8 = context_ptr->p_best_sad8x8;
    9434           0 :     uint32_t  *best_mv_8x8 = context_ptr->p_best_mv8x8;
    9435           0 :     uint32_t  *dist_8x8 = context_ptr->p_sad8x8;
    9436           0 :     uint32_t  *best_dist_16x16 = context_ptr->p_best_sad16x16;
    9437           0 :     uint32_t  *best_mv_16x16 = context_ptr->p_best_mv16x16;
    9438           0 :     uint32_t  *dist_16x16 = context_ptr->p_sad16x16;
    9439           0 :     uint32_t  *best_dist_16x8 = context_ptr->p_best_sad16x8;
    9440           0 :     uint32_t  *best_mv_16x8 = context_ptr->p_best_mv16x8;
    9441           0 :     uint32_t  *dist_16x8 = context_ptr->p_sad16x8;
    9442           0 :     uint32_t  *best_dist_16x4 = context_ptr->p_best_sad16x4;
    9443           0 :     uint32_t  *best_mv_16x4 = context_ptr->p_best_mv16x4;
    9444           0 :     uint32_t  *dist_16x4 = context_ptr->p_sad16x4;
    9445           0 :     uint32_t  *best_dist_8x16 = context_ptr->p_best_sad8x16;
    9446           0 :     uint32_t  *best_mv_8x16 = context_ptr->p_best_mv8x16;
    9447           0 :     uint32_t  *dist_8x16 = context_ptr->p_sad8x16;
    9448           0 :     uint32_t  *best_dist_4x16 = context_ptr->p_best_sad4x16;
    9449           0 :     uint32_t  *best_mv_4x16 = context_ptr->p_best_mv4x16;
    9450           0 :     uint32_t  *dist_4x16 = context_ptr->p_sad4x16;
    9451           0 :     uint32_t  *best_dist_32x8 = context_ptr->p_best_sad32x8;
    9452           0 :     uint32_t  *best_mv_32x8 = context_ptr->p_best_mv32x8;
    9453           0 :     uint32_t  *dist_32x8 = context_ptr->p_sad32x8;
    9454           0 :     uint32_t  *best_dist_32x16 = context_ptr->p_best_sad32x16;
    9455           0 :     uint32_t  *best_mv_32x16 = context_ptr->p_best_mv32x16;
    9456           0 :     uint32_t  *dist_32x16 = context_ptr->p_sad32x16;
    9457           0 :     uint32_t  *best_dist_16x32 = context_ptr->p_best_sad16x32;
    9458           0 :     uint32_t  *best_mv_16x32 = context_ptr->p_best_mv16x32;
    9459           0 :     uint32_t  *dist_16x32 = context_ptr->p_sad16x32;
    9460           0 :     uint32_t  *best_dist_8x32 = context_ptr->p_best_sad8x32;
    9461           0 :     uint32_t  *best_mv_8x32 = context_ptr->p_best_mv8x32;
    9462           0 :     uint32_t  *dist_8x32 = context_ptr->p_sad8x32;
    9463           0 :     uint32_t  *best_dist_32x32 = context_ptr->p_best_sad32x32;
    9464           0 :     uint32_t  *best_mv_32x32 = context_ptr->p_best_mv32x32;
    9465           0 :     uint32_t  *dist_32x32 = context_ptr->p_sad32x32;
    9466           0 :     uint32_t  *best_dist_64x16 = context_ptr->p_best_sad64x16;
    9467           0 :     uint32_t  *best_mv_64x16 = context_ptr->p_best_mv64x16;
    9468           0 :     uint32_t  *dist_64x16 = context_ptr->p_sad64x16;
    9469           0 :     uint32_t  *best_dist_64x32 = context_ptr->p_best_sad64x32;
    9470           0 :     uint32_t  *best_mv_64x32 = context_ptr->p_best_mv64x32;
    9471           0 :     uint32_t  *dist_64x32 = context_ptr->p_sad64x32;
    9472           0 :     uint32_t  *best_dist_32x64 = context_ptr->p_best_sad32x64;
    9473           0 :     uint32_t  *best_mv_32x64 = context_ptr->p_best_mv32x64;
    9474           0 :     uint32_t  *dist_32x64 = context_ptr->p_sad32x64;
    9475           0 :     uint32_t  *best_dist_16x64 = context_ptr->p_best_sad16x64;
    9476           0 :     uint32_t  *best_mv_16x64 = context_ptr->p_best_mv16x64;
    9477           0 :     uint32_t  *dist_16x64 = context_ptr->p_sad16x64;
    9478           0 :     uint32_t  *best_dist_64x64 = context_ptr->p_best_sad64x64;
    9479           0 :     uint32_t  *best_mv_64x64 = context_ptr->p_best_mv64x64;
    9480           0 :     uint32_t  *dist_64x64 = context_ptr->p_sad64x64;
    9481           0 :     uint32_t  *best_dist_128x64 = context_ptr->p_best_sad128x64;
    9482           0 :     uint32_t  *best_mv_128x64 = context_ptr->p_best_mv128x64;
    9483           0 :     uint32_t  *dist_128x64 = context_ptr->p_sad128x64;
    9484           0 :     uint32_t  *best_dist_64x128 = context_ptr->p_best_sad64x128;
    9485           0 :     uint32_t  *best_mv_64x128 = context_ptr->p_best_mv64x128;
    9486           0 :     uint32_t  *dist_64x128 = context_ptr->p_sad64x128;
    9487           0 :     uint32_t  *best_dist_128x128 = context_ptr->p_best_sad128x128;
    9488           0 :     uint32_t  *best_mv_128x128 = context_ptr->p_best_mv128x128;
    9489           0 :     uint32_t  dist_128x128 = context_ptr->p_sad128x128; // NOTE(review): local copy by value; the result written through &dist_128x128 below is not stored back into context_ptr by this function - confirm this is intended
    9490           0 :     const uint32_t  src_stride = context_ptr->sb_buffer_stride;
    9491           0 :     uint32_t block_64x64_index = 0;
    9492           0 :     uint32_t block_32x32_index = 0;
    9493             :     uint32_t block_16x16_index;
    9494             :     uint32_t block_8x8_index;
    9495             :     uint32_t block_4x4_index;
    9496             :     uint32_t block_64x64_x;
    9497             :     uint32_t block_32x32_x;
    9498             :     uint32_t block_16x16_x;
    9499             :     uint32_t block_8x8_x;
    9500             :     uint32_t block_4x4_x;
    9501             :     uint32_t block_64x64_y;
    9502             :     uint32_t block_32x32_y;
    9503             :     uint32_t block_16x16_y;
    9504             :     uint32_t block_8x8_y;
    9505             :     uint32_t block_4x4_y;
    9506           0 :     uint32_t quad_offset = number_of_sb_quad > 1 ? 2 : 1; // number of 64x64 blocks scanned per direction
    9507             : 
    9508           0 :     for (block_64x64_y = 0; block_64x64_y < quad_offset; block_64x64_y++) {
    9509           0 :         for (block_64x64_x = 0; block_64x64_x < quad_offset; block_64x64_x++) {
    9510           0 :             block_64x64_index = block_64x64_x + (block_64x64_y * 2); // every level uses index = 4 * parent index + x + 2 * y, with x, y in {0, 1}
    9511             : 
    9512           0 :             for (block_32x32_y = 0; block_32x32_y < 2; block_32x32_y++) {
    9513           0 :                 for (block_32x32_x = 0; block_32x32_x < 2; block_32x32_x++) {
    9514           0 :                     block_32x32_index = (block_64x64_index * 4) + block_32x32_x + (block_32x32_y * 2);
    9515             : 
    9516           0 :                     for (block_16x16_y = 0; block_16x16_y < 2; block_16x16_y++) {
    9517           0 :                         for (block_16x16_x = 0; block_16x16_x < 2; block_16x16_x++) {
    9518           0 :                             block_16x16_index = (block_32x32_index * 4) + block_16x16_x + (block_16x16_y * 2);
    9519             : 
    9520           0 :                             for (block_8x8_y = 0; block_8x8_y < 2; block_8x8_y++) {
    9521           0 :                                 for (block_8x8_x = 0; block_8x8_x < 2; block_8x8_x++) {
    9522           0 :                                     block_8x8_index = (block_16x16_index * 4) + block_8x8_x + (block_8x8_y * 2);
    9523             : 
    9524           0 :                                     for (block_4x4_y = 0; block_4x4_y < 2; block_4x4_y++) {
    9525           0 :                                         for (block_4x4_x = 0; block_4x4_x < 2; block_4x4_x++) {
    9526           0 :                                             block_4x4_index = (block_8x8_index * 4) + block_4x4_x + (block_4x4_y * 2);
    9527             : 
    9528           0 :                                             uint32_t block_4x4_addr_y = (block_64x64_y * 64) + (block_32x32_y * 32) + (block_16x16_y * 16) + (block_8x8_y * 8) + (block_4x4_y * 4); // row of the 4x4 block, accumulated from the offsets of all enclosing levels
    9529           0 :                                             uint32_t block_4x4_addr_x = (block_64x64_x * 64) + (block_32x32_x * 32) + (block_16x16_x * 16) + (block_8x8_x * 8) + (block_4x4_x * 4); // column of the 4x4 block
    9530           0 :                                             uint32_t block_4x4_addr_src = (block_4x4_addr_y * src_stride) + block_4x4_addr_x;
    9531           0 :                                             uint32_t block_4x4_addr_ref = ref_index + ((block_4x4_addr_y * ref_luma_stride) + block_4x4_addr_x); // same position in the reference, displaced by ref_index
    9532             : 
    9533             :                                             //4x4 - distortion from eb_sad_kernel4x4 (defined elsewhere; presumably a 4x4 sum of absolute differences)
    9534           0 :                                             dist_4x4[block_4x4_index] = eb_sad_kernel4x4(
    9535           0 :                                                 src_ptr + block_4x4_addr_src,
    9536             :                                                 src_stride,
    9537           0 :                                                 ref_ptr + block_4x4_addr_ref,
    9538             :                                                 ref_luma_stride,
    9539             :                                                 4,
    9540             :                                                 4);
    9541             : 
    9542           0 :                                             if (dist_4x4[block_4x4_index] < best_dist_4x4[block_4x4_index]) {
    9543           0 :                                                 best_mv_4x4[block_4x4_index] = curr_mv;
    9544           0 :                                                 best_dist_4x4[block_4x4_index] = dist_4x4[block_4x4_index];
    9545             :                                             }
    9546             :                                         }
    9547             :                                     }
    9548             : 
    9549             :                                     // Nader - Full-pel search for depth 4 blocks
    9550           0 :                                     in_loop_me_8xN_Nx8_distortion_update(
    9551             :                                         //Inputs
    9552             :                                         curr_mv,
    9553             :                                         block_4x4_index, // at this point: index of the last (fourth) 4x4 block of the current 8x8 block
    9554             :                                         dist_4x4,
    9555             :                                         //Outputs
    9556             :                                         best_mv_8x4,
    9557             :                                         best_dist_8x4,
    9558             :                                         dist_8x4,
    9559             :                                         best_mv_4x8,
    9560             :                                         best_dist_4x8,
    9561             :                                         dist_4x8,
    9562             :                                         best_mv_8x8,
    9563             :                                         best_dist_8x8,
    9564             :                                         dist_8x8);
    9565             :                                 }
    9566             :                             }
    9567             : 
    9568             :                             // Nader - Full-pel search for depth 3 blocks
    9569           0 :                             in_loop_me_16xN_Nx16_distortion_update(
    9570             :                                 //Inputs
    9571             :                                 curr_mv,
    9572             :                                 block_8x8_index, // index of the last 8x8 block of the current 16x16 block
    9573             :                                 block_4x4_index,
    9574             :                                 dist_8x4,
    9575             :                                 dist_4x8,
    9576             :                                 dist_8x8,
    9577             :                                 //Outputs
    9578             :                                 best_mv_16x4,
    9579             :                                 best_dist_16x4,
    9580             :                                 dist_16x4,
    9581             :                                 best_mv_16x8,
    9582             :                                 best_dist_16x8,
    9583             :                                 dist_16x8,
    9584             :                                 best_mv_4x16,
    9585             :                                 best_dist_4x16,
    9586             :                                 dist_4x16,
    9587             :                                 best_mv_8x16,
    9588             :                                 best_dist_8x16,
    9589             :                                 dist_8x16,
    9590             :                                 best_mv_16x16,
    9591             :                                 best_dist_16x16,
    9592             :                                 dist_16x16);
    9593             :                         }
    9594             :                     }
    9595             : 
    9596             :                     // Nader - Full-pel search for depth 2 blocks
    9597           0 :                     in_loop_me_32xN_Nx32_distortion_update(
    9598             :                         //Inputs
    9599             :                         curr_mv,
    9600             :                         block_16x16_index, // index of the last 16x16 block of the current 32x32 block
    9601             :                         block_8x8_index,
    9602             :                         dist_16x8,
    9603             :                         dist_8x16,
    9604             :                         dist_16x16,
    9605             :                         //Outputs
    9606             :                         best_mv_32x8,
    9607             :                         best_dist_32x8,
    9608             :                         dist_32x8,
    9609             :                         best_mv_32x16,
    9610             :                         best_dist_32x16,
    9611             :                         dist_32x16,
    9612             :                         best_mv_8x32,
    9613             :                         best_dist_8x32,
    9614             :                         dist_8x32,
    9615             :                         best_mv_16x32,
    9616             :                         best_dist_16x32,
    9617             :                         dist_16x32,
    9618             :                         best_mv_32x32,
    9619             :                         best_dist_32x32,
    9620             :                         dist_32x32);
    9621             :                 }
    9622             :             }
    9623             : 
    9624             :             // Nader - Full-pel search for depth 1 blocks
    9625           0 :             in_loop_me_64xN_Nx64_distortion_update(
    9626             :                 //Inputs
    9627             :                 curr_mv,
    9628             :                 block_32x32_index, // index of the last 32x32 block of the current 64x64 block
    9629             :                 block_16x16_index,
    9630             :                 dist_32x16,
    9631             :                 dist_16x32,
    9632             :                 dist_32x32,
    9633             :                 //Outputs
    9634             :                 best_mv_64x16,
    9635             :                 best_dist_64x16,
    9636             :                 dist_64x16,
    9637             :                 best_mv_64x32,
    9638             :                 best_dist_64x32,
    9639             :                 dist_64x32,
    9640             :                 best_mv_16x64,
    9641             :                 best_dist_16x64,
    9642             :                 dist_16x64,
    9643             :                 best_mv_32x64,
    9644             :                 best_dist_32x64,
    9645             :                 dist_32x64,
    9646             :                 best_mv_64x64,
    9647             :                 best_dist_64x64,
    9648             :                 dist_64x64);
    9649             :         }
    9650             :     }
    9651             : 
    9652           0 :     if (number_of_sb_quad > 1) {
    9653             :         // Nader - Full-pel search for depth 0 blocks
    9654           0 :         in_loop_me_128xN_Nx128_distortion_update(
    9655             :             //Inputs
    9656             :             curr_mv,
    9657             :             block_64x64_index, // equals 3 here: last index of the 2x2 scan above
    9658             :             block_32x32_index,
    9659             :             dist_64x32,
    9660             :             dist_32x64,
    9661             :             dist_64x64,
    9662             :             //Outputs
    9663             :             best_mv_128x64,
    9664             :             best_dist_128x64,
    9665             :             dist_128x64,
    9666             :             best_mv_64x128,
    9667             :             best_dist_64x128,
    9668             :             dist_64x128,
    9669             :             best_mv_128x128,
    9670             :             best_dist_128x128,
    9671             :             &dist_128x128);
    9672             :     }
    9673           0 : }
    9674             : 
    9675             : /***************************************************************
    9676             : * in_loop_me_fullpel_search_sblock
    9677             : *  perform the full pel search for the whole super-block: calls in_loop_me_get_search_point_results_block once for every (x, y) position of the search_area_width x search_area_height grid, row by row
    9678             : ***************************************************************/
    9679           0 : static void in_loop_me_fullpel_search_sblock(
    9680             :     SsMeContext            *context_ptr, // ME context, forwarded to the per-position routine
    9681             :     uint32_t                   list_index, // reference list index, forwarded and used to select the reference stride
    9682             :     int16_t                   x_search_area_origin, // added to the horizontal grid position to form the x_search_index argument
    9683             :     int16_t                     y_search_area_origin, // added to the vertical grid position to form the y_search_index argument
    9684             :     uint32_t                   search_area_width, // number of horizontal positions visited
    9685             :     uint32_t                   search_area_height, // number of vertical positions visited
    9686             :     uint32_t                   number_of_sb_quad) // forwarded unchanged
    9687             : {
    9688             :     uint32_t x_search_index, y_search_index;
    9689             : 
    9690           0 :     for (y_search_index = 0; y_search_index < search_area_height; y_search_index++) {
    9691           0 :         for (x_search_index = 0; x_search_index < search_area_width; x_search_index++) {
    9692           0 :             in_loop_me_get_search_point_results_block(
    9693             :                 context_ptr,
    9694             :                 list_index,
    9695           0 :                 x_search_index + y_search_index * context_ptr->interpolated_full_stride[list_index][0], // ref_index: offset of this grid position inside the reference buffer
    9696           0 :                 (int32_t)x_search_index + x_search_area_origin,
    9697           0 :                 (int32_t)y_search_index + y_search_area_origin,
    9698             :                 number_of_sb_quad);
    9699             :         }
    9700             :     }
    9701           0 : }
    9702             : 
    9703           0 : static void in_loop_me_context_dctor(EbPtr p) // Destructor installed as the dctor member by in_loop_me_context_ctor; p is treated as an SsMeContext
    9704             : {
    9705           0 :     SsMeContext* obj = (SsMeContext*)p;
    9706             :     uint32_t                   listIndex;
    9707             :     uint32_t                   refPicIndex;
    9708             : 
    9709           0 :     for (listIndex = 0; listIndex < MAX_NUM_OF_REF_PIC_LIST; listIndex++) {
    9710           0 :         for (refPicIndex = 0; refPicIndex < MAX_REF_IDX; refPicIndex++) {
    9711           0 :             EB_FREE_ARRAY(obj->integer_buffer[listIndex][refPicIndex]); // one set of four buffers is released per (list, reference picture) pair
    9712           0 :             EB_FREE_ARRAY(obj->pos_b_buffer[listIndex][refPicIndex]);
    9713           0 :             EB_FREE_ARRAY(obj->pos_h_buffer[listIndex][refPicIndex]);
    9714           0 :             EB_FREE_ARRAY(obj->pos_j_buffer[listIndex][refPicIndex]);
    9715             :         }
    9716             :     }
    9717             : 
    9718           0 :     EB_FREE_ARRAY(obj->avctemp_buffer);
    9719           0 :     EB_FREE_ALIGNED(obj->sb_buffer); // sb_buffer is obtained with EB_MALLOC_ALIGNED in the constructor, hence the aligned release macro
    9720           0 : }
    9721             : /***************************************************************
    9722             : * in_loop_me_context_ctor
    9723             : *  in-loop motion estimation constructor
    9724             : ***************************************************************/
    9725           0 : EbErrorType in_loop_me_context_ctor(
    9726             :     SsMeContext                          *object_ptr)
    9727             : {
    9728             :     uint32_t                   listIndex;
    9729             :     uint32_t                   refPicIndex;
    9730             : 
    9731           0 :     object_ptr->dctor = in_loop_me_context_dctor;
    9732             : 
    9733             :     // Intermediate SB-sized (superblock) buffer to retain the input samples
    9734           0 :     object_ptr->sb_buffer_stride = MAX_SB_SIZE;
    9735             : 
    9736           0 :     EB_MALLOC_ALIGNED(object_ptr->sb_buffer,  MAX_SB_SIZE * object_ptr->sb_buffer_stride);
    9737             : 
    9738           0 :     EB_MEMSET(object_ptr->sb_buffer, 0, sizeof(uint8_t) * MAX_SB_SIZE * object_ptr->sb_buffer_stride);
    9739             : 
    9740           0 :     object_ptr->interpolated_stride = MAX_SEARCH_AREA_WIDTH;
    9741             : 
    9742             :     // EB_MALLOC(EbBitFraction *, object_ptr->mvd_bits_array, sizeof(EbBitFraction) * NUMBER_OF_MVD_CASES, EB_N_PTR);
    9743             :     // 15 intermediate buffers to retain the interpolated reference samples
    9744             : 
    9745             :     //      0    1    2    3
    9746             :     // 0    A    a    b    c
    9747             :     // 1    d    e    f    g
    9748             :     // 2    h    i    j    k
    9749             :     // 3    n    p    q    r
    9750             : 
    9751             :     //                  _____________
    9752             :     //                 |             |
    9753             :     // --I samples --> |Interpolation|-- O samples -->
    9754             :     //                 | ____________|
    9755             : 
    9756             :     // Before Interpolation: 2 x 3
    9757             :     //   I   I
    9758             :     //   I   I
    9759             :     //   I   I
    9760             : 
    9761             :     // After 1-D Horizontal Interpolation: (2 + 1) x 3 - a, b, and c
    9762             :     // O I O I O
    9763             :     // O I O I O
    9764             :     // O I O I O
    9765             : 
    9766             :     // After 1-D Vertical Interpolation: 2 x (3 + 1) - d, h, and n
    9767             :     //   O   O
    9768             :     //   I   I
    9769             :     //   O   O
    9770             :     //   I   I
    9771             :     //   O   O
    9772             :     //   I   I
    9773             :     //   O   O
    9774             : 
    9775             :     // After 2-D (Horizontal/Vertical) Interpolation: (2 + 1) x (3 + 1) - e, f, g, i, j, k, p, q, and r
    9776             :     // O   O   O
    9777             :     //   I   I
    9778             :     // O   O   O
    9779             :     //   I   I
    9780             :     // O   O   O
    9781             :     //   I   I
    9782             :     // O   O   O
    9783             : 
    9784           0 :     for (listIndex = 0; listIndex < MAX_NUM_OF_REF_PIC_LIST; listIndex++) {
    9785           0 :         for (refPicIndex = 0; refPicIndex < MAX_REF_IDX; refPicIndex++) {
    9786           0 :             EB_MALLOC_ARRAY(object_ptr->integer_buffer[listIndex][refPicIndex], object_ptr->interpolated_stride * MAX_SEARCH_AREA_HEIGHT);
    9787             : 
    9788           0 :             EB_MALLOC_ARRAY(object_ptr->pos_b_buffer[listIndex][refPicIndex], object_ptr->interpolated_stride * MAX_SEARCH_AREA_HEIGHT);
    9789             : 
    9790           0 :             EB_MALLOC_ARRAY(object_ptr->pos_h_buffer[listIndex][refPicIndex], object_ptr->interpolated_stride * MAX_SEARCH_AREA_HEIGHT);
    9791             : 
    9792           0 :             EB_MALLOC_ARRAY(object_ptr->pos_j_buffer[listIndex][refPicIndex], object_ptr->interpolated_stride * MAX_SEARCH_AREA_HEIGHT);
    9793             :         }
    9794             :     }
    9795             : 
    9796           0 :     EB_MALLOC_ARRAY(object_ptr->avctemp_buffer, object_ptr->interpolated_stride * MAX_SEARCH_AREA_HEIGHT);
    9797             : 
    9798           0 :     return EB_ErrorNone;
    9799             : }
    9800             : 
    9801             : /***************************************************************
    9802             : * in_loop_me_interpolate_search_region_avc_style
    9803             : *  performs AVC-style interpolation for the whole Search Region
    9804             : ***************************************************************/
    9805           0 : static void in_loop_me_interpolate_search_region_avc_style(
    9806             :     SsMeContext           *context_ptr,                       // input/output parameter, ME context ptr, used to get/set interpolated search area Ptr
    9807             :     uint32_t                   listIndex,                        // Reference picture list index
    9808             :     uint8_t                   *searchRegionBuffer,               // input parameter, search region index, used to point to reference samples
    9809             :     uint32_t                   lumaStride,                       // input parameter, reference Picture stride
    9810             :     uint32_t                   search_area_width,                  // input parameter, search area width
    9811             :     uint32_t                   search_area_height,                 // input parameter, search area height
    9812             :     uint32_t                   inputBitDepth)                    // input parameter, input sample bit depth
    9813             : {
    9814             :     //      0    1    2    3
    9815             :     // 0    A    a    b    c
    9816             :     // 1    d    e    f    g
    9817             :     // 2    h    i    j    k
    9818             :     // 3    n    p    q    r
    9819             : 
    9820             :     // Position  Frac-pos Y  Frac-pos X  Horizontal filter  Vertical filter
    9821             :     // A         0           0           -                  -
    9822             :     // a         0           1           F0                 -
    9823             :     // b         0           2           F1                 -
    9824             :     // c         0           3           F2                 -
    9825             :     // d         1           0           -                  F0
    9826             :     // e         1           1           F0                 F0
    9827             :     // f         1           2           F1                 F0
    9828             :     // g         1           3           F2                 F0
    9829             :     // h         2           0           -                  F1
    9830             :     // i         2           1           F0                 F1
    9831             :     // j         2           2           F1                 F1
    9832             :     // k         2           3           F2                 F1
    9833             :     // n         3           0           -                  F2
    9834             :     // p         3           1           F0                 F2
    9835             :     // q         3           2           F1                 F2
    9836             :     // r         3           3           F2                 F2
    9837             : 
    9838             :     // Start a b c
    9839             : 
    9840             :     // The Search area needs to be a multiple of 8 to align with the ASM kernel
    9841             :     // Also the search area must be oversized by 2 to account for edge conditions
    9842           0 :     uint32_t searchAreaWidthForAsm = ROUND_UP_MUL_8(search_area_width + 2);
    9843             : 
    9844             :     (void)inputBitDepth;
    9845             :     // Half pel interpolation of the search region using f1 -> pos_b_buffer
    9846           0 :     if (searchAreaWidthForAsm) {
    9847           0 :         avc_style_luma_interpolation_filter(
    9848           0 :             searchRegionBuffer - (ME_FILTER_TAP >> 1) * lumaStride - (ME_FILTER_TAP >> 1) + 1,
    9849             :             lumaStride,
    9850             :             context_ptr->pos_b_buffer[listIndex][0],
    9851             :             context_ptr->interpolated_stride,
    9852             :             searchAreaWidthForAsm,
    9853             :             search_area_height + ME_FILTER_TAP,
    9854             :             context_ptr->avctemp_buffer,
    9855             :             EB_FALSE,
    9856             :             2,
    9857             :             2);
    9858             :     }
    9859             : 
    9860             :     // Half pel interpolation of the search region using f1 -> pos_h_buffer
    9861           0 :     if (searchAreaWidthForAsm) {
    9862           0 :         avc_style_luma_interpolation_filter(
    9863           0 :             searchRegionBuffer - (ME_FILTER_TAP >> 1) * lumaStride - 1 + lumaStride,
    9864             :             lumaStride,
    9865             :             context_ptr->pos_h_buffer[listIndex][0],
    9866             :             context_ptr->interpolated_stride,
    9867             :             searchAreaWidthForAsm,
    9868             :             search_area_height + 1,
    9869             :             context_ptr->avctemp_buffer,
    9870             :             EB_FALSE,
    9871             :             2,
    9872             :             8);
    9873             :     }
    9874             : 
    9875           0 :     if (searchAreaWidthForAsm) {
    9876             :         // Half pel interpolation of the search region using f1 -> pos_j_buffer
    9877           0 :         avc_style_luma_interpolation_filter(
    9878           0 :             context_ptr->pos_b_buffer[listIndex][0] + context_ptr->interpolated_stride,
    9879             :             context_ptr->interpolated_stride,
    9880             :             context_ptr->pos_j_buffer[listIndex][0],
    9881             :             context_ptr->interpolated_stride,
    9882             :             searchAreaWidthForAsm,
    9883             :             search_area_height + 1,
    9884             :             context_ptr->avctemp_buffer,
    9885             :             EB_FALSE,
    9886             :             2,
    9887             :             8);
    9888             :     }
    9889             : 
    9890           0 :     return;
    9891             : }
    9892             : 
    9893             : /***************************************************************
    9894             : * in_loop_me_halfpel_refinement_block
    9895             : *   performs Half Pel refinement for one block
    9896             : ***************************************************************/
    9897           0 : static void in_loop_me_halfpel_refinement_block(
    9898             :     SequenceControlSet    *sequence_control_set_ptr,             // input parameter, Sequence control set Ptr
    9899             :     SsMeContext           *context_ptr,                        // input parameter, ME context Ptr, used to get SB Ptr
    9900             :     uint32_t                   block_index_in_sb_buffer,                  // input parameter, PU origin, used to point to source samples
    9901             :     uint8_t                   *pos_b_buffer,                        // input parameter, position "b" interpolated search area Ptr
    9902             :     uint8_t                   *pos_h_buffer,                        // input parameter, position "h" interpolated search area Ptr
    9903             :     uint8_t                   *pos_j_buffer,                        // input parameter, position "j" interpolated search area Ptr
    9904             :     uint32_t                   pu_width,                           // input parameter, PU width
    9905             :     uint32_t                   pu_height,                          // input parameter, PU height
    9906             :     int16_t                   x_search_area_origin,                 // input parameter, search area origin in the horizontal direction, used to point to reference samples
    9907             :     int16_t                   y_search_area_origin,                 // input parameter, search area origin in the vertical direction, used to point to reference samples
    9908             :     uint32_t                  *pBestSad,
    9909             :     uint32_t                  *pBestMV,
    9910             :     uint8_t                   *psubPelDirection
    9911             : )
    9912             : {
    9913           0 :     EncodeContext         *encode_context_ptr = sequence_control_set_ptr->encode_context_ptr;
    9914             : 
    9915             :     int32_t searchRegionIndex;
    9916           0 :     uint64_t bestHalfSad = 0;
    9917           0 :     uint64_t distortionLeftPosition = 0;
    9918           0 :     uint64_t distortionRightPosition = 0;
    9919           0 :     uint64_t distortionTopPosition = 0;
    9920           0 :     uint64_t distortionBottomPosition = 0;
    9921           0 :     uint64_t distortionTopLeftPosition = 0;
    9922           0 :     uint64_t distortionTopRightPosition = 0;
    9923           0 :     uint64_t distortionBottomLeftPosition = 0;
    9924           0 :     uint64_t distortionBottomRightPosition = 0;
    9925             : 
    9926             :     int16_t xMvHalf[8];
    9927             :     int16_t yMvHalf[8];
    9928             : 
    9929           0 :     int16_t x_mv = _MVXT(*pBestMV);
    9930           0 :     int16_t y_mv = _MVYT(*pBestMV);
    9931           0 :     int16_t xSearchIndex = (x_mv >> 2) - x_search_area_origin;
    9932           0 :     int16_t ySearchIndex = (y_mv >> 2) - y_search_area_origin;
    9933             : 
    9934             :     (void)sequence_control_set_ptr;
    9935             :     (void)encode_context_ptr;
    9936             : 
    9937             :     //TODO : remove these, and update the MV by just shifts
    9938             : 
    9939           0 :     xMvHalf[0] = x_mv - 2; // L  position
    9940           0 :     xMvHalf[1] = x_mv + 2; // R  position
    9941           0 :     xMvHalf[2] = x_mv;     // T  position
    9942           0 :     xMvHalf[3] = x_mv;     // B  position
    9943           0 :     xMvHalf[4] = x_mv - 2; // TL position
    9944           0 :     xMvHalf[5] = x_mv + 2; // TR position
    9945           0 :     xMvHalf[6] = x_mv + 2; // BR position
    9946           0 :     xMvHalf[7] = x_mv - 2; // BL position
    9947             : 
    9948           0 :     yMvHalf[0] = y_mv;     // L  position
    9949           0 :     yMvHalf[1] = y_mv;     // R  position
    9950           0 :     yMvHalf[2] = y_mv - 2; // T  position
    9951           0 :     yMvHalf[3] = y_mv + 2; // B  position
    9952           0 :     yMvHalf[4] = y_mv - 2; // TL position
    9953           0 :     yMvHalf[5] = y_mv - 2; // TR position
    9954           0 :     yMvHalf[6] = y_mv + 2; // BR position
    9955           0 :     yMvHalf[7] = y_mv + 2; // BL position
    9956             : 
    9957             :     // L position
    9958           0 :     searchRegionIndex = xSearchIndex + (int16_t)context_ptr->interpolated_stride * ySearchIndex;
    9959           0 :     distortionLeftPosition = (nxm_sad_kernel(&(context_ptr->sb_src_ptr[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, &(pos_b_buffer[searchRegionIndex]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1;
    9960           0 :     if (distortionLeftPosition < *pBestSad) {
    9961           0 :         *pBestSad = (uint32_t)distortionLeftPosition;
    9962           0 :         *pBestMV = ((uint16_t)yMvHalf[0] << 16) | ((uint16_t)xMvHalf[0]);
    9963             :     }
    9964             : 
    9965             :     // R position
    9966           0 :     searchRegionIndex++;
    9967           0 :     distortionRightPosition = (nxm_sad_kernel(&(context_ptr->sb_src_ptr[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, &(pos_b_buffer[searchRegionIndex]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1;
    9968             : 
    9969           0 :     if (distortionRightPosition < *pBestSad) {
    9970           0 :         *pBestSad = (uint32_t)distortionRightPosition;
    9971           0 :         *pBestMV = ((uint16_t)yMvHalf[1] << 16) | ((uint16_t)xMvHalf[1]);
    9972             :     }
    9973             : 
    9974             :     // T position
    9975           0 :     searchRegionIndex = xSearchIndex + (int16_t)context_ptr->interpolated_stride * ySearchIndex;
    9976           0 :     distortionTopPosition = (nxm_sad_kernel(&(context_ptr->sb_src_ptr[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, &(pos_h_buffer[searchRegionIndex]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1;
    9977           0 :     if (distortionTopPosition < *pBestSad) {
    9978           0 :         *pBestSad = (uint32_t)distortionTopPosition;
    9979           0 :         *pBestMV = ((uint16_t)yMvHalf[2] << 16) | ((uint16_t)xMvHalf[2]);
    9980             :     }
    9981             : 
    9982             :     // B position
    9983           0 :     searchRegionIndex += (int16_t)context_ptr->interpolated_stride;
    9984           0 :     distortionBottomPosition = (nxm_sad_kernel(&(context_ptr->sb_src_ptr[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, &(pos_h_buffer[searchRegionIndex]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1;
    9985           0 :     if (distortionBottomPosition < *pBestSad) {
    9986           0 :         *pBestSad = (uint32_t)distortionBottomPosition;
    9987           0 :         *pBestMV = ((uint16_t)yMvHalf[3] << 16) | ((uint16_t)xMvHalf[3]);
    9988             :     }
    9989             : 
    9990             :     //TL position
    9991           0 :     searchRegionIndex = xSearchIndex + (int16_t)context_ptr->interpolated_stride * ySearchIndex;
    9992           0 :     distortionTopLeftPosition = (nxm_sad_kernel(&(context_ptr->sb_src_ptr[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, &(pos_j_buffer[searchRegionIndex]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1;
    9993           0 :     if (distortionTopLeftPosition < *pBestSad) {
    9994           0 :         *pBestSad = (uint32_t)distortionTopLeftPosition;
    9995           0 :         *pBestMV = ((uint16_t)yMvHalf[4] << 16) | ((uint16_t)xMvHalf[4]);
    9996             :     }
    9997             : 
    9998             :     //TR position
    9999           0 :     searchRegionIndex++;
   10000           0 :     distortionTopRightPosition = (nxm_sad_kernel(&(context_ptr->sb_src_ptr[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, &(pos_j_buffer[searchRegionIndex]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1;
   10001           0 :     if (distortionTopRightPosition < *pBestSad) {
   10002           0 :         *pBestSad = (uint32_t)distortionTopRightPosition;
   10003           0 :         *pBestMV = ((uint16_t)yMvHalf[5] << 16) | ((uint16_t)xMvHalf[5]);
   10004             :     }
   10005             : 
   10006             :     //BR position
   10007           0 :     searchRegionIndex += (int16_t)context_ptr->interpolated_stride;
   10008           0 :     distortionBottomRightPosition = (nxm_sad_kernel(&(context_ptr->sb_src_ptr[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, &(pos_j_buffer[searchRegionIndex]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1;
   10009           0 :     if (distortionBottomRightPosition < *pBestSad) {
   10010           0 :         *pBestSad = (uint32_t)distortionBottomRightPosition;
   10011           0 :         *pBestMV = ((uint16_t)yMvHalf[6] << 16) | ((uint16_t)xMvHalf[6]);
   10012             :     }
   10013             : 
   10014             :     //BL position
   10015           0 :     searchRegionIndex--;
   10016           0 :     distortionBottomLeftPosition = (nxm_sad_kernel(&(context_ptr->sb_src_ptr[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, &(pos_j_buffer[searchRegionIndex]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1;
   10017           0 :     if (distortionBottomLeftPosition < *pBestSad) {
   10018           0 :         *pBestSad = (uint32_t)distortionBottomLeftPosition;
   10019           0 :         *pBestMV = ((uint16_t)yMvHalf[7] << 16) | ((uint16_t)xMvHalf[7]);
   10020             :     }
   10021             : 
   10022           0 :     bestHalfSad = MIN(distortionLeftPosition, MIN(distortionRightPosition, MIN(distortionTopPosition, MIN(distortionBottomPosition, MIN(distortionTopLeftPosition, MIN(distortionTopRightPosition, MIN(distortionBottomLeftPosition, distortionBottomRightPosition)))))));
   10023             : 
   10024           0 :     if (bestHalfSad == distortionLeftPosition)
   10025           0 :         *psubPelDirection = LEFT_POSITION;
   10026           0 :     else if (bestHalfSad == distortionRightPosition)
   10027           0 :         *psubPelDirection = RIGHT_POSITION;
   10028           0 :     else if (bestHalfSad == distortionTopPosition)
   10029           0 :         *psubPelDirection = TOP_POSITION;
   10030           0 :     else if (bestHalfSad == distortionBottomPosition)
   10031           0 :         *psubPelDirection = BOTTOM_POSITION;
   10032           0 :     else if (bestHalfSad == distortionTopLeftPosition)
   10033           0 :         *psubPelDirection = TOP_LEFT_POSITION;
   10034           0 :     else if (bestHalfSad == distortionTopRightPosition)
   10035           0 :         *psubPelDirection = TOP_RIGHT_POSITION;
   10036           0 :     else if (bestHalfSad == distortionBottomLeftPosition)
   10037           0 :         *psubPelDirection = BOTTOM_LEFT_POSITION;
   10038           0 :     else if (bestHalfSad == distortionBottomRightPosition)
   10039           0 :         *psubPelDirection = BOTTOM_RIGHT_POSITION;
   10040           0 :     return;
   10041             : }
   10042             : 
   10043             : /***************************************************************
   10044             : * in_loop_me_halfpel_search_sblock
   10045             : *   performs Half Pel refinement
   10046             : ***************************************************************/
   10047           0 : void in_loop_me_halfpel_search_sblock(
   10048             :     SequenceControlSet    *sequence_control_set_ptr,             // input parameter, Sequence control set Ptr
   10049             :     SsMeContext           *context_ptr,                        // input/output parameter, ME context Ptr, used to get/update ME results
   10050             :     uint8_t                   *pos_b_buffer,                        // input parameter, position "b" interpolated search area Ptr
   10051             :     uint8_t                   *pos_h_buffer,                        // input parameter, position "h" interpolated search area Ptr
   10052             :     uint8_t                   *pos_j_buffer,                        // input parameter, position "j" interpolated search area Ptr
   10053             :     int16_t                   x_search_area_origin,                 // input parameter, search area origin in the horizontal direction, used to point to reference samples
   10054             :     int16_t                   y_search_area_origin)                 // input parameter, search area origin in the vertical direction, used to point to reference samples
   10055             : {
   10056             :     uint32_t idx;
   10057             :     uint32_t block_index;
   10058             :     uint32_t block_shift_x;
   10059             :     uint32_t block_shift_y;
   10060             :     uint32_t block_index_in_sb_buffer;
   10061             :     uint32_t posb_buffer_index;
   10062             :     uint32_t posh_buffer_index;
   10063             :     uint32_t posj_buffer_index;
   10064             : 
   10065           0 :     uint32_t block_offset = 0;
   10066           0 :     uint32_t x_offset = 0;
   10067           0 :     uint32_t y_offset = 0;
   10068           0 :     uint32_t quad_index = 0;
   10069           0 :     uint32_t number_of_sb_quad = context_ptr->sb_size == BLOCK_128X128 ? 4 : 1;
   10070             : 
   10071             :     // 4x4   [256 4x4 blocks]
   10072             : 
   10073           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10074           0 :         for (block_index = 0; block_index < 256; ++block_index) {
   10075           0 :             block_offset = (quad_index * 256);
   10076           0 :             x_offset = (quad_index & 0x01) << 6;
   10077           0 :             y_offset = (quad_index >> 1) << 6;
   10078           0 :             idx = tab4x4[block_index] + block_offset;
   10079           0 :             block_shift_x = ((block_index & 0xf) << 2) + x_offset;
   10080           0 :             block_shift_y = ((block_index >> 4) << 2) + y_offset;
   10081             : 
   10082           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10083             : 
   10084           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10085           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10086           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10087             : 
   10088           0 :             in_loop_me_halfpel_refinement_block(
   10089             :                 sequence_control_set_ptr,
   10090             :                 context_ptr,
   10091             :                 block_index_in_sb_buffer,
   10092             :                 &(pos_b_buffer[posb_buffer_index]),
   10093             :                 &(pos_h_buffer[posh_buffer_index]),
   10094             :                 &(pos_j_buffer[posj_buffer_index]),
   10095             :                 4,
   10096             :                 4,
   10097             :                 x_search_area_origin,
   10098             :                 y_search_area_origin,
   10099           0 :                 &context_ptr->p_best_sad4x4[idx],
   10100           0 :                 &context_ptr->p_best_mv4x4[idx],
   10101             :                 &context_ptr->psub_pel_direction4x4[idx]);
   10102             :         }
   10103             :     }
   10104             : 
   10105             :     // 8x4   [128 8x4 blocks]
   10106             : 
   10107           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10108           0 :         for (block_index = 0; block_index < 128; ++block_index) {
   10109           0 :             block_offset = (quad_index * 128);
   10110           0 :             x_offset = (quad_index & 0x01) << 6;
   10111           0 :             y_offset = (quad_index >> 1) << 6;
   10112           0 :             idx = tab8x4[block_index] + block_offset;
   10113           0 :             block_shift_x = ((block_index & 0x07) << 3) + x_offset;;
   10114           0 :             block_shift_y = ((block_index >> 3) << 2) + y_offset;;
   10115             : 
   10116           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10117             : 
   10118           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10119           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10120           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10121             : 
   10122           0 :             in_loop_me_halfpel_refinement_block(
   10123             :                 sequence_control_set_ptr,
   10124             :                 context_ptr,
   10125             :                 block_index_in_sb_buffer,
   10126             :                 &(pos_b_buffer[posb_buffer_index]),
   10127             :                 &(pos_h_buffer[posh_buffer_index]),
   10128             :                 &(pos_j_buffer[posj_buffer_index]),
   10129             :                 8,
   10130             :                 4,
   10131             :                 x_search_area_origin,
   10132             :                 y_search_area_origin,
   10133           0 :                 &context_ptr->p_best_sad8x4[idx],
   10134           0 :                 &context_ptr->p_best_mv8x4[idx],
   10135             :                 &context_ptr->psub_pel_direction8x4[idx]);
   10136             :         }
   10137             :     }
   10138             : 
   10139             :     // 4x8   [128 4x8 blocks]
   10140             : 
   10141           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10142           0 :         for (block_index = 0; block_index < 128; ++block_index) {
   10143           0 :             block_offset = (quad_index * 128);
   10144           0 :             x_offset = (quad_index & 0x01) << 6;
   10145           0 :             y_offset = (quad_index >> 1) << 6;
   10146           0 :             idx = tab4x8[block_index] + block_offset;
   10147           0 :             block_shift_x = ((block_index & 0xf) << 2) + x_offset;
   10148           0 :             block_shift_y = ((block_index >> 4) << 3) + y_offset;
   10149             : 
   10150           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10151             : 
   10152           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10153           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10154           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10155             : 
   10156           0 :             in_loop_me_halfpel_refinement_block(
   10157             :                 sequence_control_set_ptr,
   10158             :                 context_ptr,
   10159             :                 block_index_in_sb_buffer,
   10160             :                 &(pos_b_buffer[posb_buffer_index]),
   10161             :                 &(pos_h_buffer[posh_buffer_index]),
   10162             :                 &(pos_j_buffer[posj_buffer_index]),
   10163             :                 4,
   10164             :                 8,
   10165             :                 x_search_area_origin,
   10166             :                 y_search_area_origin,
   10167           0 :                 &context_ptr->p_best_sad4x8[idx],
   10168           0 :                 &context_ptr->p_best_mv4x8[idx],
   10169             :                 &context_ptr->psub_pel_direction4x8[idx]);
   10170             :         }
   10171             :     }
   10172             : 
   10173             :     // 8x8   [64 8x8 blocks]
   10174           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10175           0 :         for (block_index = 0; block_index < 64; ++block_index) {
   10176           0 :             block_offset = (quad_index * 64);
   10177           0 :             x_offset = (quad_index & 0x01) << 6;
   10178           0 :             y_offset = (quad_index >> 1) << 6;
   10179           0 :             idx = tab8x8[block_index] + block_offset;
   10180           0 :             block_shift_x = ((block_index & 0x07) << 3) + x_offset;
   10181           0 :             block_shift_y = ((block_index >> 3) << 3) + y_offset;
   10182             : 
   10183           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10184             : 
   10185           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10186           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10187           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10188             : 
   10189           0 :             in_loop_me_halfpel_refinement_block(
   10190             :                 sequence_control_set_ptr,
   10191             :                 context_ptr,
   10192             :                 block_index_in_sb_buffer,
   10193             :                 &(pos_b_buffer[posb_buffer_index]),
   10194             :                 &(pos_h_buffer[posh_buffer_index]),
   10195             :                 &(pos_j_buffer[posj_buffer_index]),
   10196             :                 8,
   10197             :                 8,
   10198             :                 x_search_area_origin,
   10199             :                 y_search_area_origin,
   10200           0 :                 &context_ptr->p_best_sad8x8[idx],
   10201           0 :                 &context_ptr->p_best_mv8x8[idx],
   10202             :                 &context_ptr->psub_pel_direction8x8[idx]);
   10203             :         }
   10204             :     }
   10205             : 
   10206             :     // 16x8 [32 partitions]
   10207           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10208           0 :         for (block_index = 0; block_index < 32; ++block_index) {
   10209           0 :             block_offset = (quad_index * 32);
   10210           0 :             x_offset = (quad_index & 0x01) << 6;
   10211           0 :             y_offset = (quad_index >> 1) << 6;
   10212           0 :             idx = tab16x8[block_index] + block_offset;
   10213           0 :             block_shift_x = ((block_index & 0x03) << 4) + x_offset;
   10214           0 :             block_shift_y = ((block_index >> 2) << 3) + y_offset;
   10215             : 
   10216           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10217             : 
   10218           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10219           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10220           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10221             : 
   10222           0 :             in_loop_me_halfpel_refinement_block(
   10223             :                 sequence_control_set_ptr,
   10224             :                 context_ptr,
   10225             :                 block_index_in_sb_buffer,
   10226             :                 &(pos_b_buffer[posb_buffer_index]),
   10227             :                 &(pos_h_buffer[posh_buffer_index]),
   10228             :                 &(pos_j_buffer[posj_buffer_index]),
   10229             :                 16,
   10230             :                 8,
   10231             :                 x_search_area_origin,
   10232             :                 y_search_area_origin,
   10233           0 :                 &context_ptr->p_best_sad16x8[idx],
   10234           0 :                 &context_ptr->p_best_mv16x8[idx],
   10235             :                 &context_ptr->psub_pel_direction16x8[idx]);
   10236             :         }
   10237             :     }
   10238             : 
   10239             :     // 8x16 [32 partitions]
   10240             : 
   10241           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10242           0 :         for (block_index = 0; block_index < 32; ++block_index) {
   10243           0 :             block_offset = (quad_index * 32);
   10244           0 :             x_offset = (quad_index & 0x01) << 6;
   10245           0 :             y_offset = (quad_index >> 1) << 6;
   10246           0 :             idx = tab8x16[block_index] + block_offset;
   10247           0 :             block_shift_x = ((block_index & 0x07) << 3) + x_offset;
   10248           0 :             block_shift_y = ((block_index >> 3) << 4) + y_offset;
   10249             : 
   10250           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10251             : 
   10252           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10253           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10254           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10255             : 
   10256           0 :             in_loop_me_halfpel_refinement_block(
   10257             :                 sequence_control_set_ptr,
   10258             :                 context_ptr,
   10259             :                 block_index_in_sb_buffer,
   10260             :                 &(pos_b_buffer[posb_buffer_index]),
   10261             :                 &(pos_h_buffer[posh_buffer_index]),
   10262             :                 &(pos_j_buffer[posj_buffer_index]),
   10263             :                 8,
   10264             :                 16,
   10265             :                 x_search_area_origin,
   10266             :                 y_search_area_origin,
   10267           0 :                 &context_ptr->p_best_sad8x16[idx],
   10268           0 :                 &context_ptr->p_best_mv8x16[idx],
   10269             :                 &context_ptr->psub_pel_direction8x16[idx]);
   10270             :         }
   10271             :     }
   10272             : 
   10273             :     // 32x8 [16 partitions]
   10274           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10275           0 :         for (block_index = 0; block_index < 16; ++block_index) {
   10276           0 :             block_offset = (quad_index * 16);
   10277           0 :             x_offset = (quad_index & 0x01) << 6;
   10278           0 :             y_offset = (quad_index >> 1) << 6;
   10279           0 :             idx = tab32x8[block_index] + block_offset;
   10280           0 :             block_shift_x = ((block_index & 0x01) << 5) + x_offset;
   10281           0 :             block_shift_y = ((block_index >> 1) << 3) + y_offset;
   10282             : 
   10283           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10284             : 
   10285           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10286           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10287           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10288             : 
   10289           0 :             in_loop_me_halfpel_refinement_block(
   10290             :                 sequence_control_set_ptr,
   10291             :                 context_ptr,
   10292             :                 block_index_in_sb_buffer,
   10293             :                 &(pos_b_buffer[posb_buffer_index]),
   10294             :                 &(pos_h_buffer[posh_buffer_index]),
   10295             :                 &(pos_j_buffer[posj_buffer_index]),
   10296             :                 32,
   10297             :                 8,
   10298             :                 x_search_area_origin,
   10299             :                 y_search_area_origin,
   10300           0 :                 &context_ptr->p_best_sad32x8[idx],
   10301           0 :                 &context_ptr->p_best_mv32x8[idx],
   10302             :                 &context_ptr->psub_pel_direction32x8[idx]);
   10303             :         }
   10304             :     }
   10305             : 
   10306             :     // 8x32 [16 partitions]
   10307           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10308           0 :         for (block_index = 0; block_index < 16; ++block_index) {
   10309           0 :             block_offset = (quad_index * 16);
   10310           0 :             idx = tab8x32[block_index] + block_offset;
   10311           0 :             x_offset = (quad_index & 0x01) << 6;
   10312           0 :             y_offset = (quad_index >> 1) << 6;
   10313           0 :             block_shift_x = ((block_index & 0x07) << 3) + x_offset;
   10314           0 :             block_shift_y = ((block_index >> 3) << 5) + y_offset;
   10315             : 
   10316           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10317             : 
   10318           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10319           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10320           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10321             : 
   10322           0 :             in_loop_me_halfpel_refinement_block(
   10323             :                 sequence_control_set_ptr,
   10324             :                 context_ptr,
   10325             :                 block_index_in_sb_buffer,
   10326             :                 &(pos_b_buffer[posb_buffer_index]),
   10327             :                 &(pos_h_buffer[posh_buffer_index]),
   10328             :                 &(pos_j_buffer[posj_buffer_index]),
   10329             :                 8,
   10330             :                 32,
   10331             :                 x_search_area_origin,
   10332             :                 y_search_area_origin,
   10333           0 :                 &context_ptr->p_best_sad8x32[idx],
   10334           0 :                 &context_ptr->p_best_mv8x32[idx],
   10335             :                 &context_ptr->psub_pel_direction8x32[idx]);
   10336             :         }
   10337             :     }
   10338             : 
   10339             :     // 16x16 [16 partitions]
   10340           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10341           0 :         for (block_index = 0; block_index < 16; ++block_index) {
   10342           0 :             block_offset = (quad_index * 16);
   10343           0 :             x_offset = (quad_index & 0x01) << 6;
   10344           0 :             y_offset = (quad_index >> 1) << 6;
   10345           0 :             idx = tab16x16[block_index] + block_offset;
   10346           0 :             block_shift_x = ((block_index & 0x03) << 4) + x_offset;
   10347           0 :             block_shift_y = ((block_index >> 2) << 4) + y_offset;
   10348             : 
   10349           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10350             : 
   10351           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10352           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10353           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10354             : 
   10355           0 :             in_loop_me_halfpel_refinement_block(
   10356             :                 sequence_control_set_ptr,
   10357             :                 context_ptr,
   10358             :                 block_index_in_sb_buffer,
   10359             :                 &(pos_b_buffer[posb_buffer_index]),
   10360             :                 &(pos_h_buffer[posh_buffer_index]),
   10361             :                 &(pos_j_buffer[posj_buffer_index]),
   10362             :                 16,
   10363             :                 16,
   10364             :                 x_search_area_origin,
   10365             :                 y_search_area_origin,
   10366           0 :                 &context_ptr->p_best_sad16x16[idx],
   10367           0 :                 &context_ptr->p_best_mv16x16[idx],
   10368             :                 &context_ptr->psub_pel_direction16x16[idx]);
   10369             :         }
   10370             :     }
   10371             : 
   10372             :     // 32x16 [8 partitions]
   10373             : 
   10374           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10375           0 :         for (block_index = 0; block_index < 8; ++block_index) {
   10376           0 :             block_offset = (quad_index * 8);
   10377           0 :             x_offset = (quad_index & 0x01) << 6;
   10378           0 :             y_offset = (quad_index >> 1) << 6;
   10379           0 :             idx = tab32x16[block_index] + block_offset;
   10380           0 :             block_shift_x = ((block_index & 0x01) << 5) + x_offset;
   10381           0 :             block_shift_y = ((block_index >> 1) << 4) + y_offset;
   10382             : 
   10383           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10384             : 
   10385           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10386           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10387           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10388             : 
   10389           0 :             in_loop_me_halfpel_refinement_block(
   10390             :                 sequence_control_set_ptr,
   10391             :                 context_ptr,
   10392             :                 block_index_in_sb_buffer,
   10393             :                 &(pos_b_buffer[posb_buffer_index]),
   10394             :                 &(pos_h_buffer[posh_buffer_index]),
   10395             :                 &(pos_j_buffer[posj_buffer_index]),
   10396             :                 32,
   10397             :                 16,
   10398             :                 x_search_area_origin,
   10399             :                 y_search_area_origin,
   10400           0 :                 &context_ptr->p_best_sad32x16[idx],
   10401           0 :                 &context_ptr->p_best_mv32x16[idx],
   10402             :                 &context_ptr->psub_pel_direction32x16[idx]);
   10403             :         }
   10404             :     }
   10405             : 
   10406             :     // 16x32 [8 partitions]
   10407           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10408           0 :         for (block_index = 0; block_index < 8; ++block_index) {
   10409           0 :             block_offset = (quad_index * 8);
   10410           0 :             x_offset = (quad_index & 0x01) << 6;
   10411           0 :             y_offset = (quad_index >> 1) << 6;
   10412           0 :             idx = tab16x32[block_index] + block_offset;
   10413           0 :             block_shift_x = ((block_index & 0x03) << 4) + x_offset;
   10414           0 :             block_shift_y = ((block_index >> 2) << 5) + y_offset;
   10415             : 
   10416           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10417             : 
   10418           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10419           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10420           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10421             : 
   10422           0 :             in_loop_me_halfpel_refinement_block(
   10423             :                 sequence_control_set_ptr,
   10424             :                 context_ptr,
   10425             :                 block_index_in_sb_buffer,
   10426             :                 &(pos_b_buffer[posb_buffer_index]),
   10427             :                 &(pos_h_buffer[posh_buffer_index]),
   10428             :                 &(pos_j_buffer[posj_buffer_index]),
   10429             :                 16,
   10430             :                 32,
   10431             :                 x_search_area_origin,
   10432             :                 y_search_area_origin,
   10433           0 :                 &context_ptr->p_best_sad16x32[idx],
   10434           0 :                 &context_ptr->p_best_mv16x32[idx],
   10435             :                 &context_ptr->psub_pel_direction16x32[idx]);
   10436             :         }
   10437             :     }
   10438             : 
   10439             :     // 32x32 [4 partitions]
   10440             : 
   10441           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10442           0 :         for (block_index = 0; block_index < 4; ++block_index) {
   10443           0 :             block_offset = (quad_index * 4);
   10444           0 :             x_offset = (quad_index & 0x01) << 6;
   10445           0 :             y_offset = (quad_index >> 1) << 6;
   10446           0 :             idx = tab32x32[block_index] + block_offset;
   10447           0 :             block_shift_x = ((block_index & 0x01) << 5) + x_offset;
   10448           0 :             block_shift_y = ((block_index >> 1) << 5) + y_offset;
   10449             : 
   10450           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10451             : 
   10452           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10453           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10454           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10455             : 
   10456           0 :             in_loop_me_halfpel_refinement_block(
   10457             :                 sequence_control_set_ptr,
   10458             :                 context_ptr,
   10459             :                 block_index_in_sb_buffer,
   10460             :                 &(pos_b_buffer[posb_buffer_index]),
   10461             :                 &(pos_h_buffer[posh_buffer_index]),
   10462             :                 &(pos_j_buffer[posj_buffer_index]),
   10463             :                 32,
   10464             :                 32,
   10465             :                 x_search_area_origin,
   10466             :                 y_search_area_origin,
   10467           0 :                 &context_ptr->p_best_sad32x32[idx],
   10468           0 :                 &context_ptr->p_best_mv32x32[idx],
   10469             :                 &context_ptr->psub_pel_direction32x32[idx]);
   10470             :         }
   10471             :     }
   10472             : 
   10473             :     // 64x32 [2 partitions]
   10474           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10475           0 :         for (block_index = 0; block_index < 2; ++block_index) {
   10476           0 :             block_offset = (quad_index * 2);
   10477           0 :             x_offset = (quad_index & 0x01) << 6;
   10478           0 :             y_offset = (quad_index >> 1) << 6;
   10479           0 :             idx = tab64x32[block_index] + block_offset;
   10480           0 :             block_shift_x = x_offset;
   10481           0 :             block_shift_y = (block_index << 5) + y_offset;
   10482             : 
   10483           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10484             : 
   10485           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10486           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10487           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10488             : 
   10489           0 :             in_loop_me_halfpel_refinement_block(
   10490             :                 sequence_control_set_ptr,
   10491             :                 context_ptr,
   10492             :                 block_index_in_sb_buffer,
   10493             :                 &(pos_b_buffer[posb_buffer_index]),
   10494             :                 &(pos_h_buffer[posh_buffer_index]),
   10495             :                 &(pos_j_buffer[posj_buffer_index]),
   10496             :                 64,
   10497             :                 32,
   10498             :                 x_search_area_origin,
   10499             :                 y_search_area_origin,
   10500           0 :                 &context_ptr->p_best_sad64x32[idx],
   10501           0 :                 &context_ptr->p_best_mv64x32[idx],
   10502             :                 &context_ptr->psub_pel_direction64x32[idx]);
   10503             :         }
   10504             :     }
   10505             : 
   10506             :     // 32x64 [2 partitions]
   10507           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10508           0 :         for (block_index = 0; block_index < 2; ++block_index) {
   10509           0 :             block_offset = (quad_index * 2);
   10510           0 :             x_offset = (quad_index & 0x01) << 6;
   10511           0 :             y_offset = (quad_index >> 1) << 6;
   10512           0 :             idx = tab32x64[block_index] + block_offset;
   10513           0 :             block_shift_x = (block_index << 5) + x_offset;
   10514           0 :             block_shift_y = y_offset;
   10515             : 
   10516           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10517             : 
   10518           0 :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10519           0 :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10520           0 :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10521             : 
   10522           0 :             in_loop_me_halfpel_refinement_block(
   10523             :                 sequence_control_set_ptr,
   10524             :                 context_ptr,
   10525             :                 block_index_in_sb_buffer,
   10526             :                 &(pos_b_buffer[posb_buffer_index]),
   10527             :                 &(pos_h_buffer[posh_buffer_index]),
   10528             :                 &(pos_j_buffer[posj_buffer_index]),
   10529             :                 32,
   10530             :                 64,
   10531             :                 x_search_area_origin,
   10532             :                 y_search_area_origin,
   10533           0 :                 &context_ptr->p_best_sad32x64[idx],
   10534           0 :                 &context_ptr->p_best_mv32x64[idx],
   10535             :                 &context_ptr->psub_pel_direction32x64[idx]);
   10536             :         }
   10537             :     }
   10538             : 
   10539             :     // 64x64 [1 partition]
   10540           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10541           0 :         idx = quad_index;
   10542           0 :         x_offset = (quad_index & 0x01) << 6;
   10543           0 :         y_offset = (quad_index >> 1) << 6;
   10544           0 :         block_shift_x = x_offset;
   10545           0 :         block_shift_y = y_offset;
   10546             : 
   10547           0 :         block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10548             : 
   10549           0 :         posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10550           0 :         posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10551           0 :         posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10552             : 
   10553           0 :         in_loop_me_halfpel_refinement_block(
   10554             :             sequence_control_set_ptr,
   10555             :             context_ptr,
   10556             :             block_index_in_sb_buffer,
   10557             :             &(pos_b_buffer[posb_buffer_index]),
   10558             :             &(pos_h_buffer[posh_buffer_index]),
   10559             :             &(pos_j_buffer[posj_buffer_index]),
   10560             :             64,
   10561             :             64,
   10562             :             x_search_area_origin,
   10563             :             y_search_area_origin,
   10564           0 :             &context_ptr->p_best_sad64x64[idx],
   10565           0 :             &context_ptr->p_best_mv64x64[idx],
   10566             :             &context_ptr->psub_pel_direction64x64[idx]);
   10567             :     }
   10568             : 
   10569             :     if (0) {
   10570             :         // 128x64 [2 partitions]
   10571             :         for (block_index = 0; block_index < 2; ++block_index) {
   10572             :             block_shift_x = 0;
   10573             :             block_shift_y = block_index << 6;
   10574             : 
   10575             :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10576             : 
   10577             :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10578             :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10579             :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10580             : 
   10581             :             in_loop_me_halfpel_refinement_block(
   10582             :                 sequence_control_set_ptr,
   10583             :                 context_ptr,
   10584             :                 block_index_in_sb_buffer,
   10585             :                 &(pos_b_buffer[posb_buffer_index]),
   10586             :                 &(pos_h_buffer[posh_buffer_index]),
   10587             :                 &(pos_j_buffer[posj_buffer_index]),
   10588             :                 128,
   10589             :                 64,
   10590             :                 x_search_area_origin,
   10591             :                 y_search_area_origin,
   10592             :                 &context_ptr->p_best_sad128x64[block_index],
   10593             :                 &context_ptr->p_best_mv128x64[block_index],
   10594             :                 &context_ptr->psub_pel_direction128x64[block_index]);
   10595             :         }
   10596             : 
   10597             :         // 64x128 [2 partitions]
   10598             :         for (block_index = 0; block_index < 2; ++block_index) {
   10599             :             block_shift_x = block_index << 6;
   10600             :             block_shift_y = 0;
   10601             : 
   10602             :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10603             : 
   10604             :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10605             :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10606             :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10607             : 
   10608             :             in_loop_me_halfpel_refinement_block(
   10609             :                 sequence_control_set_ptr,
   10610             :                 context_ptr,
   10611             :                 block_index_in_sb_buffer,
   10612             :                 &(pos_b_buffer[posb_buffer_index]),
   10613             :                 &(pos_h_buffer[posh_buffer_index]),
   10614             :                 &(pos_j_buffer[posj_buffer_index]),
   10615             :                 64,
   10616             :                 128,
   10617             :                 x_search_area_origin,
   10618             :                 y_search_area_origin,
   10619             :                 &context_ptr->p_best_sad64x128[block_index],
   10620             :                 &context_ptr->p_best_mv64x128[block_index],
   10621             :                 &context_ptr->psub_pel_direction64x128[block_index]);
   10622             :         }
   10623             : 
   10624             :         // 128x128 [1 partition]
   10625             :         {
   10626             :             block_index = 0;
   10627             :             block_shift_x = 0;
   10628             :             block_shift_y = 0;
   10629             : 
   10630             :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10631             : 
   10632             :             posb_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10633             :             posh_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10634             :             posj_buffer_index = block_shift_x + block_shift_y * context_ptr->interpolated_stride;
   10635             : 
   10636             :             in_loop_me_halfpel_refinement_block(
   10637             :                 sequence_control_set_ptr,
   10638             :                 context_ptr,
   10639             :                 block_index_in_sb_buffer,
   10640             :                 &(pos_b_buffer[posb_buffer_index]),
   10641             :                 &(pos_h_buffer[posh_buffer_index]),
   10642             :                 &(pos_j_buffer[posj_buffer_index]),
   10643             :                 128,
   10644             :                 128,
   10645             :                 x_search_area_origin,
   10646             :                 y_search_area_origin,
   10647             :                 &context_ptr->p_best_sad128x128[block_index],
   10648             :                 &context_ptr->p_best_mv128x128[block_index],
   10649             :                 &context_ptr->psub_pel_direction128x128);
   10650             :         }
   10651             :     }
   10652           0 :     return;
   10653             :     }
   10654             : 
   10655             : /***************************************************************
   10656             : * in_loop_me_quarterpel_refinement_on_the_fly_block
   10657             : *   Quarter Pel refinement for one block: tests up to 8 candidates at +/-1 quarter pel around *pBestMV and keeps the lowest-cost one in *pBestSad / *pBestMV
   10658             : ***************************************************************/
   10659           0 : static void in_loop_me_quarterpel_refinement_on_the_fly_block(
   10660             :     SsMeContext         *context_ptr,                      // [IN] ME context Ptr, used to get SB Ptr
   10661             :     uint32_t                 block_index_in_sb_buffer,                // [IN] PU origin, used to point to source samples
   10662             :     uint8_t                **buf1,                            // [IN] first buffer of each candidate pair, indexed 0..7 = L, R, T, B, TL, TR, BR, BL
   10663             :     uint32_t                *buf1Stride,                      // [IN] stride of each buf1 entry (same indexing)
   10664             :     uint8_t                **buf2,                            // [IN] second buffer of each candidate pair (same indexing)
   10665             :     uint32_t                *buf2Stride,                      // [IN] stride of each buf2 entry (same indexing)
   10666             :     uint32_t                 pu_width,                         // [IN]  PU width
   10667             :     uint32_t                 pu_height,                        // [IN]  PU height
   10668             :     int16_t                 x_search_area_origin,               // [IN] search area origin in the horizontal direction, used to point to reference samples
   10669             :     int16_t                 y_search_area_origin,               // [IN] search area origin in the vertical direction, used to point to reference samples
   10670             :     uint32_t                *pBestSad,                        // [IN/OUT] best cost so far; overwritten when a candidate costs less
   10671             :     uint32_t                *pBestMV,                         // [IN/OUT] best MV, stored as (y << 16) | x; overwritten together with *pBestSad
   10672             :     uint8_t                  sub_pel_direction)               // [IN] one of the *_POSITION values; selects which candidates are tested (presumably the half-pel winner - confirm with caller)
   10673             : {
   10674           0 :     int16_t x_mv = _MVXT(*pBestMV);
   10675           0 :     int16_t y_mv = _MVYT(*pBestMV);
   10676             :     // MV rounded from quarter-pel units and expressed relative to the search area origin
   10677           0 :     int16_t xSearchIndex = ((x_mv + 2) >> 2) - x_search_area_origin;
   10678           0 :     int16_t ySearchIndex = ((y_mv + 2) >> 2) - y_search_area_origin;
   10679             : 
   10680             :     uint64_t dist;
   10681             : 
   10682             :     EbBool validTL, validT, validTR, validR, validBR, validB, validBL, validL;
   10683             : 
   10684             :     int16_t xMvQuarter[8];
   10685             :     int16_t yMvQuarter[8];
   10686           0 :     int32_t searchRegionIndex1 = 0;
   10687           0 :     int32_t searchRegionIndex2 = 0;
   10688             :     // each candidate is enabled by 3 sub_pel_direction values; when bit 1 of x_mv or y_mv is set the enabling directions are the opposite ones of the else branch
   10689           0 :     if ((y_mv & 2) + ((x_mv & 2) >> 1)) {
   10690           0 :         validTL = (EbBool)(sub_pel_direction == RIGHT_POSITION || sub_pel_direction == BOTTOM_RIGHT_POSITION || sub_pel_direction == BOTTOM_POSITION);
   10691           0 :         validT = (EbBool)(sub_pel_direction == BOTTOM_RIGHT_POSITION || sub_pel_direction == BOTTOM_POSITION || sub_pel_direction == BOTTOM_LEFT_POSITION);
   10692           0 :         validTR = (EbBool)(sub_pel_direction == BOTTOM_POSITION || sub_pel_direction == BOTTOM_LEFT_POSITION || sub_pel_direction == LEFT_POSITION);
   10693           0 :         validR = (EbBool)(sub_pel_direction == BOTTOM_LEFT_POSITION || sub_pel_direction == LEFT_POSITION || sub_pel_direction == TOP_LEFT_POSITION);
   10694           0 :         validBR = (EbBool)(sub_pel_direction == LEFT_POSITION || sub_pel_direction == TOP_LEFT_POSITION || sub_pel_direction == TOP_POSITION);
   10695           0 :         validB = (EbBool)(sub_pel_direction == TOP_LEFT_POSITION || sub_pel_direction == TOP_POSITION || sub_pel_direction == TOP_RIGHT_POSITION);
   10696           0 :         validBL = (EbBool)(sub_pel_direction == TOP_POSITION || sub_pel_direction == TOP_RIGHT_POSITION || sub_pel_direction == RIGHT_POSITION);
   10697           0 :         validL = (EbBool)(sub_pel_direction == TOP_RIGHT_POSITION || sub_pel_direction == RIGHT_POSITION || sub_pel_direction == BOTTOM_RIGHT_POSITION);
   10698             :     }
   10699             :     else {
   10700           0 :         validTL = (EbBool)(sub_pel_direction == LEFT_POSITION || sub_pel_direction == TOP_LEFT_POSITION || sub_pel_direction == TOP_POSITION);
   10701           0 :         validT = (EbBool)(sub_pel_direction == TOP_LEFT_POSITION || sub_pel_direction == TOP_POSITION || sub_pel_direction == TOP_RIGHT_POSITION);
   10702           0 :         validTR = (EbBool)(sub_pel_direction == TOP_POSITION || sub_pel_direction == TOP_RIGHT_POSITION || sub_pel_direction == RIGHT_POSITION);
   10703           0 :         validR = (EbBool)(sub_pel_direction == TOP_RIGHT_POSITION || sub_pel_direction == RIGHT_POSITION || sub_pel_direction == BOTTOM_RIGHT_POSITION);
   10704           0 :         validBR = (EbBool)(sub_pel_direction == RIGHT_POSITION || sub_pel_direction == BOTTOM_RIGHT_POSITION || sub_pel_direction == BOTTOM_POSITION);
   10705           0 :         validB = (EbBool)(sub_pel_direction == BOTTOM_RIGHT_POSITION || sub_pel_direction == BOTTOM_POSITION || sub_pel_direction == BOTTOM_LEFT_POSITION);
   10706           0 :         validBL = (EbBool)(sub_pel_direction == BOTTOM_POSITION || sub_pel_direction == BOTTOM_LEFT_POSITION || sub_pel_direction == LEFT_POSITION);
   10707           0 :         validL = (EbBool)(sub_pel_direction == BOTTOM_LEFT_POSITION || sub_pel_direction == LEFT_POSITION || sub_pel_direction == TOP_LEFT_POSITION);
   10708             :     }
   10709             :     // candidate MVs: the current best MV moved by one quarter-pel step in each of the 8 directions
   10710           0 :     xMvQuarter[0] = x_mv - 1; // L  position
   10711           0 :     xMvQuarter[1] = x_mv + 1; // R  position
   10712           0 :     xMvQuarter[2] = x_mv;     // T  position
   10713           0 :     xMvQuarter[3] = x_mv;     // B  position
   10714           0 :     xMvQuarter[4] = x_mv - 1; // TL position
   10715           0 :     xMvQuarter[5] = x_mv + 1; // TR position
   10716           0 :     xMvQuarter[6] = x_mv + 1; // BR position
   10717           0 :     xMvQuarter[7] = x_mv - 1; // BL position
   10718             : 
   10719           0 :     yMvQuarter[0] = y_mv;     // L  position
   10720           0 :     yMvQuarter[1] = y_mv;     // R  position
   10721           0 :     yMvQuarter[2] = y_mv - 1; // T  position
   10722           0 :     yMvQuarter[3] = y_mv + 1; // B  position
   10723           0 :     yMvQuarter[4] = y_mv - 1; // TL position
   10724           0 :     yMvQuarter[5] = y_mv - 1; // TR position
   10725           0 :     yMvQuarter[6] = y_mv + 1; // BR position
   10726           0 :     yMvQuarter[7] = y_mv + 1; // BL position
   10727             :     // every candidate below: nxm_sad_avg_kernel on every other row (strides << 1, pu_height >> 1), result doubled to compensate, then compared against *pBestSad
   10728             :     // L position
   10729           0 :     if (validL) {
   10730           0 :         searchRegionIndex1 = (int32_t)xSearchIndex + (int32_t)buf1Stride[0] * (int32_t)ySearchIndex;
   10731           0 :         searchRegionIndex2 = (int32_t)xSearchIndex + (int32_t)buf2Stride[0] * (int32_t)ySearchIndex;
   10732             : 
   10733           0 :         dist = nxm_sad_avg_kernel(&(context_ptr->sb_buffer[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, buf1[0] + searchRegionIndex1, buf1Stride[0] << 1, buf2[0] + searchRegionIndex2, buf2Stride[0] << 1, pu_height >> 1, pu_width);
   10734             : 
   10735           0 :         dist = dist << 1;
   10736             : 
   10737           0 :         if (dist < *pBestSad) {
   10738           0 :             *pBestSad = (uint32_t)dist;
   10739           0 :             *pBestMV = ((uint32_t)(uint16_t)yMvQuarter[0] << 16) | ((uint16_t)xMvQuarter[0]); // widen to uint32_t first: a promoted uint16_t >= 0x8000 shifted left by 16 overflows int
   10740             :         }
   10741             :     }
   10742             : 
   10743             :     // R positions
   10744           0 :     if (validR) {
   10745           0 :         searchRegionIndex1 = (int32_t)xSearchIndex + (int32_t)buf1Stride[1] * (int32_t)ySearchIndex;
   10746           0 :         searchRegionIndex2 = (int32_t)xSearchIndex + (int32_t)buf2Stride[1] * (int32_t)ySearchIndex;
   10747           0 :         dist = nxm_sad_avg_kernel(&(context_ptr->sb_buffer[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, buf1[1] + searchRegionIndex1, buf1Stride[1] << 1, buf2[1] + searchRegionIndex2, buf2Stride[1] << 1, pu_height >> 1, pu_width);
   10748           0 :         dist = dist << 1;
   10749             : 
   10750           0 :         if (dist < *pBestSad) {
   10751           0 :             *pBestSad = (uint32_t)dist;
   10752           0 :             *pBestMV = ((uint32_t)(uint16_t)yMvQuarter[1] << 16) | ((uint16_t)xMvQuarter[1]);
   10753             :         }
   10754             :     }
   10755             : 
   10756             :     // T position
   10757           0 :     if (validT) {
   10758           0 :         searchRegionIndex1 = (int32_t)xSearchIndex + (int32_t)buf1Stride[2] * (int32_t)ySearchIndex;
   10759           0 :         searchRegionIndex2 = (int32_t)xSearchIndex + (int32_t)buf2Stride[2] * (int32_t)ySearchIndex;
   10760             : 
   10761           0 :         dist = nxm_sad_avg_kernel(&(context_ptr->sb_buffer[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, buf1[2] + searchRegionIndex1, buf1Stride[2] << 1, buf2[2] + searchRegionIndex2, buf2Stride[2] << 1, pu_height >> 1, pu_width);
   10762           0 :         dist = dist << 1;
   10763             : 
   10764           0 :         if (dist < *pBestSad) {
   10765           0 :             *pBestSad = (uint32_t)dist;
   10766           0 :             *pBestMV = ((uint32_t)(uint16_t)yMvQuarter[2] << 16) | ((uint16_t)xMvQuarter[2]);
   10767             :         }
   10768             :     }
   10769             : 
   10770             :     // B position
   10771           0 :     if (validB) {
   10772           0 :         searchRegionIndex1 = (int32_t)xSearchIndex + (int32_t)buf1Stride[3] * (int32_t)ySearchIndex;
   10773           0 :         searchRegionIndex2 = (int32_t)xSearchIndex + (int32_t)buf2Stride[3] * (int32_t)ySearchIndex;
   10774             : 
   10775           0 :         dist = nxm_sad_avg_kernel(&(context_ptr->sb_buffer[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, buf1[3] + searchRegionIndex1, buf1Stride[3] << 1, buf2[3] + searchRegionIndex2, buf2Stride[3] << 1, pu_height >> 1, pu_width);
   10776           0 :         dist = dist << 1;
   10777             : 
   10778           0 :         if (dist < *pBestSad) {
   10779           0 :             *pBestSad = (uint32_t)dist;
   10780           0 :             *pBestMV = ((uint32_t)(uint16_t)yMvQuarter[3] << 16) | ((uint16_t)xMvQuarter[3]);
   10781             :         }
   10782             :     }
   10783             : 
   10784             :     //TL position
   10785           0 :     if (validTL) {
   10786           0 :         searchRegionIndex1 = (int32_t)xSearchIndex + (int32_t)buf1Stride[4] * (int32_t)ySearchIndex;
   10787           0 :         searchRegionIndex2 = (int32_t)xSearchIndex + (int32_t)buf2Stride[4] * (int32_t)ySearchIndex;
   10788             : 
   10789           0 :         dist = nxm_sad_avg_kernel(&(context_ptr->sb_buffer[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, buf1[4] + searchRegionIndex1, buf1Stride[4] << 1, buf2[4] + searchRegionIndex2, buf2Stride[4] << 1, pu_height >> 1, pu_width);
   10790           0 :         dist = dist << 1;
   10791             : 
   10792           0 :         if (dist < *pBestSad) {
   10793           0 :             *pBestSad = (uint32_t)dist;
   10794           0 :             *pBestMV = ((uint32_t)(uint16_t)yMvQuarter[4] << 16) | ((uint16_t)xMvQuarter[4]);
   10795             :         }
   10796             :     }
   10797             : 
   10798             :     //TR position
   10799           0 :     if (validTR) {
   10800           0 :         searchRegionIndex1 = (int32_t)xSearchIndex + (int32_t)buf1Stride[5] * (int32_t)ySearchIndex;
   10801           0 :         searchRegionIndex2 = (int32_t)xSearchIndex + (int32_t)buf2Stride[5] * (int32_t)ySearchIndex;
   10802             : 
   10803           0 :         dist = nxm_sad_avg_kernel(&(context_ptr->sb_buffer[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, buf1[5] + searchRegionIndex1, buf1Stride[5] << 1, buf2[5] + searchRegionIndex2, buf2Stride[5] << 1, pu_height >> 1, pu_width);
   10804           0 :         dist = dist << 1;
   10805             : 
   10806           0 :         if (dist < *pBestSad) {
   10807           0 :             *pBestSad = (uint32_t)dist;
   10808           0 :             *pBestMV = ((uint32_t)(uint16_t)yMvQuarter[5] << 16) | ((uint16_t)xMvQuarter[5]);
   10809             :         }
   10810             :     }
   10811             : 
   10812             :     //BR position
   10813           0 :     if (validBR) {
   10814           0 :         searchRegionIndex1 = (int32_t)xSearchIndex + (int32_t)buf1Stride[6] * (int32_t)ySearchIndex;
   10815           0 :         searchRegionIndex2 = (int32_t)xSearchIndex + (int32_t)buf2Stride[6] * (int32_t)ySearchIndex;
   10816             : 
   10817           0 :         dist = nxm_sad_avg_kernel(&(context_ptr->sb_buffer[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, buf1[6] + searchRegionIndex1, buf1Stride[6] << 1, buf2[6] + searchRegionIndex2, buf2Stride[6] << 1, pu_height >> 1, pu_width);
   10818           0 :         dist = dist << 1;
   10819             : 
   10820           0 :         if (dist < *pBestSad) {
   10821           0 :             *pBestSad = (uint32_t)dist;
   10822           0 :             *pBestMV = ((uint32_t)(uint16_t)yMvQuarter[6] << 16) | ((uint16_t)xMvQuarter[6]);
   10823             :         }
   10824             :     }
   10825             : 
   10826             :     //BL position
   10827           0 :     if (validBL) {
   10828           0 :         searchRegionIndex1 = (int32_t)xSearchIndex + (int32_t)buf1Stride[7] * (int32_t)ySearchIndex;
   10829           0 :         searchRegionIndex2 = (int32_t)xSearchIndex + (int32_t)buf2Stride[7] * (int32_t)ySearchIndex;
   10830             : 
   10831           0 :         dist = nxm_sad_avg_kernel(&(context_ptr->sb_buffer[block_index_in_sb_buffer]), context_ptr->sb_src_stride << 1, buf1[7] + searchRegionIndex1, buf1Stride[7] << 1, buf2[7] + searchRegionIndex2, buf2Stride[7] << 1, pu_height >> 1, pu_width);
   10832           0 :         dist = dist << 1;
   10833             : 
   10834           0 :         if (dist < *pBestSad) {
   10835           0 :             *pBestSad = (uint32_t)dist;
   10836           0 :             *pBestMV = ((uint32_t)(uint16_t)yMvQuarter[7] << 16) | ((uint16_t)xMvQuarter[7]);
   10837             :         }
   10838             :     }
   10839             : 
   10840           0 :     return;
   10841             : }
   10842             : 
   10843             : /***************************************************************
   10844             : * set_quarterpel_refinement_inputs_on_the_fly_block
   10845             : *   for each of the 8 quarter pel candidates, determine the 2 buffers (and their strides)
   10846             : *   to average for Quarter Pel Refinement; the table used depends on bit 1 of x_mv and y_mv
   10847             : ***************************************************************/
   10848           0 : static void set_quarterpel_refinement_inputs_on_the_fly_block(
   10849             :     uint8_t   *pos_Full,   //[IN] points to A
   10850             :     uint32_t   FullStride, //[IN] stride stored alongside every pos_Full based entry
   10851             :     uint8_t   *pos_b,     //[IN] points to b
   10852             :     uint8_t   *pos_h,     //[IN] points to h
   10853             :     uint8_t   *pos_j,     //[IN] points to j
   10854             :     uint32_t   Stride,    //[IN] stride stored alongside every pos_b / pos_h / pos_j based entry
   10855             :     int16_t   x_mv,        //[IN] only bit 1 (x_mv & 2) is used
   10856             :     int16_t   y_mv,        //[IN] only bit 1 (y_mv & 2) is used
   10857             :     uint8_t   **buf1,       //[OUT] 8 entries: first buffer of each candidate pair
   10858             :     uint32_t  *buf1Stride, //[OUT] 8 entries: stride of each buf1 entry
   10859             :     uint8_t   **buf2,       //[OUT] 8 entries: second buffer of each candidate pair
   10860             :     uint32_t  *buf2Stride  //[OUT] 8 entries: stride of each buf2 entry
   10861             : )
   10862             : {
   10863           0 :     uint32_t  quarterPelRefinementMethod = (y_mv & 2) + ((x_mv & 2) >> 1);
   10864             :     // the selector is a value in 0..3: (y_mv & 2) contributes 0 or 2, (x_mv & 2) >> 1 contributes 0 or 1
   10865             :     //for each one of the 8 positions, we need to determine the 2 half pel buffers to do averaging
   10866             :     // entry order matches the quarter pel refinement routine: 0=L 1=R 2=T 3=B 4=TL 5=TR 6=BR 7=BL
   10867             :     //     A    a    b    c
   10868             :     //     d    e    f    g
   10869             :     //     h    i    j    k
   10870             :     //     n    p    q    r
   10871             : 
   10872           0 :     switch (quarterPelRefinementMethod) {
   10873           0 :     case EB_QUARTER_IN_FULL:
   10874             : 
   10875           0 :         /*c=b+A*/ buf1[0] = pos_b;                     buf1Stride[0] = Stride;        buf2[0] = pos_Full;             buf2Stride[0] = FullStride;
   10876           0 :         /*a=A+b*/ buf1[1] = pos_Full;                  buf1Stride[1] = FullStride;    buf2[1] = pos_b + 1;             buf2Stride[1] = Stride;
   10877           0 :         /*n=h+A*/ buf1[2] = pos_h;                      buf1Stride[2] = Stride;        buf2[2] = pos_Full;              buf2Stride[2] = FullStride;
   10878           0 :         /*d=A+h*/ buf1[3] = pos_Full;                   buf1Stride[3] = FullStride;    buf2[3] = pos_h + Stride;        buf2Stride[3] = Stride;
   10879           0 :         /*r=b+h*/ buf1[4] = pos_b;                      buf1Stride[4] = Stride;        buf2[4] = pos_h;                 buf2Stride[4] = Stride;
   10880           0 :         /*p=h+b*/ buf1[5] = pos_h;                      buf1Stride[5] = Stride;        buf2[5] = pos_b + 1;             buf2Stride[5] = Stride;
   10881           0 :         /*e=h+b*/ buf1[6] = pos_h + Stride;             buf1Stride[6] = Stride;        buf2[6] = pos_b + 1;             buf2Stride[6] = Stride;
   10882           0 :         /*g=b+h*/ buf1[7] = pos_b;                      buf1Stride[7] = Stride;        buf2[7] = pos_h + Stride;        buf2Stride[7] = Stride;
   10883             : 
   10884           0 :         break;
   10885             : 
   10886           0 :     case EB_QUARTER_IN_HALF_HORIZONTAL:
   10887             : 
   10888           0 :         /*a=A+b*/ buf1[0] = pos_Full - 1;               buf1Stride[0] = FullStride;    buf2[0] = pos_b;                buf2Stride[0] = Stride;
   10889           0 :         /*c=b+A*/ buf1[1] = pos_b;                     buf1Stride[1] = Stride;        buf2[1] = pos_Full;             buf2Stride[1] = FullStride;
   10890           0 :         /*q=j+b*/ buf1[2] = pos_j;                     buf1Stride[2] = Stride;        buf2[2] = pos_b;                buf2Stride[2] = Stride;
   10891           0 :         /*f=b+j*/ buf1[3] = pos_b;                     buf1Stride[3] = Stride;        buf2[3] = pos_j + Stride;        buf2Stride[3] = Stride;
   10892           0 :         /*p=h+b*/ buf1[4] = pos_h - 1;                  buf1Stride[4] = Stride;        buf2[4] = pos_b;                buf2Stride[4] = Stride;
   10893           0 :         /*r=b+h*/ buf1[5] = pos_b;                     buf1Stride[5] = Stride;        buf2[5] = pos_h;                buf2Stride[5] = Stride;
   10894           0 :         /*g=b+h*/ buf1[6] = pos_b;                     buf1Stride[6] = Stride;        buf2[6] = pos_h + Stride;        buf2Stride[6] = Stride;
   10895           0 :         /*e=h+b*/ buf1[7] = pos_h - 1 + Stride;         buf1Stride[7] = Stride;        buf2[7] = pos_b;                buf2Stride[7] = Stride;
   10896             : 
   10897           0 :         break;
   10898             : 
   10899           0 :     case EB_QUARTER_IN_HALF_VERTICAL:
   10900             : 
   10901           0 :         /*k=j+h*/buf1[0] = pos_j;                      buf1Stride[0] = Stride;        buf2[0] = pos_h;                 buf2Stride[0] = Stride;
   10902           0 :         /*i=h+j*/buf1[1] = pos_h;                      buf1Stride[1] = Stride;        buf2[1] = pos_j + 1;              buf2Stride[1] = Stride;
   10903           0 :         /*d=A+h*/buf1[2] = pos_Full - FullStride;      buf1Stride[2] = FullStride;    buf2[2] = pos_h;                  buf2Stride[2] = Stride;
   10904           0 :         /*n=h+A*/buf1[3] = pos_h;                       buf1Stride[3] = Stride;        buf2[3] = pos_Full;               buf2Stride[3] = FullStride;
   10905           0 :         /*g=b+h*/buf1[4] = pos_b - Stride;              buf1Stride[4] = Stride;        buf2[4] = pos_h;                  buf2Stride[4] = Stride;
   10906           0 :         /*e=h+b*/buf1[5] = pos_h;                      buf1Stride[5] = Stride;        buf2[5] = pos_b + 1 - Stride;     buf2Stride[5] = Stride;
   10907           0 :         /*p=h+b*/buf1[6] = pos_h;                      buf1Stride[6] = Stride;        buf2[6] = pos_b + 1;              buf2Stride[6] = Stride;
   10908           0 :         /*r=b+h*/buf1[7] = pos_b;                      buf1Stride[7] = Stride;        buf2[7] = pos_h;                 buf2Stride[7] = Stride;
   10909             : 
   10910           0 :         break;
   10911             : 
   10912           0 :     case EB_QUARTER_IN_HALF_DIAGONAL:
   10913             : 
   10914           0 :         /*i=h+j*/buf1[0] = pos_h - 1;                   buf1Stride[0] = Stride;        buf2[0] = pos_j;                  buf2Stride[0] = Stride;
   10915           0 :         /*k=j+h*/buf1[1] = pos_j;                       buf1Stride[1] = Stride;        buf2[1] = pos_h;                  buf2Stride[1] = Stride;
   10916           0 :         /*f=b+j*/buf1[2] = pos_b - Stride;              buf1Stride[2] = Stride;        buf2[2] = pos_j;                  buf2Stride[2] = Stride;
   10917           0 :         /*q=j+b*/buf1[3] = pos_j;                       buf1Stride[3] = Stride;        buf2[3] = pos_b;                  buf2Stride[3] = Stride;
   10918           0 :         /*e=h+b*/buf1[4] = pos_h - 1;                   buf1Stride[4] = Stride;        buf2[4] = pos_b - Stride;         buf2Stride[4] = Stride;
   10919           0 :         /*g=b+h*/buf1[5] = pos_b - Stride;              buf1Stride[5] = Stride;        buf2[5] = pos_h;                  buf2Stride[5] = Stride;
   10920           0 :         /*r=b+h*/buf1[6] = pos_b;                      buf1Stride[6] = Stride;        buf2[6] = pos_h;                  buf2Stride[6] = Stride;
   10921           0 :         /*p=h+b*/buf1[7] = pos_h - 1;                   buf1Stride[7] = Stride;        buf2[7] = pos_b;                  buf2Stride[7] = Stride;
   10922             : 
   10923           0 :         break;
   10924             : 
   10925           0 :     default:
   10926           0 :         break; // taken only when none of the four EB_QUARTER_* labels match; the output arrays are then left unwritten
   10927             :     }
   10928             : 
   10929           0 :     return;
   10930             : }
   10931             : 
   10932             : /***************************************************************
   10933             : * in_loop_me_quarterpel_search_sblock
   10934             : *   perform the quarter-pel refinement for the whole super-block
   10935             : ***************************************************************/
   10936           0 : static void in_loop_me_quarterpel_search_sblock(
   10937             :     SsMeContext                *context_ptr,                     //[IN/OUT]  ME context Ptr, used to get/update ME results
   10938             :     uint8_t                        *pos_Full,                       //[IN]
   10939             :     uint32_t                        full_stride,                      //[IN]
   10940             :     uint8_t                        *pos_b,                          //[IN]
   10941             :     uint8_t                        *pos_h,                          //[IN]
   10942             :     uint8_t                        *pos_j,                          //[IN]
   10943             :     int16_t                        x_search_area_origin,            //[IN] search area origin in the horizontal direction, used to point to reference samples
   10944             :     int16_t                        y_search_area_origin)               //[IN] search area origin in the vertical direction, used to point to reference samples
   10945             : {
   10946             :     uint32_t  block_index;
   10947             : 
   10948             :     uint32_t  block_shift_x;
   10949             :     uint32_t  block_shift_y;
   10950             : 
   10951             :     uint32_t  block_index_in_sb_buffer;
   10952             : 
   10953             :     //for each one of the 8 positions, we need to determine the 2 buffers to  do averaging
   10954             :     uint8_t  *buf1[8];
   10955             :     uint8_t  *buf2[8];
   10956             : 
   10957             :     uint32_t  buf1Stride[8];
   10958             :     uint32_t  buf2Stride[8];
   10959             : 
   10960             :     int16_t  x_mv, y_mv;
   10961             :     uint32_t  nidx;
   10962             : 
   10963           0 :     uint32_t quad_index = 0;
   10964           0 :     uint32_t block_offset = 0;
   10965           0 :     uint32_t x_offset = 0;
   10966           0 :     uint32_t y_offset = 0;
   10967           0 :     uint32_t number_of_sb_quad = context_ptr->sb_size == BLOCK_128X128 ? 4 : 1;
   10968             : 
   10969             :     // 4x4   [256 partitions]
   10970             : 
   10971           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   10972           0 :         for (block_index = 0; block_index < 256; ++block_index) {
   10973           0 :             block_offset = (quad_index * 256);
   10974           0 :             x_offset = (quad_index & 0x01) << 6;
   10975           0 :             y_offset = (quad_index >> 1) << 6;
   10976           0 :             nidx = tab4x4[block_index] + block_offset;
   10977           0 :             block_shift_x = ((block_index & 0xf) << 2) + x_offset;
   10978           0 :             block_shift_y = ((block_index >> 4) << 2) + y_offset;
   10979             : 
   10980           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   10981             : 
   10982           0 :             x_mv = _MVXT(context_ptr->p_best_mv4x4[nidx]);
   10983           0 :             y_mv = _MVYT(context_ptr->p_best_mv4x4[nidx]);
   10984             : 
   10985           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   10986             :                 pos_Full,
   10987             :                 full_stride,
   10988             :                 pos_b,
   10989             :                 pos_h,
   10990             :                 pos_j,
   10991             :                 context_ptr->interpolated_stride,
   10992             :                 x_mv,
   10993             :                 y_mv,
   10994             :                 buf1, buf1Stride,
   10995             :                 buf2, buf2Stride);
   10996             : 
   10997           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   10998           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   10999           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11000           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11001           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11002           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11003           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11004           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11005             : 
   11006           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11007             :                 context_ptr,
   11008             :                 block_index_in_sb_buffer,
   11009             :                 buf1, buf1Stride,
   11010             :                 buf2, buf2Stride,
   11011             :                 4, 4,
   11012             :                 x_search_area_origin,
   11013             :                 y_search_area_origin,
   11014           0 :                 &context_ptr->p_best_sad4x4[nidx],
   11015           0 :                 &context_ptr->p_best_mv4x4[nidx],
   11016           0 :                 context_ptr->psub_pel_direction4x4[nidx]);
   11017             :         }
   11018             :     }
   11019             : 
   11020             :     // 8x4   [128 8x4 blocks]
   11021             : 
   11022           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11023           0 :         for (block_index = 0; block_index < 128; ++block_index) {
   11024           0 :             block_offset = (quad_index * 128);
   11025           0 :             x_offset = (quad_index & 0x01) << 6;
   11026           0 :             y_offset = (quad_index >> 1) << 6;
   11027           0 :             nidx = tab8x4[block_index] + block_offset;
   11028           0 :             block_shift_x = ((block_index & 0x07) << 3) + x_offset;
   11029           0 :             block_shift_y = ((block_index >> 3) << 2) + y_offset;
   11030             : 
   11031           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11032             : 
   11033           0 :             x_mv = _MVXT(context_ptr->p_best_mv8x4[nidx]);
   11034           0 :             y_mv = _MVYT(context_ptr->p_best_mv8x4[nidx]);
   11035             : 
   11036           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11037             :                 pos_Full,
   11038             :                 full_stride,
   11039             :                 pos_b,
   11040             :                 pos_h,
   11041             :                 pos_j,
   11042             :                 context_ptr->interpolated_stride,
   11043             :                 x_mv,
   11044             :                 y_mv,
   11045             :                 buf1, buf1Stride,
   11046             :                 buf2, buf2Stride);
   11047             : 
   11048           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11049           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11050           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11051           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11052           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11053           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11054           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11055           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11056             : 
   11057           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11058             :                 context_ptr,
   11059             :                 block_index_in_sb_buffer,
   11060             :                 buf1, buf1Stride,
   11061             :                 buf2, buf2Stride,
   11062             :                 8, 4,
   11063             :                 x_search_area_origin,
   11064             :                 y_search_area_origin,
   11065           0 :                 &context_ptr->p_best_sad8x4[nidx],
   11066           0 :                 &context_ptr->p_best_mv8x4[nidx],
   11067           0 :                 context_ptr->psub_pel_direction8x4[nidx]);
   11068             :         }
   11069             :     }
   11070             : 
   11071             :     // 4x8   [128 4x8 blocks per 64x64 quadrant]
   11072           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11073           0 :         for (block_index = 0; block_index < 128; ++block_index) {
   11074           0 :             block_offset = (quad_index * 128);
   11075           0 :             x_offset = (quad_index & 0x01) << 6;
   11076           0 :             y_offset = (quad_index >> 1) << 6;
   11077           0 :             nidx = tab4x8[block_index] + block_offset;
   11078           0 :             block_shift_x = ((block_index & 0xf) << 2) + x_offset;
   11079           0 :             block_shift_y = ((block_index >> 4) << 3) + y_offset;
   11080             : 
   11081           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11082             : 
   11083           0 :             x_mv = _MVXT(context_ptr->p_best_mv4x8[nidx]);
   11084           0 :             y_mv = _MVYT(context_ptr->p_best_mv4x8[nidx]);
   11085             : 
   11086           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11087             :                 pos_Full,
   11088             :                 full_stride,
   11089             :                 pos_b,
   11090             :                 pos_h,
   11091             :                 pos_j,
   11092             :                 context_ptr->interpolated_stride,
   11093             :                 x_mv,
   11094             :                 y_mv,
   11095             :                 buf1, buf1Stride,
   11096             :                 buf2, buf2Stride);
   11097             : 
   11098           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11099           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11100           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11101           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11102           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11103           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11104           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11105           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11106             : 
   11107           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11108             :                 context_ptr,
   11109             :                 block_index_in_sb_buffer,
   11110             :                 buf1, buf1Stride,
   11111             :                 buf2, buf2Stride,
   11112             :                 4, 8,
   11113             :                 x_search_area_origin,
   11114             :                 y_search_area_origin,
   11115           0 :                 &context_ptr->p_best_sad4x8[nidx],
   11116           0 :                 &context_ptr->p_best_mv4x8[nidx],
   11117           0 :                 context_ptr->psub_pel_direction4x8[nidx]);
   11118             :         }
   11119             :     }
   11120             : 
   11121             :     // 8x8   [64 8x8 blocks per 64x64 quadrant]
   11122             : 
   11123           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11124           0 :         for (block_index = 0; block_index < 64; ++block_index) {
   11125           0 :             block_offset = (quad_index * 64);
   11126           0 :             x_offset = (quad_index & 0x01) << 6;
   11127           0 :             y_offset = (quad_index >> 1) << 6;
   11128           0 :             nidx = tab8x8[block_index] + block_offset;
   11129           0 :             block_shift_x = ((block_index & 0x07) << 3) + x_offset;
   11130           0 :             block_shift_y = ((block_index >> 3) << 3) + y_offset;
   11131             : 
   11132           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11133             : 
   11134           0 :             x_mv = _MVXT(context_ptr->p_best_mv8x8[nidx]);
   11135           0 :             y_mv = _MVYT(context_ptr->p_best_mv8x8[nidx]);
   11136             : 
   11137           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11138             :                 pos_Full,
   11139             :                 full_stride,
   11140             :                 pos_b,
   11141             :                 pos_h,
   11142             :                 pos_j,
   11143             :                 context_ptr->interpolated_stride,
   11144             :                 x_mv,
   11145             :                 y_mv,
   11146             :                 buf1, buf1Stride,
   11147             :                 buf2, buf2Stride);
   11148             : 
   11149           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11150           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11151           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11152           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11153           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11154           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11155           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11156           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11157             : 
   11158           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11159             :                 context_ptr,
   11160             :                 block_index_in_sb_buffer,
   11161             :                 buf1, buf1Stride,
   11162             :                 buf2, buf2Stride,
   11163             :                 8, 8,
   11164             :                 x_search_area_origin,
   11165             :                 y_search_area_origin,
   11166           0 :                 &context_ptr->p_best_sad8x8[nidx],
   11167           0 :                 &context_ptr->p_best_mv8x8[nidx],
   11168           0 :                 context_ptr->psub_pel_direction8x8[nidx]);
   11169             :         }
   11170             :     }
   11171             : 
   11172             :     // 16x8 [32 partitions per 64x64 quadrant]
   11173             : 
   11174           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11175           0 :         for (block_index = 0; block_index < 32; ++block_index) {
   11176           0 :             block_offset = (quad_index * 32);
   11177           0 :             x_offset = (quad_index & 0x01) << 6;
   11178           0 :             y_offset = (quad_index >> 1) << 6;
   11179           0 :             nidx = tab16x8[block_index] + block_offset;
   11180           0 :             block_shift_x = ((block_index & 0x03) << 4) + x_offset;
   11181           0 :             block_shift_y = ((block_index >> 2) << 3) + y_offset;
   11182             : 
   11183           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11184             : 
   11185           0 :             x_mv = _MVXT(context_ptr->p_best_mv16x8[nidx]);
   11186           0 :             y_mv = _MVYT(context_ptr->p_best_mv16x8[nidx]);
   11187             : 
   11188           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11189             :                 pos_Full,
   11190             :                 full_stride,
   11191             :                 pos_b,
   11192             :                 pos_h,
   11193             :                 pos_j,
   11194             :                 context_ptr->interpolated_stride,
   11195             :                 x_mv,
   11196             :                 y_mv,
   11197             :                 buf1, buf1Stride,
   11198             :                 buf2, buf2Stride);
   11199             : 
   11200           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11201           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11202           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11203           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11204           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11205           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11206           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11207           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11208             : 
   11209           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11210             :                 context_ptr,
   11211             :                 block_index_in_sb_buffer,
   11212             :                 buf1, buf1Stride,
   11213             :                 buf2, buf2Stride,
   11214             :                 16, 8,
   11215             :                 x_search_area_origin,
   11216             :                 y_search_area_origin,
   11217           0 :                 &context_ptr->p_best_sad16x8[nidx],
   11218           0 :                 &context_ptr->p_best_mv16x8[nidx],
   11219           0 :                 context_ptr->psub_pel_direction16x8[nidx]);
   11220             :         }
   11221             :     }
   11222             : 
   11223             :     // 8x16 [32 partitions per 64x64 quadrant]
   11224             : 
   11225           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11226           0 :         for (block_index = 0; block_index < 32; ++block_index) {
   11227           0 :             block_offset = (quad_index * 32);
   11228           0 :             x_offset = (quad_index & 0x01) << 6;
   11229           0 :             y_offset = (quad_index >> 1) << 6;
   11230           0 :             nidx = tab8x16[block_index] + block_offset;
   11231           0 :             block_shift_x = ((block_index & 0x07) << 3) + x_offset;
   11232           0 :             block_shift_y = ((block_index >> 3) << 4) + y_offset;
   11233             : 
   11234           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11235             : 
   11236           0 :             x_mv = _MVXT(context_ptr->p_best_mv8x16[nidx]);
   11237           0 :             y_mv = _MVYT(context_ptr->p_best_mv8x16[nidx]);
   11238             : 
   11239           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11240             :                 pos_Full,
   11241             :                 full_stride,
   11242             :                 pos_b,
   11243             :                 pos_h,
   11244             :                 pos_j,
   11245             :                 context_ptr->interpolated_stride,
   11246             :                 x_mv,
   11247             :                 y_mv,
   11248             :                 buf1, buf1Stride,
   11249             :                 buf2, buf2Stride);
   11250             : 
   11251           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11252           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11253           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11254           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11255           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11256           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11257           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11258           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11259             : 
   11260           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11261             :                 context_ptr,
   11262             :                 block_index_in_sb_buffer,
   11263             :                 buf1, buf1Stride,
   11264             :                 buf2, buf2Stride,
   11265             :                 8, 16,
   11266             :                 x_search_area_origin,
   11267             :                 y_search_area_origin,
   11268           0 :                 &context_ptr->p_best_sad8x16[nidx],
   11269           0 :                 &context_ptr->p_best_mv8x16[nidx],
   11270           0 :                 context_ptr->psub_pel_direction8x16[nidx]);
   11271             :         }
   11272             :     }
   11273             : 
   11274             :     // 32x8 [16 partitions per 64x64 quadrant]
   11275           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11276           0 :         for (block_index = 0; block_index < 16; ++block_index) {
   11277           0 :             block_offset = (quad_index * 16);
   11278           0 :             x_offset = (quad_index & 0x01) << 6;
   11279           0 :             y_offset = (quad_index >> 1) << 6;
   11280           0 :             nidx = tab32x8[block_index] + block_offset;
   11281           0 :             block_shift_x = ((block_index & 0x01) << 5) + x_offset;
   11282           0 :             block_shift_y = ((block_index >> 1) << 3) + y_offset;
   11283             : 
   11284           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11285             : 
   11286           0 :             x_mv = _MVXT(context_ptr->p_best_mv32x8[nidx]);
   11287           0 :             y_mv = _MVYT(context_ptr->p_best_mv32x8[nidx]);
   11288             : 
   11289           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11290             :                 pos_Full,
   11291             :                 full_stride,
   11292             :                 pos_b,
   11293             :                 pos_h,
   11294             :                 pos_j,
   11295             :                 context_ptr->interpolated_stride,
   11296             :                 x_mv,
   11297             :                 y_mv,
   11298             :                 buf1, buf1Stride,
   11299             :                 buf2, buf2Stride);
   11300             : 
   11301           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11302           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11303           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11304           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11305           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11306           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11307           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11308           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11309             : 
   11310           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11311             :                 context_ptr,
   11312             :                 block_index_in_sb_buffer,
   11313             :                 buf1, buf1Stride,
   11314             :                 buf2, buf2Stride,
   11315             :                 32, 8,
   11316             :                 x_search_area_origin,
   11317             :                 y_search_area_origin,
   11318           0 :                 &context_ptr->p_best_sad32x8[nidx],
   11319           0 :                 &context_ptr->p_best_mv32x8[nidx],
   11320           0 :                 context_ptr->psub_pel_direction32x8[nidx]);
   11321             :         }
   11322             :     }
   11323             : 
   11324             :     // 8x32 [16 partitions per 64x64 quadrant]
   11325           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11326           0 :         for (block_index = 0; block_index < 16; ++block_index) {
   11327           0 :             block_offset = (quad_index * 16);
   11328           0 :             x_offset = (quad_index & 0x01) << 6;
   11329           0 :             y_offset = (quad_index >> 1) << 6;
   11330           0 :             nidx = tab8x32[block_index] + block_offset;
   11331           0 :             block_shift_x = ((block_index & 0x07) << 3) + x_offset;
   11332           0 :             block_shift_y = ((block_index >> 3) << 5) + y_offset;
   11333             : 
   11334           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11335             : 
   11336           0 :             x_mv = _MVXT(context_ptr->p_best_mv8x32[nidx]);
   11337           0 :             y_mv = _MVYT(context_ptr->p_best_mv8x32[nidx]);
   11338             : 
   11339           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11340             :                 pos_Full,
   11341             :                 full_stride,
   11342             :                 pos_b,
   11343             :                 pos_h,
   11344             :                 pos_j,
   11345             :                 context_ptr->interpolated_stride,
   11346             :                 x_mv,
   11347             :                 y_mv,
   11348             :                 buf1, buf1Stride,
   11349             :                 buf2, buf2Stride);
   11350             : 
   11351           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11352           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11353           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11354           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11355           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11356           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11357           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11358           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11359             : 
   11360           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11361             :                 context_ptr,
   11362             :                 block_index_in_sb_buffer,
   11363             :                 buf1, buf1Stride,
   11364             :                 buf2, buf2Stride,
   11365             :                 8, 32,
   11366             :                 x_search_area_origin,
   11367             :                 y_search_area_origin,
   11368           0 :                 &context_ptr->p_best_sad8x32[nidx],
   11369           0 :                 &context_ptr->p_best_mv8x32[nidx],
   11370           0 :                 context_ptr->psub_pel_direction8x32[nidx]);
   11371             :         }
   11372             :     }
   11373             : 
   11374             :     // 16x16 [16 partitions per 64x64 quadrant]
   11375           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11376           0 :         for (block_index = 0; block_index < 16; ++block_index) {
   11377           0 :             block_offset = (quad_index * 16);
   11378           0 :             x_offset = (quad_index & 0x01) << 6;
   11379           0 :             y_offset = (quad_index >> 1) << 6;
   11380           0 :             nidx = tab16x16[block_index] + block_offset;
   11381           0 :             block_shift_x = ((block_index & 0x03) << 4) + x_offset;
   11382           0 :             block_shift_y = ((block_index >> 2) << 4) + y_offset;
   11383             : 
   11384           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11385             : 
   11386           0 :             x_mv = _MVXT(context_ptr->p_best_mv16x16[nidx]);
   11387           0 :             y_mv = _MVYT(context_ptr->p_best_mv16x16[nidx]);
   11388             : 
   11389           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11390             :                 pos_Full,
   11391             :                 full_stride,
   11392             :                 pos_b,
   11393             :                 pos_h,
   11394             :                 pos_j,
   11395             :                 context_ptr->interpolated_stride,
   11396             :                 x_mv,
   11397             :                 y_mv,
   11398             :                 buf1, buf1Stride,
   11399             :                 buf2, buf2Stride);
   11400             : 
   11401           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11402           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11403           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11404           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11405           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11406           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11407           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11408           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11409             : 
   11410           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11411             :                 context_ptr,
   11412             :                 block_index_in_sb_buffer,
   11413             :                 buf1, buf1Stride,
   11414             :                 buf2, buf2Stride,
   11415             :                 16, 16,
   11416             :                 x_search_area_origin,
   11417             :                 y_search_area_origin,
   11418           0 :                 &context_ptr->p_best_sad16x16[nidx],
   11419           0 :                 &context_ptr->p_best_mv16x16[nidx],
   11420           0 :                 context_ptr->psub_pel_direction16x16[nidx]);
   11421             :         }
   11422             :     }
   11423             : 
   11424             :     // 32x16 [8 partitions per 64x64 quadrant]
   11425             : 
   11426           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11427           0 :         for (block_index = 0; block_index < 8; ++block_index) {
   11428           0 :             block_offset = (quad_index * 8);
   11429           0 :             x_offset = (quad_index & 0x01) << 6;
   11430           0 :             y_offset = (quad_index >> 1) << 6;
   11431           0 :             nidx = tab32x16[block_index] + block_offset;
   11432           0 :             block_shift_x = ((block_index & 0x01) << 5) + x_offset;
   11433           0 :             block_shift_y = ((block_index >> 1) << 4) + y_offset;
   11434             : 
   11435           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11436             : 
   11437           0 :             x_mv = _MVXT(context_ptr->p_best_mv32x16[nidx]);
   11438           0 :             y_mv = _MVYT(context_ptr->p_best_mv32x16[nidx]);
   11439             : 
   11440           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11441             :                 pos_Full,
   11442             :                 full_stride,
   11443             :                 pos_b,
   11444             :                 pos_h,
   11445             :                 pos_j,
   11446             :                 context_ptr->interpolated_stride,
   11447             :                 x_mv,
   11448             :                 y_mv,
   11449             :                 buf1, buf1Stride,
   11450             :                 buf2, buf2Stride);
   11451             : 
   11452           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11453           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11454           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11455           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11456           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11457           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11458           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11459           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11460             : 
   11461           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11462             :                 context_ptr,
   11463             :                 block_index_in_sb_buffer,
   11464             :                 buf1, buf1Stride,
   11465             :                 buf2, buf2Stride,
   11466             :                 32, 16,
   11467             :                 x_search_area_origin,
   11468             :                 y_search_area_origin,
   11469           0 :                 &context_ptr->p_best_sad32x16[nidx],
   11470           0 :                 &context_ptr->p_best_mv32x16[nidx],
   11471           0 :                 context_ptr->psub_pel_direction32x16[nidx]);
   11472             :         }
   11473             :     }
   11474             : 
   11475             :     // 16x32 [8 partitions]
   11476           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11477           0 :         for (block_index = 0; block_index < 8; ++block_index) {
   11478           0 :             block_offset = (quad_index * 8);
   11479           0 :             x_offset = (quad_index & 0x01) << 6;
   11480           0 :             y_offset = (quad_index >> 1) << 6;
   11481           0 :             nidx = tab16x32[block_index] + block_offset;
   11482           0 :             block_shift_x = ((block_index & 0x03) << 4) + x_offset;
   11483           0 :             block_shift_y = ((block_index >> 2) << 5) + y_offset;
   11484             : 
   11485           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11486             : 
   11487           0 :             x_mv = _MVXT(context_ptr->p_best_mv16x32[nidx]);
   11488           0 :             y_mv = _MVYT(context_ptr->p_best_mv16x32[nidx]);
   11489             : 
   11490           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11491             :                 pos_Full,
   11492             :                 full_stride,
   11493             :                 pos_b,
   11494             :                 pos_h,
   11495             :                 pos_j,
   11496             :                 context_ptr->interpolated_stride,
   11497             :                 x_mv,
   11498             :                 y_mv,
   11499             :                 buf1, buf1Stride,
   11500             :                 buf2, buf2Stride);
   11501             : 
   11502           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11503           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11504           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11505           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11506           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11507           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11508           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11509           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11510             : 
   11511           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11512             :                 context_ptr,
   11513             :                 block_index_in_sb_buffer,
   11514             :                 buf1, buf1Stride,
   11515             :                 buf2, buf2Stride,
   11516             :                 16, 32,
   11517             :                 x_search_area_origin,
   11518             :                 y_search_area_origin,
   11519           0 :                 &context_ptr->p_best_sad16x32[nidx],
   11520           0 :                 &context_ptr->p_best_mv16x32[nidx],
   11521           0 :                 context_ptr->psub_pel_direction16x32[nidx]);
   11522             :         }
   11523             :     }
   11524             : 
   11525             :     // 32x32 [4 partitions]
   11526           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11527           0 :         for (block_index = 0; block_index < 4; ++block_index) {
   11528           0 :             block_offset = (quad_index * 4);
   11529           0 :             x_offset = (quad_index & 0x01) << 6;
   11530           0 :             y_offset = (quad_index >> 1) << 6;
   11531           0 :             nidx = tab32x32[block_index] + block_offset;
   11532           0 :             block_shift_x = ((block_index & 0x01) << 5) + x_offset;
   11533           0 :             block_shift_y = ((block_index >> 1)) + y_offset;
   11534           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11535             : 
   11536           0 :             x_mv = _MVXT(context_ptr->p_best_mv32x32[nidx]);
   11537           0 :             y_mv = _MVYT(context_ptr->p_best_mv32x32[nidx]);
   11538             : 
   11539           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11540             :                 pos_Full,
   11541             :                 full_stride,
   11542             :                 pos_b,
   11543             :                 pos_h,
   11544             :                 pos_j,
   11545             :                 context_ptr->interpolated_stride,
   11546             :                 x_mv,
   11547             :                 y_mv,
   11548             :                 buf1, buf1Stride,
   11549             :                 buf2, buf2Stride);
   11550             : 
   11551           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11552           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11553           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11554           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11555           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11556           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11557           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11558           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11559             : 
   11560           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11561             :                 context_ptr,
   11562             :                 block_index_in_sb_buffer,
   11563             :                 buf1, buf1Stride,
   11564             :                 buf2, buf2Stride,
   11565             :                 32, 32,
   11566             :                 x_search_area_origin,
   11567             :                 y_search_area_origin,
   11568           0 :                 &context_ptr->p_best_sad32x32[nidx],
   11569           0 :                 &context_ptr->p_best_mv32x32[nidx],
   11570           0 :                 context_ptr->psub_pel_direction32x32[nidx]);
   11571             :         }
   11572             :     }
   11573             : 
   11574             :     // 64x32 [2 partitions]
   11575           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11576           0 :         for (block_index = 0; block_index < 2; ++block_index) {
   11577           0 :             block_offset = (quad_index * 2);
   11578           0 :             x_offset = (quad_index & 0x01) << 6;
   11579           0 :             y_offset = (quad_index >> 1) << 6;
   11580           0 :             nidx = tab64x32[block_index] + block_offset;
   11581           0 :             block_shift_x = x_offset;
   11582           0 :             block_shift_y = (block_index << 5) + y_offset;
   11583             : 
   11584           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11585             : 
   11586           0 :             x_mv = _MVXT(context_ptr->p_best_mv64x32[nidx]);
   11587           0 :             y_mv = _MVYT(context_ptr->p_best_mv64x32[nidx]);
   11588             : 
   11589           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11590             :                 pos_Full,
   11591             :                 full_stride,
   11592             :                 pos_b,
   11593             :                 pos_h,
   11594             :                 pos_j,
   11595             :                 context_ptr->interpolated_stride,
   11596             :                 x_mv,
   11597             :                 y_mv,
   11598             :                 buf1, buf1Stride,
   11599             :                 buf2, buf2Stride);
   11600             : 
   11601           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11602           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11603           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11604           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11605           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11606           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11607           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11608           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11609             : 
   11610           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11611             :                 context_ptr,
   11612             :                 block_index_in_sb_buffer,
   11613             :                 buf1, buf1Stride,
   11614             :                 buf2, buf2Stride,
   11615             :                 64, 32,
   11616             :                 x_search_area_origin,
   11617             :                 y_search_area_origin,
   11618           0 :                 &context_ptr->p_best_sad64x32[nidx],
   11619           0 :                 &context_ptr->p_best_mv64x32[nidx],
   11620           0 :                 context_ptr->psub_pel_direction64x32[nidx]);
   11621             :         }
   11622             :     }
   11623             : 
   11624             :     // 32x64 [2 partitions]
   11625           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11626           0 :         for (block_index = 0; block_index < 2; ++block_index) {
   11627           0 :             block_offset = (quad_index * 2);
   11628           0 :             x_offset = (quad_index & 0x01) << 6;
   11629           0 :             y_offset = (quad_index >> 1) << 6;
   11630           0 :             nidx = tab32x64[block_index] + block_offset;
   11631           0 :             block_shift_x = (block_index << 5) + x_offset;
   11632           0 :             block_shift_y = y_offset;
   11633             : 
   11634           0 :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11635             : 
   11636           0 :             x_mv = _MVXT(context_ptr->p_best_mv32x64[nidx]);
   11637           0 :             y_mv = _MVYT(context_ptr->p_best_mv32x64[nidx]);
   11638             : 
   11639           0 :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11640             :                 pos_Full,
   11641             :                 full_stride,
   11642             :                 pos_b,
   11643             :                 pos_h,
   11644             :                 pos_j,
   11645             :                 context_ptr->interpolated_stride,
   11646             :                 x_mv,
   11647             :                 y_mv,
   11648             :                 buf1, buf1Stride,
   11649             :                 buf2, buf2Stride);
   11650             : 
   11651           0 :             buf1[0] = buf1[0] + block_shift_x + block_shift_y * buf1Stride[0];              buf2[0] = buf2[0] + block_shift_x + block_shift_y * buf2Stride[0];
   11652           0 :             buf1[1] = buf1[1] + block_shift_x + block_shift_y * buf1Stride[1];              buf2[1] = buf2[1] + block_shift_x + block_shift_y * buf2Stride[1];
   11653           0 :             buf1[2] = buf1[2] + block_shift_x + block_shift_y * buf1Stride[2];              buf2[2] = buf2[2] + block_shift_x + block_shift_y * buf2Stride[2];
   11654           0 :             buf1[3] = buf1[3] + block_shift_x + block_shift_y * buf1Stride[3];              buf2[3] = buf2[3] + block_shift_x + block_shift_y * buf2Stride[3];
   11655           0 :             buf1[4] = buf1[4] + block_shift_x + block_shift_y * buf1Stride[4];              buf2[4] = buf2[4] + block_shift_x + block_shift_y * buf2Stride[4];
   11656           0 :             buf1[5] = buf1[5] + block_shift_x + block_shift_y * buf1Stride[5];              buf2[5] = buf2[5] + block_shift_x + block_shift_y * buf2Stride[5];
   11657           0 :             buf1[6] = buf1[6] + block_shift_x + block_shift_y * buf1Stride[6];              buf2[6] = buf2[6] + block_shift_x + block_shift_y * buf2Stride[6];
   11658           0 :             buf1[7] = buf1[7] + block_shift_x + block_shift_y * buf1Stride[7];              buf2[7] = buf2[7] + block_shift_x + block_shift_y * buf2Stride[7];
   11659             : 
   11660           0 :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11661             :                 context_ptr,
   11662             :                 block_index_in_sb_buffer,
   11663             :                 buf1, buf1Stride,
   11664             :                 buf2, buf2Stride,
   11665             :                 32, 64,
   11666             :                 x_search_area_origin,
   11667             :                 y_search_area_origin,
   11668           0 :                 &context_ptr->p_best_sad32x64[nidx],
   11669           0 :                 &context_ptr->p_best_mv32x64[nidx],
   11670           0 :                 context_ptr->psub_pel_direction32x64[nidx]);
   11671             :         }
   11672             :     }
   11673             : 
   11674             :     // 64x64 [1 partitions]
   11675           0 :     for (quad_index = 0; quad_index < number_of_sb_quad; quad_index++) {
   11676           0 :         block_index = 0;
   11677             : 
   11678           0 :         block_offset = quad_index;
   11679           0 :         x_offset = (quad_index & 0x01) << 6;
   11680           0 :         y_offset = (quad_index >> 1) << 6;
   11681           0 :         nidx = block_offset;
   11682           0 :         block_shift_x = x_offset;
   11683           0 :         block_shift_y = y_offset;
   11684             : 
   11685           0 :         block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11686             : 
   11687           0 :         x_mv = _MVXT(context_ptr->p_best_mv64x64[nidx]);
   11688           0 :         y_mv = _MVYT(context_ptr->p_best_mv64x64[nidx]);
   11689             : 
   11690           0 :         set_quarterpel_refinement_inputs_on_the_fly_block(
   11691             :             pos_Full,
   11692             :             full_stride,
   11693             :             pos_b,
   11694             :             pos_h,
   11695             :             pos_j,
   11696             :             context_ptr->interpolated_stride,
   11697             :             x_mv,
   11698             :             y_mv,
   11699             :             buf1, buf1Stride,
   11700             :             buf2, buf2Stride);
   11701             : 
   11702           0 :         in_loop_me_quarterpel_refinement_on_the_fly_block(
   11703             :             context_ptr,
   11704             :             block_index_in_sb_buffer,
   11705             :             buf1, buf1Stride,
   11706             :             buf2, buf2Stride,
   11707             :             64, 64,
   11708             :             x_search_area_origin,
   11709             :             y_search_area_origin,
   11710           0 :             &context_ptr->p_best_sad64x64[nidx],
   11711           0 :             &context_ptr->p_best_mv64x64[nidx],
   11712           0 :             context_ptr->psub_pel_direction64x64[nidx]);
   11713             :     }
   11714             : 
   11715             :     if (0) {
   11716             :         // 128x64 [2 partitions]
   11717             :         for (block_index = 0; block_index < 2; ++block_index) {
   11718             :             block_index = 0;
   11719             : 
   11720             :             block_shift_x = 0;
   11721             :             block_shift_y = block_index << 6;
   11722             : 
   11723             :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11724             : 
   11725             :             x_mv = _MVXT(context_ptr->p_best_mv128x64[block_index]);
   11726             :             y_mv = _MVYT(context_ptr->p_best_mv128x64[block_index]);
   11727             : 
   11728             :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11729             :                 pos_Full,
   11730             :                 full_stride,
   11731             :                 pos_b,
   11732             :                 pos_h,
   11733             :                 pos_j,
   11734             :                 context_ptr->interpolated_stride,
   11735             :                 x_mv,
   11736             :                 y_mv,
   11737             :                 buf1, buf1Stride,
   11738             :                 buf2, buf2Stride);
   11739             : 
   11740             :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11741             :                 context_ptr,
   11742             :                 block_index_in_sb_buffer,
   11743             :                 buf1, buf1Stride,
   11744             :                 buf2, buf2Stride,
   11745             :                 128, 64,
   11746             :                 x_search_area_origin,
   11747             :                 y_search_area_origin,
   11748             :                 &context_ptr->p_best_sad128x64[block_index],
   11749             :                 &context_ptr->p_best_mv128x64[block_index],
   11750             :                 context_ptr->psub_pel_direction128x64[block_index]);
   11751             :         }
   11752             :         // 64x128 [2 partitions]
   11753             :         for (block_index = 0; block_index < 2; ++block_index) {
   11754             :             block_index = 0;
   11755             : 
   11756             :             block_shift_x = block_index << 6;
   11757             :             block_shift_y = 0;
   11758             : 
   11759             :             block_index_in_sb_buffer = block_shift_x + block_shift_y * context_ptr->sb_src_stride;
   11760             : 
   11761             :             x_mv = _MVXT(context_ptr->p_best_mv64x128[block_index]);
   11762             :             y_mv = _MVYT(context_ptr->p_best_mv64x128[block_index]);
   11763             : 
   11764             :             set_quarterpel_refinement_inputs_on_the_fly_block(
   11765             :                 pos_Full,
   11766             :                 full_stride,
   11767             :                 pos_b,
   11768             :                 pos_h,
   11769             :                 pos_j,
   11770             :                 context_ptr->interpolated_stride,
   11771             :                 x_mv,
   11772             :                 y_mv,
   11773             :                 buf1, buf1Stride,
   11774             :                 buf2, buf2Stride);
   11775             : 
   11776             :             in_loop_me_quarterpel_refinement_on_the_fly_block(
   11777             :                 context_ptr,
   11778             :                 block_index_in_sb_buffer,
   11779             :                 buf1, buf1Stride,
   11780             :                 buf2, buf2Stride,
   11781             :                 64, 128,
   11782             :                 x_search_area_origin,
   11783             :                 y_search_area_origin,
   11784             :                 &context_ptr->p_best_sad64x128[block_index],
   11785             :                 &context_ptr->p_best_mv64x128[block_index],
   11786             :                 context_ptr->psub_pel_direction64x128[block_index]);
   11787             :         }
   11788             :         // 128x128 [1 partitions]
   11789             :         block_index = 0;
   11790             : 
   11791             :         block_shift_x = 0;
   11792             :         block_shift_y = 0;
   11793             : 
   11794             :         block_index_in_sb_buffer = 0;
   11795             : 
   11796             :         x_mv = _MVXT(context_ptr->p_best_mv128x128[block_index]);
   11797             :         y_mv = _MVYT(context_ptr->p_best_mv128x128[block_index]);
   11798             : 
   11799             :         set_quarterpel_refinement_inputs_on_the_fly_block(
   11800             :             pos_Full,
   11801             :             full_stride,
   11802             :             pos_b,
   11803             :             pos_h,
   11804             :             pos_j,
   11805             :             context_ptr->interpolated_stride,
   11806             :             x_mv,
   11807             :             y_mv,
   11808             :             buf1, buf1Stride,
   11809             :             buf2, buf2Stride);
   11810             : 
   11811             :         in_loop_me_quarterpel_refinement_on_the_fly_block(
   11812             :             context_ptr,
   11813             :             block_index_in_sb_buffer,
   11814             :             buf1, buf1Stride,
   11815             :             buf2, buf2Stride,
   11816             :             128, 128,
   11817             :             x_search_area_origin,
   11818             :             y_search_area_origin,
   11819             :             &context_ptr->p_best_sad128x128[block_index],
   11820             :             &context_ptr->p_best_mv128x128[block_index],
   11821             :             context_ptr->psub_pel_direction128x128);
   11822             :     }
   11823           0 :     return;
   11824             :     }
   11825             : 
   11826             : #define MAX_SEARCH_POINT_WIDTH  128 // per-axis search-point bounds used to size the search area below
   11827             : #define MAX_SEARCH_POINT_HEIGHT 128
   11828             : // Total search-area extent per axis: MAX_SB_SIZE + search-point bound + ME_FILTER_TAP.
   11829             : #define MAX_TATAL_SEARCH_AREA_WIDTH        (MAX_SB_SIZE + MAX_SEARCH_POINT_WIDTH  + ME_FILTER_TAP)
   11830             : #define MAX_TATAL_SEARCH_AREA_HEIGHT       (MAX_SB_SIZE + MAX_SEARCH_POINT_HEIGHT  + ME_FILTER_TAP)
   11831             : // Width * height of the total search area; parenthesized so the product expands as a single operand.
   11832             : #define MAX_SEARCH_AREA_SIZE     (MAX_TATAL_SEARCH_AREA_WIDTH * MAX_TATAL_SEARCH_AREA_HEIGHT)
   11833             : /***************************************************************
   11834             : * in_loop_motion_estimation_sblock
   11835             : *  perform the full-pel search for the whole super-block
   11836             : *  on the reference reconstructed pictures
   11837             : ***************************************************************/
   11838           0 : EB_EXTERN EbErrorType in_loop_motion_estimation_sblock(
   11839             :     PictureControlSet         *picture_control_set_ptr,  // input parameter, Picture Control Set Ptr
   11840             :     uint32_t                       sb_origin_x,            // input parameter, SB Origin X
   11841             :     uint32_t                       sb_origin_y,            // input parameter, SB Origin X
   11842             :     int16_t                       x_mv_l0,
   11843             :     int16_t                       y_mv_l0,
   11844             :     int16_t                       x_mv_l1,
   11845             :     int16_t                       y_mv_l1,
   11846             :     SsMeContext                 *context_ptr)           // input parameter, ME Context Ptr, used to store decimated/interpolated LCU/SR
   11847             : 
   11848             : {
   11849           0 :     EbErrorType return_error = EB_ErrorNone;
   11850             : 
   11851           0 :     SequenceControlSet    *sequence_control_set_ptr = (SequenceControlSet*)picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr;
   11852             : 
   11853             :     int16_t                  xTopLeftSearchRegion;
   11854             :     int16_t                  yTopLeftSearchRegion;
   11855             :     uint32_t                  searchRegionIndex;
   11856           0 :     int16_t                  picture_width = (int16_t)((SequenceControlSet*)picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr)->seq_header.max_frame_width;
   11857           0 :     int16_t                  picture_height = (int16_t)((SequenceControlSet*)picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr)->seq_header.max_frame_height;
   11858             : 
   11859           0 :     int16_t                  padWidth = (int16_t)BLOCK_SIZE_64 - 1;
   11860           0 :     int16_t                  padHeight = (int16_t)BLOCK_SIZE_64 - 1;
   11861             :     int16_t                  search_area_width;
   11862             :     int16_t                  search_area_height;
   11863             :     int16_t                  x_search_area_origin;
   11864             :     int16_t                  y_search_area_origin;
   11865           0 :     int16_t                  origin_x = (int16_t)sb_origin_x;
   11866           0 :     int16_t                  origin_y = (int16_t)sb_origin_y;
   11867             : 
   11868           0 :     uint8_t                   refPicIndex = 0;
   11869             :     // Final ME Search Center
   11870           0 :     int16_t                  x_search_center = 0;
   11871           0 :     int16_t                  y_search_center = 0;
   11872             : 
   11873             :     uint32_t                  numOfListToSearch;
   11874             :     uint32_t                  listIndex;
   11875             :     EbPictureBufferDesc  *refPicPtr;
   11876             :     EbReferenceObject    *referenceObject;
   11877             : 
   11878           0 :     uint32_t                  number_of_sb_quad = sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128 ? 4 : 1;
   11879           0 :     context_ptr->sb_size = sequence_control_set_ptr->seq_header.sb_size;
   11880           0 :     context_ptr->sb_side = sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128 ? 128 : 64;
   11881             : 
   11882           0 :     const uint32_t start_idx_8x8 = 256 * number_of_sb_quad;
   11883           0 :     const uint32_t start_idx_16x16 = 320 * number_of_sb_quad;
   11884           0 :     const uint32_t start_idx_32x32 = 336 * number_of_sb_quad;
   11885           0 :     const uint32_t start_idx_64x64 = 340 * number_of_sb_quad;
   11886           0 :     const uint32_t start_idx_8x4 = 341 * number_of_sb_quad;
   11887           0 :     const uint32_t start_idx_4x8 = 469 * number_of_sb_quad;
   11888           0 :     const uint32_t start_idx_4x16 = 597 * number_of_sb_quad;
   11889           0 :     const uint32_t start_idx_16x4 = 661 * number_of_sb_quad;
   11890           0 :     const uint32_t start_idx_16x8 = 725 * number_of_sb_quad;
   11891           0 :     const uint32_t start_idx_8x16 = 757 * number_of_sb_quad;
   11892           0 :     const uint32_t start_idx_32x8 = 789 * number_of_sb_quad;
   11893           0 :     const uint32_t start_idx_8x32 = 805 * number_of_sb_quad;
   11894           0 :     const uint32_t start_idx_32x16 = 821 * number_of_sb_quad;
   11895           0 :     const uint32_t start_idx_16x32 = 829 * number_of_sb_quad;
   11896           0 :     const uint32_t start_idx_64x16 = 837 * number_of_sb_quad;
   11897           0 :     const uint32_t start_idx_16x64 = 841 * number_of_sb_quad;
   11898           0 :     const uint32_t start_idx_64x32 = 845 * number_of_sb_quad;
   11899           0 :     const uint32_t start_idx_32x64 = 847 * number_of_sb_quad;
   11900           0 :     const uint32_t start_idx_128x64 = 849 * number_of_sb_quad;
   11901             : 
   11902           0 :     context_ptr->fractional_search_method = SSD_SEARCH; // all in-loop
   11903             : 
   11904           0 :     numOfListToSearch = (picture_control_set_ptr->slice_type == P_SLICE) ? (uint32_t)REF_LIST_0 : (uint32_t)REF_LIST_1;
   11905             : 
   11906             :     // Uni-Prediction motion estimation loop
   11907             :     // List Loop
   11908           0 :     for (listIndex = REF_LIST_0; listIndex <= numOfListToSearch; ++listIndex) {
   11909           0 :         EbBool  is16bit = (EbBool)(sequence_control_set_ptr->static_config.encoder_bit_depth > EB_8BIT);
   11910           0 :         referenceObject = (EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[listIndex][0]->object_ptr;
   11911           0 :         refPicPtr = is16bit ? (EbPictureBufferDesc*)referenceObject->reference_picture16bit : (EbPictureBufferDesc*)referenceObject->reference_picture;
   11912           0 :         search_area_width = (int16_t)MIN(context_ptr->search_area_width, 127);
   11913           0 :         search_area_height = (int16_t)MIN(context_ptr->search_area_height, 127);
   11914           0 :         x_search_center = listIndex == REF_LIST_0 ? x_mv_l0 : x_mv_l1;
   11915           0 :         y_search_center = listIndex == REF_LIST_0 ? y_mv_l0 : y_mv_l1;
   11916             : 
   11917           0 :         x_search_area_origin = x_search_center - (search_area_width >> 1);
   11918           0 :         y_search_area_origin = y_search_center - (search_area_height >> 1);
   11919             : 
   11920             :         // Correct the left edge of the Search Area if it is not on the reference Picture
   11921           0 :         x_search_area_origin = ((origin_x + x_search_area_origin) < -padWidth) ?
   11922           0 :             -padWidth - origin_x :
   11923             :             x_search_area_origin;
   11924             : 
   11925           0 :         search_area_width = ((origin_x + x_search_area_origin) < -padWidth) ?
   11926           0 :             search_area_width - (-padWidth - (origin_x + x_search_area_origin)) :
   11927             :             search_area_width;
   11928             : 
   11929             :         // Correct the right edge of the Search Area if its not on the reference Picture
   11930           0 :         x_search_area_origin = ((origin_x + x_search_area_origin) > picture_width - 1) ?
   11931           0 :             x_search_area_origin - ((origin_x + x_search_area_origin) - (picture_width - 1)) :
   11932             :             x_search_area_origin;
   11933             : 
   11934             :         // Check whether the needed search area is covered by the reference picture and adjust its origin to satisfy the condition if not.
   11935           0 :         if (sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128) {
   11936           0 :             int32_t righ_sa_pos_x = refPicPtr->origin_x + origin_x + x_search_area_origin + search_area_width + (context_ptr->sb_side - 1) + (ME_FILTER_TAP >> 1);
   11937           0 :             int32_t righ_ref_pos_x = picture_width - 1 + (2 * refPicPtr->origin_x);
   11938             : 
   11939           0 :             x_search_area_origin = righ_sa_pos_x > righ_ref_pos_x ? x_search_area_origin - (righ_sa_pos_x - righ_ref_pos_x) : x_search_area_origin;
   11940             : 
   11941           0 :             int32_t bottom_sa_pos_x = refPicPtr->origin_y + origin_y + y_search_area_origin + search_area_height + (context_ptr->sb_side - 1) + (ME_FILTER_TAP >> 1);
   11942           0 :             int32_t bottom_ref_pos_x = picture_height - 1 + (2 * refPicPtr->origin_y);
   11943             : 
   11944           0 :             y_search_area_origin = bottom_sa_pos_x > bottom_ref_pos_x ? y_search_area_origin - (bottom_sa_pos_x - bottom_ref_pos_x) : y_search_area_origin;
   11945             :         }
   11946             : 
   11947           0 :         search_area_width = ((origin_x + x_search_area_origin + search_area_width) > picture_width) ?
   11948           0 :             MAX(1, search_area_width - ((origin_x + x_search_area_origin + search_area_width) - picture_width)) :
   11949             :             search_area_width;
   11950             : 
   11951             :         // Correct the top edge of the Search Area if it is not on the reference Picture
   11952           0 :         y_search_area_origin = ((origin_y + y_search_area_origin) < -padHeight) ?
   11953           0 :             -padHeight - origin_y :
   11954             :             y_search_area_origin;
   11955             : 
   11956           0 :         search_area_height = ((origin_y + y_search_area_origin) < -padHeight) ?
   11957           0 :             search_area_height - (-padHeight - (origin_y + y_search_area_origin)) :
   11958             :             search_area_height;
   11959             : 
   11960             :         // Correct the bottom edge of the Search Area if its not on the reference Picture
   11961           0 :         y_search_area_origin = ((origin_y + y_search_area_origin) > picture_height - 1) ?
   11962           0 :             y_search_area_origin - ((origin_y + y_search_area_origin) - (picture_height - 1)) :
   11963             :             y_search_area_origin;
   11964             : 
   11965           0 :         search_area_height = (origin_y + y_search_area_origin + search_area_height > picture_height) ?
   11966           0 :             MAX(1, search_area_height - ((origin_y + y_search_area_origin + search_area_height) - picture_height)) :
   11967             :             search_area_height;
   11968             : 
   11969           0 :         context_ptr->x_search_area_origin[listIndex][0] = x_search_area_origin;
   11970           0 :         context_ptr->y_search_area_origin[listIndex][0] = y_search_area_origin;
   11971             : 
   11972           0 :         xTopLeftSearchRegion = (int16_t)(refPicPtr->origin_x + sb_origin_x) - (ME_FILTER_TAP >> 1) + x_search_area_origin;
   11973           0 :         yTopLeftSearchRegion = (int16_t)(refPicPtr->origin_y + sb_origin_y) - (ME_FILTER_TAP >> 1) + y_search_area_origin;
   11974           0 :         searchRegionIndex = (xTopLeftSearchRegion)+(yTopLeftSearchRegion)* refPicPtr->stride_y;
   11975             : 
   11976             :         // Unpack the reference for 16-bit reference picture.
   11977           0 :         if (is16bit) {
   11978           0 :             uint16_t *ptr16 = (uint16_t *)refPicPtr->buffer_y + searchRegionIndex;
   11979             : 
   11980             :             uint8_t searchAreaBuffer[MAX_SEARCH_AREA_SIZE];
   11981             : 
   11982           0 :             extract8_bitdata_safe_sub(
   11983             :                 ptr16,
   11984           0 :                 refPicPtr->stride_y,
   11985             :                 searchAreaBuffer,
   11986             :                 MAX_TATAL_SEARCH_AREA_WIDTH,
   11987           0 :                 search_area_width + context_ptr->sb_side + ME_FILTER_TAP,
   11988           0 :                 search_area_height + context_ptr->sb_side + ME_FILTER_TAP,
   11989             :                 EB_FALSE);
   11990             : 
   11991           0 :             context_ptr->integer_buffer_ptr[listIndex][0] = &(searchAreaBuffer[0]);
   11992           0 :             context_ptr->interpolated_full_stride[listIndex][0] = MAX_TATAL_SEARCH_AREA_WIDTH;
   11993             :         }
   11994             :         else {
   11995           0 :             context_ptr->integer_buffer_ptr[listIndex][0] = &(refPicPtr->buffer_y[searchRegionIndex]);
   11996           0 :             context_ptr->interpolated_full_stride[listIndex][0] = refPicPtr->stride_y;
   11997             :         }
   11998             : 
   11999             :         // Move to the top left of the search region
   12000           0 :         xTopLeftSearchRegion = (int16_t)(refPicPtr->origin_x + sb_origin_x) + x_search_area_origin;
   12001           0 :         yTopLeftSearchRegion = (int16_t)(refPicPtr->origin_y + sb_origin_y) + y_search_area_origin;
   12002           0 :         searchRegionIndex = xTopLeftSearchRegion + yTopLeftSearchRegion * refPicPtr->stride_y;
   12003             : 
   12004             :         //849 * 4 + 5 block are supported
   12005           0 :         initialize_buffer_32bits(context_ptr->p_sb_best_sad[listIndex][refPicIndex], (MAX_SS_ME_PU_COUNT / 4), 1, MAX_SAD_VALUE);
   12006             : 
   12007           0 :         context_ptr->p_best_sad4x4 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][0]);
   12008           0 :         context_ptr->p_best_mv4x4 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][0]);
   12009             : 
   12010           0 :         context_ptr->p_best_sad8x8 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][256 * number_of_sb_quad]);
   12011           0 :         context_ptr->p_best_mv8x8 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][256 * number_of_sb_quad]);
   12012             : 
   12013           0 :         context_ptr->p_best_sad16x16 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][320 * number_of_sb_quad]);
   12014           0 :         context_ptr->p_best_mv16x16 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][320 * number_of_sb_quad]);
   12015             : 
   12016           0 :         context_ptr->p_best_sad32x32 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][336 * number_of_sb_quad]);
   12017           0 :         context_ptr->p_best_mv32x32 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][336 * number_of_sb_quad]);
   12018             : 
   12019           0 :         context_ptr->p_best_sad64x64 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][340 * number_of_sb_quad]);
   12020           0 :         context_ptr->p_best_mv64x64 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][340 * number_of_sb_quad]);
   12021             : 
   12022           0 :         context_ptr->p_best_sad8x4 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][341 * number_of_sb_quad]);
   12023           0 :         context_ptr->p_best_mv8x4 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][341 * number_of_sb_quad]);
   12024             : 
   12025           0 :         context_ptr->p_best_sad4x8 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][469 * number_of_sb_quad]);
   12026           0 :         context_ptr->p_best_mv4x8 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][469 * number_of_sb_quad]);
   12027             : 
   12028           0 :         context_ptr->p_best_sad4x16 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][597 * number_of_sb_quad]);
   12029           0 :         context_ptr->p_best_mv4x16 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][597 * number_of_sb_quad]);
   12030             : 
   12031           0 :         context_ptr->p_best_sad16x4 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][661 * number_of_sb_quad]);
   12032           0 :         context_ptr->p_best_mv16x4 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][661 * number_of_sb_quad]);
   12033             : 
   12034           0 :         context_ptr->p_best_sad16x8 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][725 * number_of_sb_quad]);
   12035           0 :         context_ptr->p_best_mv16x8 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][725 * number_of_sb_quad]);
   12036             : 
   12037           0 :         context_ptr->p_best_sad8x16 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][757 * number_of_sb_quad]);
   12038           0 :         context_ptr->p_best_mv8x16 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][757 * number_of_sb_quad]);
   12039             : 
   12040           0 :         context_ptr->p_best_sad32x8 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][789 * number_of_sb_quad]);
   12041           0 :         context_ptr->p_best_mv32x8 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][789 * number_of_sb_quad]);
   12042             : 
   12043           0 :         context_ptr->p_best_sad8x32 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][805 * number_of_sb_quad]);
   12044           0 :         context_ptr->p_best_mv8x32 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][805 * number_of_sb_quad]);
   12045             : 
   12046           0 :         context_ptr->p_best_sad32x16 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][821 * number_of_sb_quad]);
   12047           0 :         context_ptr->p_best_mv32x16 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][821 * number_of_sb_quad]);
   12048             : 
   12049           0 :         context_ptr->p_best_sad16x32 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][829 * number_of_sb_quad]);
   12050           0 :         context_ptr->p_best_mv16x32 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][829 * number_of_sb_quad]);
   12051             : 
   12052           0 :         context_ptr->p_best_sad64x16 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][837 * number_of_sb_quad]);
   12053           0 :         context_ptr->p_best_mv64x16 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][837 * number_of_sb_quad]);
   12054             : 
   12055           0 :         context_ptr->p_best_sad16x64 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][841 * number_of_sb_quad]);
   12056           0 :         context_ptr->p_best_mv16x64 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][841 * number_of_sb_quad]);
   12057             : 
   12058           0 :         context_ptr->p_best_sad64x32 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][845 * number_of_sb_quad]);
   12059           0 :         context_ptr->p_best_mv64x32 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][845 * number_of_sb_quad]);
   12060             : 
   12061           0 :         context_ptr->p_best_sad32x64 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][847 * number_of_sb_quad]);
   12062           0 :         context_ptr->p_best_mv32x64 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][847 * number_of_sb_quad]);
   12063             : 
   12064           0 :         if (sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128) {
   12065           0 :             context_ptr->p_best_sad128x64 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][849 * number_of_sb_quad]);
   12066           0 :             context_ptr->p_best_mv128x64 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][849 * number_of_sb_quad]);
   12067             : 
   12068           0 :             context_ptr->p_best_sad64x128 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][(849 * number_of_sb_quad) + 2]);
   12069           0 :             context_ptr->p_best_mv64x128 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][(849 * number_of_sb_quad) + 2]);
   12070             : 
   12071           0 :             context_ptr->p_best_sad128x128 = &(context_ptr->p_sb_best_sad[listIndex][refPicIndex][(849 * number_of_sb_quad) + 4]);
   12072           0 :             context_ptr->p_best_mv128x128 = &(context_ptr->p_sb_best_mv[listIndex][refPicIndex][(849 * number_of_sb_quad) + 4]);
   12073             :         }
   12074             : 
   12075           0 :         in_loop_me_fullpel_search_sblock(
   12076             :             context_ptr,
   12077             :             listIndex,
   12078             :             x_search_area_origin,
   12079             :             y_search_area_origin,
   12080             :             search_area_width,
   12081             :             search_area_height,
   12082             :             number_of_sb_quad);
   12083             : 
   12084           0 :         if (context_ptr->use_subpel_flag == 1) {
   12085             :             // Move to the top left of the search region
   12086           0 :             xTopLeftSearchRegion = (int16_t)(refPicPtr->origin_x + sb_origin_x) + x_search_area_origin;
   12087           0 :             yTopLeftSearchRegion = (int16_t)(refPicPtr->origin_y + sb_origin_y) + y_search_area_origin;
   12088           0 :             searchRegionIndex = xTopLeftSearchRegion + yTopLeftSearchRegion * refPicPtr->stride_y;
   12089             : 
   12090             :             // Interpolate the search region for Half-Pel Refinements
   12091             :             // H - AVC Style
   12092             : 
   12093           0 :             in_loop_me_interpolate_search_region_avc_style(
   12094             :                 context_ptr,
   12095             :                 listIndex,
   12096           0 :                 context_ptr->integer_buffer_ptr[listIndex][0] + (ME_FILTER_TAP >> 1) + ((ME_FILTER_TAP >> 1) * context_ptr->interpolated_full_stride[listIndex][0]),
   12097             :                 context_ptr->interpolated_full_stride[listIndex][0],
   12098           0 :                 (uint32_t)search_area_width + (context_ptr->sb_side - 1),
   12099           0 :                 (uint32_t)search_area_height + (context_ptr->sb_side - 1),
   12100             :                 8);
   12101             : 
   12102             :             // Half-Pel Refinement [8 search positions]
   12103           0 :             in_loop_me_halfpel_search_sblock(
   12104             :                 sequence_control_set_ptr,
   12105             :                 context_ptr,
   12106           0 :                 &(context_ptr->pos_b_buffer[listIndex][0][(ME_FILTER_TAP >> 1) * context_ptr->interpolated_stride]),
   12107           0 :                 &(context_ptr->pos_h_buffer[listIndex][0][1]),
   12108             :                 &(context_ptr->pos_j_buffer[listIndex][0][0]),
   12109             :                 x_search_area_origin,
   12110             :                 y_search_area_origin);
   12111             : 
   12112             :             // Quarter-Pel Refinement [8 search positions]
   12113           0 :             in_loop_me_quarterpel_search_sblock(
   12114             :                 context_ptr,
   12115           0 :                 context_ptr->integer_buffer_ptr[listIndex][0] + (ME_FILTER_TAP >> 1) + ((ME_FILTER_TAP >> 1) * context_ptr->interpolated_full_stride[listIndex][0]),
   12116             :                 context_ptr->interpolated_full_stride[listIndex][0],
   12117           0 :                 &(context_ptr->pos_b_buffer[listIndex][0][(ME_FILTER_TAP >> 1) * context_ptr->interpolated_stride]),  //points to b position of the figure above
   12118           0 :                 &(context_ptr->pos_h_buffer[listIndex][0][1]),                                                      //points to h position of the figure above
   12119             :                 &(context_ptr->pos_j_buffer[listIndex][0][0]),                                                      //points to j position of the figure above
   12120             :                 x_search_area_origin,
   12121             :                 y_search_area_origin);
   12122             :         }
   12123             :     }
   12124             : 
   12125             :     // Nader - Bipred candidate can be generated here if needed.
   12126           0 :     uint32_t max_number_of_block_in_sb = sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128 ? MAX_SS_ME_PU_COUNT : 849;
   12127             : 
   12128           0 :     for (listIndex = REF_LIST_0; listIndex <= numOfListToSearch; ++listIndex) {
   12129             :         uint32_t block_index;
   12130             :         uint32_t block_offset;
   12131             :         uint32_t nidx;
   12132           0 :         uint32_t candidate_cnt = 0;
   12133             : 
   12134           0 :         for (block_index = 0; block_index < max_number_of_block_in_sb; ++block_index) {
   12135             :             //4x4
   12136           0 :             if (block_index < start_idx_8x8) {
   12137           0 :                 block_offset = (block_index / 256) * 256;
   12138           0 :                 nidx = tab4x4[block_index - block_offset] + block_offset;
   12139             :             } //8x8
   12140           0 :             else if (block_index < start_idx_16x16) {
   12141           0 :                 block_offset = ((block_index - start_idx_8x8) / 64) * 64;
   12142           0 :                 nidx = tab8x8[block_index - start_idx_8x8 - block_offset] + block_offset + start_idx_8x8;
   12143             :             }//16x16
   12144           0 :             else if (block_index < start_idx_32x32) {
   12145           0 :                 block_offset = ((block_index - start_idx_16x16) / 16) * 16;
   12146           0 :                 nidx = tab16x16[block_index - start_idx_16x16 - block_offset] + block_offset + start_idx_16x16;
   12147             :             }//32x32
   12148           0 :             else if (block_index < start_idx_64x64) {
   12149           0 :                 block_offset = ((block_index - start_idx_32x32) / 4) * 4;
   12150           0 :                 nidx = tab32x32[block_index - start_idx_32x32 - block_offset] + block_offset + start_idx_32x32;
   12151             :             } //64x64
   12152           0 :             else if (block_index < start_idx_8x4) {
   12153           0 :                 block_offset = (block_index - start_idx_64x64);
   12154           0 :                 nidx = block_offset + start_idx_64x64;
   12155             :             } //8x4
   12156           0 :             else if (block_index < start_idx_4x8) {
   12157           0 :                 block_offset = ((block_index - start_idx_8x4) / 128) * 128;
   12158           0 :                 nidx = tab8x4[block_index - start_idx_8x4 - block_offset] + block_offset + start_idx_8x4;
   12159             :             }//4x8
   12160           0 :             else if (block_index < start_idx_4x16) {
   12161           0 :                 block_offset = ((block_index - start_idx_4x8) / 128) * 128;
   12162           0 :                 nidx = tab4x8[block_index - start_idx_4x8 - block_offset] + block_offset + start_idx_4x8;
   12163             :             }//4x16
   12164           0 :             else if (block_index < start_idx_16x4) {
   12165           0 :                 block_offset = ((block_index - start_idx_4x16) / 64) * 64;
   12166           0 :                 nidx = tab4x16[block_index - start_idx_4x16 - block_offset] + block_offset + start_idx_4x16;
   12167             :             }//16x4
   12168           0 :             else if (block_index < start_idx_16x8) {
   12169           0 :                 block_offset = ((block_index - start_idx_16x4) / 64) * 64;
   12170           0 :                 nidx = tab16x4[block_index - start_idx_16x4 - block_offset] + block_offset + start_idx_16x4;
   12171             :             }//16x8
   12172           0 :             else if (block_index < start_idx_8x16) {
   12173           0 :                 block_offset = ((block_index - start_idx_16x8) / 32) * 32;
   12174           0 :                 nidx = tab16x8[block_index - start_idx_16x8 - block_offset] + block_offset + start_idx_16x8;
   12175             :             }//8x16
   12176           0 :             else if (block_index < start_idx_32x8) {
   12177           0 :                 block_offset = ((block_index - start_idx_8x16) / 32) * 32;
   12178           0 :                 nidx = tab8x16[block_index - start_idx_8x16 - block_offset] + block_offset + start_idx_8x16;
   12179             :             }//32x8
   12180           0 :             else if (block_index < start_idx_8x32) {
   12181           0 :                 block_offset = ((block_index - start_idx_32x8) / 16) * 16;
   12182           0 :                 nidx = tab32x8[block_index - start_idx_32x8 - block_offset] + block_offset + start_idx_32x8;
   12183             :             }//8x32
   12184           0 :             else if (block_index < start_idx_32x16) {
   12185           0 :                 block_offset = ((block_index - start_idx_8x32) / 16) * 16;
   12186           0 :                 nidx = tab8x32[block_index - start_idx_8x32 - block_offset] + block_offset + start_idx_8x32;
   12187             :             }//32x16
   12188           0 :             else if (block_index < start_idx_16x32) {
   12189           0 :                 block_offset = ((block_index - start_idx_32x16) / 8) * 8;
   12190           0 :                 nidx = tab32x16[block_index - start_idx_32x16 - block_offset] + block_offset + start_idx_32x16;
   12191             :             }//16x32
   12192           0 :             else if (block_index < start_idx_64x16) {
   12193           0 :                 block_offset = ((block_index - start_idx_16x32) / 8) * 8;
   12194           0 :                 nidx = tab16x32[block_index - start_idx_16x32 - block_offset] + block_offset + start_idx_16x32;
   12195             :             }//64x16
   12196           0 :             else if (block_index < start_idx_16x64) {
   12197           0 :                 block_offset = ((block_index - start_idx_64x16) / 4) * 4;
   12198           0 :                 nidx = tab64x16[block_index - start_idx_64x16 - block_offset] + block_offset + start_idx_64x16;
   12199             :             }//16x64
   12200           0 :             else if (block_index < start_idx_64x32) {
   12201           0 :                 block_offset = ((block_index - start_idx_16x64) / 4) * 4;
   12202           0 :                 nidx = tab16x64[block_index - start_idx_16x64 - block_offset] + block_offset + start_idx_16x64;
   12203             :             }//64x32
   12204           0 :             else if (block_index < start_idx_32x64) {
   12205           0 :                 block_offset = ((block_index - start_idx_64x32) / 2) * 2;
   12206           0 :                 nidx = tab64x32[block_index - start_idx_64x32 - block_offset] + block_offset + start_idx_64x32;
   12207             :             }//32x64
   12208           0 :             else if (block_index < start_idx_128x64) {
   12209           0 :                 block_offset = ((block_index - start_idx_32x64) / 2) * 2;
   12210           0 :                 nidx = tab32x64[block_index - start_idx_32x64 - block_offset] + block_offset + start_idx_32x64;
   12211             :             }//128x64, //64x128 and 128x128
   12212             :             else
   12213           0 :                 nidx = block_index;
   12214           0 :             context_ptr->inloop_me_mv[0][0][candidate_cnt][0] = _MVXT(context_ptr->p_sb_best_mv[0][0][nidx]);
   12215           0 :             context_ptr->inloop_me_mv[0][0][candidate_cnt][1] = _MVYT(context_ptr->p_sb_best_mv[0][0][nidx]);
   12216           0 :             context_ptr->inloop_me_mv[1][0][candidate_cnt][0] = _MVXT(context_ptr->p_sb_best_mv[1][0][nidx]);
   12217           0 :             context_ptr->inloop_me_mv[1][0][candidate_cnt][1] = _MVYT(context_ptr->p_sb_best_mv[1][0][nidx]);
   12218           0 :             candidate_cnt++;
   12219             :         }
   12220             :     }
   12221             : 
   12222           0 :     return return_error;
   12223             : }
   12224             : 
   12225             : #if PREDICT_NSQ_SHAPE
   12226           0 : uint64_t spatial_full_distortion_helper(
   12227             :     uint8_t  *input,
   12228             :     uint32_t input_offset,
   12229             :     uint32_t  input_stride,
   12230             :     uint8_t  *recon,
   12231             :     uint32_t recon_offset,
   12232             :     uint32_t  recon_stride,
   12233             :     uint32_t  area_width,
   12234             :     uint32_t  area_height,
   12235             :     uint8_t  choice) {
   12236             : 
   12237           0 :     uint64_t sfd = 0;
   12238             : 
   12239           0 :     switch (choice) {
   12240           0 :     case 0:
   12241           0 :         sfd = spatial_full_distortion_kernel4x_n_sse2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12242           0 :     case 1:
   12243           0 :         sfd = spatial_full_distortion_kernel8x_n_sse2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12244           0 :     case 2:
   12245           0 :         sfd = spatial_full_distortion_kernel16x_n_sse2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12246           0 :     case 3:
   12247           0 :         sfd = spatial_full_distortion_kernel32x_n_sse2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12248           0 :     case 4:
   12249           0 :         sfd = spatial_full_distortion_kernel64x_n_sse2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12250           0 :     case 5:
   12251           0 :         sfd = spatial_full_distortion_kernel128x_n_sse2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12252             :     }
   12253             : 
   12254           0 :     return sfd;
   12255             : }
   12256             : 
   12257           0 : uint64_t spatial_full_distortion_avx2_helper(
   12258             :     uint8_t  *input,
   12259             :     uint32_t input_offset,
   12260             :     uint32_t  input_stride,
   12261             :     uint8_t  *recon,
   12262             :     uint32_t recon_offset,
   12263             :     uint32_t  recon_stride,
   12264             :     uint32_t  area_width,
   12265             :     uint32_t  area_height,
   12266             :     uint8_t  choice) {
   12267             : 
   12268           0 :     uint64_t sfd = 0;
   12269             : 
   12270           0 :     switch (choice) {
   12271           0 :     case 0:
   12272           0 :         sfd = spatial_full_distortion_kernel4x_n_avx2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12273           0 :     case 1:
   12274           0 :         sfd = spatial_full_distortion_kernel8x_n_avx2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12275           0 :     case 2:
   12276           0 :         sfd = spatial_full_distortion_kernel16x_n_avx2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12277           0 :     case 3:
   12278           0 :         sfd = spatial_full_distortion_kernel32x_n_avx2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12279           0 :     case 4:
   12280           0 :         sfd = spatial_full_distortion_kernel64x_n_avx2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12281           0 :     case 5:
   12282           0 :         sfd = spatial_full_distortion_kernel128x_n_avx2_intrin(input, input_offset, input_stride, recon, recon_offset, recon_stride, area_width, area_height);break;
   12283             :     }
   12284             : 
   12285           0 :     return sfd;
   12286             : }
   12287             : #endif

Generated by: LCOV version 1.14