LCOV - code coverage report
Current view: top level - Codec - EbInterPrediction.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 1555 3117 49.9 %
Date: 2019-11-25 17:38:06 Functions: 68 121 56.2 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : 
       6             : /*
       7             : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       8             : *
       9             : * This source code is subject to the terms of the BSD 2 Clause License and
      10             : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      11             : * was not distributed with this source code in the LICENSE file, you can
      12             : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      13             : * Media Patent License 1.0 was not distributed with this source code in the
      14             : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      15             : */
      16             : 
      17             : #include <stdlib.h>
      18             : 
      19             : #include "EbPictureControlSet.h"
      20             : #include "EbReferenceObject.h"
      21             : 
      22             : #include "EbInterPrediction.h"
      23             : #include "EbSvtAv1.h"
      24             : #include "EbDefinitions.h"
      25             : #include "EbAdaptiveMotionVectorPrediction.h"
      26             : 
      27             : #include "EbModeDecisionProcess.h"
      28             : 
      29             : #include "convolve.h"
      30             : #include "aom_dsp_rtcd.h"
      31             : #include "EbRateDistortionCost.h"
      32             : 
      /*
       * Motion-vector bound constants. Each value is a pixel count shifted left by 2:
       *   - 80 is ReferencePadding;
       *   - the 71 subtracted for the low bound is derived from -64 + 1 - 8;
       *   - the 7 added for the high bound is derived from -1 + 8.
       */
      #define MVBOUNDLOW    36    /* (80 - 71) << 2 */
      #define MVBOUNDHIGH   348   /* (80 + 7) << 2 */
      #define REFPADD_QPEL  320   /* (16 + 64) << 2 */

      /* Added to the block width/height by clamp_mv_to_umv_border_sb() when it
       * derives its clamping limits. */
      #define AOM_INTERP_EXTEND 4

      #define SCALE_NUMERATOR 8

      /* Sub-pel precision of the scaled prediction path: 10 fractional bits. */
      #define SCALE_SUBPEL_BITS 10
      #define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)    /* 1024 */
      #define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)     /* 1023 */
      /* Fractional bits gained over SUBPEL_BITS, and half a step at that precision. */
      #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
      #define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)

      #define BIL_SUBPEL_BITS 3
      #define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)        /* 8 */

      /* Rounding shift amounts. */
      #define ROUND0_BITS 3
      #define COMPOUND_ROUND1_BITS 7
      52             : 
      53             : static EB_AV1_INTER_PREDICTION_FUNC_PTR   av1_inter_prediction_function_table[2] =
      54             : {
      55             :     av1_inter_prediction,
      56             :     av1_inter_prediction_hbd
      57             : };
      58             : 
      59             : /* TODO: Add scaling of reference frame support later */
      60             : // Note: Expect val to be in q4 precision
      61           0 : static INLINE int32_t scaled_x(int32_t val, const ScaleFactors *sf) {
      62           0 :     const int off =
      63           0 :         (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
      64           0 :     const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
      65           0 :     return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
      66             :         REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
      67             : }
      68             : 
      69             : // Note: Expect val to be in q4 precision
      70           0 : static INLINE int32_t scaled_y(int32_t val, const ScaleFactors *sf) {
      71           0 :     const int32_t off =
      72           0 :         (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
      73           0 :     const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
      74           0 :     return (int32_t)ROUND_POWER_OF_TWO_SIGNED_64(tval,
      75             :         REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
      76             : }
      77             : 
      78             : // Note: Expect val to be in q4 precision
      79           0 : static int32_t unscaled_value(int32_t val, const ScaleFactors *sf) {
      80             :     (void)sf;
      81           0 :     return val << SCALE_EXTRA_BITS;
      82             : }
      83             : 
      84         812 : static int32_t get_fixed_point_scale_factor(int32_t other_size, int32_t this_size) {
      85             :     // Calculate scaling factor once for each reference frame
      86             :     // and use fixed point scaling factors in decoding and encoding routines.
      87             :     // Hardware implementations can calculate scale factor in device driver
      88             :     // and use multiplication and shifting on hardware instead of division.
      89         812 :     return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size;
      90             : }
      91             : 
      92             : // Given the fixed point scale, calculate coarse point scale.
      93         812 : static int32_t fixed_point_scale_to_coarse_point_scale(int32_t scale_fp) {
      94         812 :     return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS);
      95             : }
      96             : 
      97             : // Note: x and y are integer precision, mvq4 is q4 precision.
      98           0 : MV32 av1_scale_mv(const MV *mvq4, int x, int y,
      99             :     const ScaleFactors *sf) {
     100           0 :     const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf);
     101           0 :     const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf);
     102           0 :     const MV32 res = { scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4,
     103           0 :         scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4 };
     104           0 :     return res;
     105             : }
     106             : 
     107         406 : void av1_setup_scale_factors_for_frame(ScaleFactors *sf, int other_w,
     108             :     int other_h, int this_w, int this_h) {
     109         406 :     if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
     110           0 :         sf->x_scale_fp = REF_INVALID_SCALE;
     111           0 :         sf->y_scale_fp = REF_INVALID_SCALE;
     112           0 :         return;
     113             :     }
     114             : 
     115         406 :     sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
     116         406 :     sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
     117             : 
     118         406 :     sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);
     119         406 :     sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);
     120             : 
     121         406 :     if (av1_is_scaled(sf)) {
     122           0 :         sf->scale_value_x = scaled_x;
     123           0 :         sf->scale_value_y = scaled_y;
     124             :     }
     125             :     else {
     126         406 :         sf->scale_value_x = unscaled_value;
     127         406 :         sf->scale_value_y = unscaled_value;
     128             :     }
     129             : }
     130             : 
     131       39670 : static INLINE int32_t has_scale(int32_t xs, int32_t ys) {
     132       39670 :     return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
     133             : }
     134             : 
     135       39670 : static INLINE void revert_scale_extra_bits(SubpelParams *sp) {
     136       39670 :     sp->subpel_x >>= SCALE_EXTRA_BITS;
     137       39670 :     sp->subpel_y >>= SCALE_EXTRA_BITS;
     138       39670 :     sp->xs >>= SCALE_EXTRA_BITS;
     139       39670 :     sp->ys >>= SCALE_EXTRA_BITS;
     140       39670 :     assert(sp->subpel_x < SUBPEL_SHIFTS);
     141       39670 :     assert(sp->subpel_y < SUBPEL_SHIFTS);
     142       39670 :     assert(sp->xs <= SUBPEL_SHIFTS);
     143       39670 :     assert(sp->ys <= SUBPEL_SHIFTS);
     144       39670 : }
     145             : 
     146             : extern void av1_set_ref_frame(MvReferenceFrame *rf,
     147             :     int8_t ref_frame_type);
     148             : 
     149   443962000 : static INLINE MV clamp_mv_to_umv_border_sb(const MacroBlockD *xd,
     150             :     const MV *src_mv, int32_t bw, int32_t bh,
     151             :     int32_t ss_x, int32_t ss_y) {
     152             :     // If the MV points so far into the UMV border that no visible pixels
     153             :     // are used for reconstruction, the subpel part of the MV can be
     154             :     // discarded and the MV limited to 16 pixels with equivalent results.
     155   443962000 :     const int32_t spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS;
     156   443962000 :     const int32_t spel_right = spel_left - SUBPEL_SHIFTS;
     157   443962000 :     const int32_t spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
     158   443962000 :     const int32_t spel_bottom = spel_top - SUBPEL_SHIFTS;
     159   443962000 :     MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
     160   443962000 :         (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
     161   443962000 :     assert(ss_x <= 1);
     162   443962000 :     assert(ss_y <= 1);
     163             : 
     164   443962000 :     clamp_mv(&clamped_mv,
     165   443962000 :         xd->mb_to_left_edge   * (1 << (1 - ss_x)) - spel_left,
     166   443962000 :         xd->mb_to_right_edge  * (1 << (1 - ss_x)) + spel_right,
     167   443962000 :         xd->mb_to_top_edge    * (1 << (1 - ss_y)) - spel_top,
     168   443962000 :         xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
     169             : 
     170   444154000 :     return clamped_mv;
     171             : }
     172             : 
     173             : DECLARE_ALIGNED(256, const InterpKernel,
     174             : sub_pel_filters_8[SUBPEL_SHIFTS]) = {
     175             :     { 0, 0, 0, 128, 0, 0, 0, 0 },{ 0, 2, -6, 126, 8, -2, 0, 0 },
     176             :     { 0, 2, -10, 122, 18, -4, 0, 0 },{ 0, 2, -12, 116, 28, -8, 2, 0 },
     177             :     { 0, 2, -14, 110, 38, -10, 2, 0 },{ 0, 2, -14, 102, 48, -12, 2, 0 },
     178             :     { 0, 2, -16, 94, 58, -12, 2, 0 },{ 0, 2, -14, 84, 66, -12, 2, 0 },
     179             :     { 0, 2, -14, 76, 76, -14, 2, 0 },{ 0, 2, -12, 66, 84, -14, 2, 0 },
     180             :     { 0, 2, -12, 58, 94, -16, 2, 0 },{ 0, 2, -12, 48, 102, -14, 2, 0 },
     181             :     { 0, 2, -10, 38, 110, -14, 2, 0 },{ 0, 2, -8, 28, 116, -12, 2, 0 },
     182             :     { 0, 0, -4, 18, 122, -10, 2, 0 },{ 0, 0, -2, 8, 126, -6, 2, 0 }
     183             : };
     184             : DECLARE_ALIGNED(256, const InterpKernel,
     185             : sub_pel_filters_4[SUBPEL_SHIFTS]) = {
     186             :     { 0, 0, 0, 128, 0, 0, 0, 0 },{ 0, 0, -4, 126, 8, -2, 0, 0 },
     187             :     { 0, 0, -8, 122, 18, -4, 0, 0 },{ 0, 0, -10, 116, 28, -6, 0, 0 },
     188             :     { 0, 0, -12, 110, 38, -8, 0, 0 },{ 0, 0, -12, 102, 48, -10, 0, 0 },
     189             :     { 0, 0, -14, 94, 58, -10, 0, 0 },{ 0, 0, -12, 84, 66, -10, 0, 0 },
     190             :     { 0, 0, -12, 76, 76, -12, 0, 0 },{ 0, 0, -10, 66, 84, -12, 0, 0 },
     191             :     { 0, 0, -10, 58, 94, -14, 0, 0 },{ 0, 0, -10, 48, 102, -12, 0, 0 },
     192             :     { 0, 0, -8, 38, 110, -12, 0, 0 },{ 0, 0, -6, 28, 116, -10, 0, 0 },
     193             :     { 0, 0, -4, 18, 122, -8, 0, 0 },{ 0, 0, -2, 8, 126, -4, 0, 0 }
     194             : };
     195             : 
     196             : #define MAX_FILTER_TAP 8
     197   149484000 : int get_relative_dist_enc(SeqHeader *seq_header, int ref_hint, int order_hint)
     198             : {
     199             :     int diff, m;
     200   149484000 :     if (!seq_header->order_hint_info.enable_order_hint)
     201           0 :         return 0;
     202   149484000 :     diff = ref_hint - order_hint;
     203   149484000 :     m = 1 << (seq_header->order_hint_info.order_hint_bits - 1);
     204   149484000 :     diff = (diff & (m - 1)) - (diff & m);
     205   149484000 :     return diff;
     206             : }
     207             : 
     208             : static const int quant_dist_weight[4][2] = {
     209             :   { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE }
     210             : };
     /*
      * Offset pairs for distance-weighted compound prediction, indexed as
      * [order_idx][row][0 or 1]. Each pair sums to 16, and the second sub-table
      * holds the pairs of the first one swapped.
      */
     static const int quant_dist_lookup_table[2][4][2] = {
         {
             { 9, 7 },
             { 11, 5 },
             { 12, 4 },
             { 13, 3 },
         },
         {
             { 7, 9 },
             { 5, 11 },
             { 4, 12 },
             { 3, 13 },
         },
     };
     215             : 
     216   208721000 : void av1_dist_wtd_comp_weight_assign(
     217             :     SeqHeader *seq_header,
     218             :     int cur_frame_index,
     219             :     int bck_frame_index,
     220             :     int fwd_frame_index,
     221             :     int compound_idx,
     222             :     int order_idx,
     223             :     int *fwd_offset, int *bck_offset,
     224             :     int *use_dist_wtd_comp_avg,
     225             :     int is_compound) {
     226             : 
     227   208721000 :     assert(fwd_offset != NULL && bck_offset != NULL);
     228   208867000 :     if (!is_compound || compound_idx) {
     229   174338000 :         *use_dist_wtd_comp_avg = 0;
     230   174338000 :         return;
     231             :     }
     232             : 
     233    34529200 :     *use_dist_wtd_comp_avg = 1;
     234             : 
     235    34529200 :     int d0 = clamp(abs(get_relative_dist_enc(seq_header,
     236             :         fwd_frame_index, cur_frame_index)),
     237             :         0, MAX_FRAME_DISTANCE);
     238    34522200 :     int d1 = clamp(abs(get_relative_dist_enc(seq_header,
     239             :         cur_frame_index, bck_frame_index)),
     240             :         0, MAX_FRAME_DISTANCE);
     241             : 
     242    34512100 :     const int order = d0 <= d1;
     243             : 
     244    34512100 :     if (d0 == 0 || d1 == 0) {
     245           0 :         *fwd_offset = quant_dist_lookup_table[order_idx][3][order];
     246           0 :         *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order];
     247           0 :         return;
     248             :     }
     249             : 
     250             :     int i;
     251    74198800 :     for (i = 0; i < 3; ++i) {
     252    66973600 :         int c0 = quant_dist_weight[i][order];
     253    66973600 :         int c1 = quant_dist_weight[i][!order];
     254    66973600 :         int d0_c0 = d0 * c0;
     255    66973600 :         int d1_c1 = d1 * c1;
     256    66973600 :         if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
     257             :     }
     258             : 
     259    34512100 :     *fwd_offset = quant_dist_lookup_table[order_idx][i][order];
     260    34512100 :     *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
     261             : }
     262             : 
     263           0 : void eb_av1_convolve_2d_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
     264             :     int32_t dst_stride, int32_t w, int32_t h,
     265             :     InterpFilterParams *filter_params_x,
     266             :     InterpFilterParams *filter_params_y,
     267             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     268             :     ConvolveParams *conv_params)
     269             : {
     270             :     int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
     271           0 :     int32_t im_h = h + filter_params_y->taps - 1;
     272           0 :     int32_t im_stride = w;
     273           0 :     const int32_t fo_vert = filter_params_y->taps / 2 - 1;
     274           0 :     const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
     275           0 :     const int32_t bd = 8;
     276           0 :     const int32_t bits =
     277           0 :         FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
     278             : 
     279             :     // horizontal filter
     280           0 :     const uint8_t *src_horiz = src - fo_vert * src_stride;
     281           0 :     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
     282             :         *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
     283           0 :     for (int32_t y = 0; y < im_h; ++y) {
     284           0 :         for (int32_t x = 0; x < w; ++x) {
     285           0 :             int32_t sum = (1 << (bd + FILTER_BITS - 1));
     286           0 :             for (int32_t k = 0; k < filter_params_x->taps; ++k)
     287           0 :                 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
     288           0 :             assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
     289           0 :             im_block[y * im_stride + x] =
     290           0 :                 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     291             :         }
     292             :     }
     293             : 
     294             :     // vertical filter
     295           0 :     int16_t *src_vert = im_block + fo_vert * im_stride;
     296           0 :     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
     297             :         *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
     298           0 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
     299           0 :     for (int32_t y = 0; y < h; ++y) {
     300           0 :         for (int32_t x = 0; x < w; ++x) {
     301           0 :             int32_t sum = 1 << offset_bits;
     302           0 :             for (int32_t k = 0; k < filter_params_y->taps; ++k)
     303           0 :                 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
     304           0 :             assert(0 <= sum && sum < (1 << (offset_bits + 2)));
     305           0 :             int16_t res = (ConvBufType)(ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
     306           0 :                 ((1 << (offset_bits - conv_params->round_1)) +
     307           0 :                 (1 << (offset_bits - conv_params->round_1 - 1))));
     308           0 :             dst[y * dst_stride + x] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), 8);
     309             :         }
     310             :     }
     311           0 : }
     312             : 
     313           0 : void eb_av1_convolve_y_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
     314             :     int32_t dst_stride, int32_t w, int32_t h,
     315             :     InterpFilterParams *filter_params_x,
     316             :     InterpFilterParams *filter_params_y,
     317             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     318             :     ConvolveParams *conv_params)
     319             : {
     320           0 :     assert(filter_params_y != NULL);
     321           0 :     const int32_t fo_vert = filter_params_y->taps / 2 - 1;
     322             :     (void)filter_params_x;
     323             :     (void)subpel_x_q4;
     324             :     (void)conv_params;
     325             : 
     326           0 :     assert(conv_params->round_0 <= FILTER_BITS);
     327           0 :     assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
     328             :         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
     329             : 
     330             :     // vertical filter
     331           0 :     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
     332             :         *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
     333             : 
     334           0 :     for (int32_t y = 0; y < h; ++y) {
     335           0 :         for (int32_t x = 0; x < w; ++x) {
     336           0 :             int32_t res = 0;
     337           0 :             for (int32_t k = 0; k < filter_params_y->taps; ++k)
     338           0 :                 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
     339           0 :             dst[y * dst_stride + x] =
     340           0 :                 (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), 8);
     341             :         }
     342             :     }
     343           0 : }
     344             : 
     345           0 : void eb_av1_convolve_x_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
     346             :     int32_t dst_stride, int32_t w, int32_t h,
     347             :     InterpFilterParams *filter_params_x,
     348             :     InterpFilterParams *filter_params_y,
     349             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     350             :     ConvolveParams *conv_params)
     351             : {
     352           0 :     const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
     353           0 :     const int32_t bits = FILTER_BITS - conv_params->round_0;
     354             :     (void)filter_params_y;
     355             :     (void)subpel_y_q4;
     356             :     (void)conv_params;
     357             : 
     358           0 :     assert(bits >= 0);
     359           0 :     assert((FILTER_BITS - conv_params->round_1) >= 0 ||
     360             :         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
     361             : 
     362             :     // horizontal filter
     363           0 :     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
     364             :         *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
     365             : 
     366           0 :     for (int32_t y = 0; y < h; ++y) {
     367           0 :         for (int32_t x = 0; x < w; ++x) {
     368           0 :             int32_t res = 0;
     369           0 :             for (int32_t k = 0; k < filter_params_x->taps; ++k)
     370           0 :                 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
     371           0 :             res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
     372           0 :             dst[y * dst_stride + x] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), 8);
     373             :         }
     374             :     }
     375           0 : }
     376             : 
     377           0 : void eb_av1_convolve_2d_copy_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
     378             :     int32_t dst_stride, int32_t w, int32_t h,
     379             :     InterpFilterParams *filter_params_x,
     380             :     InterpFilterParams *filter_params_y,
     381             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     382             :     ConvolveParams *conv_params) {
     383             :     (void)filter_params_x;
     384             :     (void)filter_params_y;
     385             :     (void)subpel_x_q4;
     386             :     (void)subpel_y_q4;
     387             :     (void)conv_params;
     388             : 
     389           0 :     for (int32_t y = 0; y < h; ++y) {
     390           0 :         for (int32_t x = 0; x < w; ++x)
     391           0 :             dst[y * dst_stride + x] = src[y * src_stride + x];
     392             :     }
     393           0 : }
     394             : 
     395           0 : void eb_av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
     396             :     uint8_t *dst8,
     397             :     int dst8_stride, int w, int h,
     398             :     const InterpFilterParams *filter_params_x,
     399             :     const InterpFilterParams *filter_params_y,
     400             :     const int subpel_x_qn, const int x_step_qn,
     401             :     const int subpel_y_qn, const int y_step_qn,
     402             :     ConvolveParams *conv_params)
     403             : {
     404             :     int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
     405           0 :     int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
     406           0 :         filter_params_y->taps;
     407           0 :     CONV_BUF_TYPE *dst16 = conv_params->dst;
     408           0 :     const int dst16_stride = conv_params->dst_stride;
     409           0 :     const int bits =
     410           0 :         FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
     411           0 :     assert(bits >= 0);
     412           0 :     int im_stride = w;
     413           0 :     const int fo_vert = filter_params_y->taps / 2 - 1;
     414           0 :     const int fo_horiz = filter_params_x->taps / 2 - 1;
     415           0 :     const int bd = 8;
     416             : 
     417             :     // horizontal filter
     418           0 :     const uint8_t *src_horiz = src - fo_vert * src_stride;
     419           0 :     for (int y = 0; y < im_h; ++y) {
     420           0 :         int x_qn = subpel_x_qn;
     421           0 :         for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
     422           0 :             const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
     423           0 :             const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
     424           0 :             assert(x_filter_idx < SUBPEL_SHIFTS);
     425             :             const int16_t *x_filter =
     426           0 :                 av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
     427           0 :             int32_t sum = (1 << (bd + FILTER_BITS - 1));
     428           0 :             for (int k = 0; k < filter_params_x->taps; ++k) {
     429           0 :                 sum += x_filter[k] * src_x[k - fo_horiz];
     430             :             }
     431           0 :             assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
     432           0 :             im_block[y * im_stride + x] =
     433           0 :                 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     434             :         }
     435           0 :         src_horiz += src_stride;
     436             :     }
     437             : 
     438             :     // vertical filter
     439           0 :     int16_t *src_vert = im_block + fo_vert * im_stride;
     440           0 :     const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
     441           0 :     for (int x = 0; x < w; ++x) {
     442           0 :         int y_qn = subpel_y_qn;
     443           0 :         for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
     444           0 :             const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
     445           0 :             const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
     446           0 :             assert(y_filter_idx < SUBPEL_SHIFTS);
     447             :             const int16_t *y_filter =
     448           0 :                 av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
     449           0 :             int32_t sum = 1 << offset_bits;
     450           0 :             for (int k = 0; k < filter_params_y->taps; ++k) {
     451           0 :                 sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
     452             :             }
     453           0 :             assert(0 <= sum && sum < (1 << (offset_bits + 2)));
     454           0 :             CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
     455           0 :             if (conv_params->is_compound) {
     456           0 :                 if (conv_params->do_average) {
     457           0 :                     int32_t tmp = dst16[y * dst16_stride + x];
     458           0 :                     if (conv_params->use_dist_wtd_comp_avg) {
     459           0 :                         tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
     460           0 :                         tmp = tmp >> DIST_PRECISION_BITS;
     461             :                     }
     462             :                     else {
     463           0 :                         tmp += res;
     464           0 :                         tmp = tmp >> 1;
     465             :                     }
     466             :                     /* Subtract round offset and convolve round */
     467           0 :                     tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
     468           0 :                         (1 << (offset_bits - conv_params->round_1 - 1)));
     469           0 :                     dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
     470             :                 }
     471             :                 else {
     472           0 :                     dst16[y * dst16_stride + x] = res;
     473             :                 }
     474             :             }
     475             :             else {
     476             :                 /* Subtract round offset and convolve round */
     477           0 :                 int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
     478           0 :                     (1 << (offset_bits - conv_params->round_1 - 1)));
     479           0 :                 dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
     480             :             }
     481             :         }
     482           0 :         src_vert++;
     483             :     }
     484           0 : }
     485             : 
     /*
      * eb_av1_jnt_convolve_2d_c - two-pass separable filter for 8-bit samples
      * (bd is fixed to 8) with an optional averaging stage.
      *
      * Pass 1 filters h + filter_params_y->taps - 1 source rows horizontally
      * into the local int16_t buffer im_block (row stride w). Pass 2 filters
      * im_block vertically. For every output sample:
      *   - do_average == 0: the rounded sum, still carrying its positive
      *     offset, is stored in conv_params->dst;
      *   - do_average != 0: it is combined with the value already stored in
      *     conv_params->dst (fwd_offset/bck_offset weights followed by
      *     >> DIST_PRECISION_BITS when use_jnt_comp_avg is set, otherwise
      *     (a + b) >> 1), the offset is subtracted, and the result is rounded
      *     by round_bits, clipped and written to dst8.
      *
      * src, src_stride    source samples; rows fo_vert above the block and
      *                    columns fo_horiz left of it are read as well.
      * dst8, dst8_stride  8-bit output, written only when do_average is set.
      * w, h               block width / height in samples.
      * filter_params_x/y  horizontal / vertical filter descriptions.
      * subpel_x_q4/_y_q4  masked with SUBPEL_MASK to select the kernels.
      * conv_params        rounding shifts, averaging flags/weights and the
      *                    intermediate dst buffer with its stride.
      *
      * NOTE(review): nothing here checks that w * im_h fits in im_block;
      * presumably callers keep w and h within MAX_SB_SIZE - confirm.
      */
     486           0 : void eb_av1_jnt_convolve_2d_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
     487             :     int32_t dst8_stride, int32_t w, int32_t h,
     488             :     InterpFilterParams *filter_params_x,
     489             :     InterpFilterParams *filter_params_y,
     490             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     491             :     ConvolveParams *conv_params)
     492             : {
     493           0 :     ConvBufType *dst = conv_params->dst;
     494           0 :     int32_t dst_stride = conv_params->dst_stride;
     495             :     int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
                           /* The vertical taps need taps - 1 extra rows of horizontally filtered data. */
     496           0 :     int32_t im_h = h + filter_params_y->taps - 1;
     497           0 :     int32_t im_stride = w;
     498           0 :     const int32_t fo_vert = filter_params_y->taps / 2 - 1;
     499           0 :     const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
     500           0 :     const int32_t bd = 8;
     501           0 :     const int32_t round_bits =
     502           0 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
     503             : 
     504             :     // horizontal filter
                           /* Start fo_vert rows above the block so pass 2 has the rows it reads. */
     505           0 :     const uint8_t *src_horiz = src - fo_vert * src_stride;
     506           0 :     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
     507             :         *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
     508           0 :     for (int32_t y = 0; y < im_h; ++y) {
     509           0 :         for (int32_t x = 0; x < w; ++x) {
                                   /* Start from a positive bias; the assert below expects the biased sum to be non-negative. */
     510           0 :             int32_t sum = (1 << (bd + FILTER_BITS - 1));
     511           0 :             for (int32_t k = 0; k < filter_params_x->taps; ++k)
     512           0 :                 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
     513           0 :             assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
     514           0 :             im_block[y * im_stride + x] =
     515           0 :                 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     516             :         }
     517             :     }
     518             : 
     519             :     // vertical filter
                           /* Point at row fo_vert so that (y - fo_vert + k) indexes im_block from row 0. */
     520           0 :     int16_t *src_vert = im_block + fo_vert * im_stride;
     521           0 :     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
     522             :         *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
     523           0 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
     524           0 :     for (int32_t y = 0; y < h; ++y) {
     525           0 :         for (int32_t x = 0; x < w; ++x) {
     526           0 :             int32_t sum = 1 << offset_bits;
     527           0 :             for (int32_t k = 0; k < filter_params_y->taps; ++k)
     528           0 :                 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
     529           0 :             assert(0 <= sum && sum < (1 << (offset_bits + 2)));
     530           0 :             ConvBufType res = (ConvBufType)ROUND_POWER_OF_TWO(sum, conv_params->round_1);
     531           0 :             if (conv_params->do_average) {
                                       /* Combine with the value a previous call left in conv_params->dst. */
     532           0 :                 int32_t tmp = dst[y * dst_stride + x];
     533           0 :                 if (conv_params->use_jnt_comp_avg) {
     534           0 :                     tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
     535           0 :                     tmp = tmp >> DIST_PRECISION_BITS;
     536             :                 }
     537             :                 else {
     538           0 :                     tmp += res;
     539           0 :                     tmp = tmp >> 1;
     540             :                 }
                                       /* Subtract 2^(offset_bits - round_1) + 2^(offset_bits - round_1 - 1) before the final rounding. */
     541           0 :                 tmp -= (1 << (offset_bits - conv_params->round_1)) +
     542           0 :                     (1 << (offset_bits - conv_params->round_1 - 1));
     543           0 :                 dst8[y * dst8_stride + x] =
     544           0 :                     (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), 8);
     545             :             }
     546             :             else
     547           0 :                 dst[y * dst_stride + x] = res;
     548             :         }
     549             :     }
     550           0 : }
     551             : 
     /*
      * eb_av1_jnt_convolve_y_c - vertical-only filter for 8-bit samples (bd is
      * fixed to 8) with an optional averaging stage.
      *
      * Each output sample is the y_filter-weighted sum of filter_params_y->taps
      * source rows, starting fo_vert rows above the current row. The sum is
      * scaled by 1 << (FILTER_BITS - round_0), rounded by round_1 and biased by
      * round_offset. The biased value is then either stored in conv_params->dst
      * (do_average == 0), or combined with the value already stored there,
      * un-biased, rounded by round_bits, clipped and written to dst8
      * (do_average != 0).
      *
      * filter_params_x and subpel_x_q4 are accepted but not used.
      */
     552           0 : void eb_av1_jnt_convolve_y_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
     553             :     int32_t dst8_stride, int32_t w, int32_t h,
     554             :     InterpFilterParams *filter_params_x,
     555             :     InterpFilterParams *filter_params_y,
     556             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     557             :     ConvolveParams *conv_params)
     558             : {
     559           0 :     ConvBufType *dst = conv_params->dst;
     560           0 :     int32_t dst_stride = conv_params->dst_stride;
     561           0 :     const int32_t fo_vert = filter_params_y->taps / 2 - 1;
     562           0 :     const int32_t bits = FILTER_BITS - conv_params->round_0;
     563           0 :     const int32_t bd = 8;
     564           0 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
                           /* Same constant that is added to res below and subtracted again after averaging. */
     565           0 :     const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
     566           0 :         (1 << (offset_bits - conv_params->round_1 - 1));
     567           0 :     const int32_t round_bits =
     568           0 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
     569             :     (void)filter_params_x;
     570             :     (void)subpel_x_q4;
     571             : 
     572             :     // vertical filter
     573           0 :     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
     574             :         *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
     575           0 :     for (int32_t y = 0; y < h; ++y) {
     576           0 :         for (int32_t x = 0; x < w; ++x) {
     577           0 :             int32_t res = 0;
     578           0 :             for (int32_t k = 0; k < filter_params_y->taps; ++k)
     579           0 :                 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
                                   /* Scale by 2^bits, where bits = FILTER_BITS - round_0, before the round_1 rounding. */
     580           0 :             res *= (1 << bits);
     581           0 :             res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
     582             : 
     583           0 :             if (conv_params->do_average) {
                                       /* Combine with the value a previous call left in conv_params->dst. */
     584           0 :                 int32_t tmp = dst[y * dst_stride + x];
     585           0 :                 if (conv_params->use_jnt_comp_avg) {
     586           0 :                     tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
     587           0 :                     tmp = tmp >> DIST_PRECISION_BITS;
     588             :                 }
     589             :                 else {
     590           0 :                     tmp += res;
     591           0 :                     tmp = tmp >> 1;
     592             :                 }
     593           0 :                 tmp -= round_offset;
     594           0 :                 dst8[y * dst8_stride + x] =
     595           0 :                     (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), 8);
     596             :             }
     597             :             else
     598           0 :                 dst[y * dst_stride + x] = (ConvBufType)res;
     599             :         }
     600             :     }
     601           0 : }
     602             : 
     /*
      * eb_av1_jnt_convolve_x_c - horizontal-only filter for 8-bit samples (bd is
      * fixed to 8) with an optional averaging stage.
      *
      * Each output sample is the x_filter-weighted sum of filter_params_x->taps
      * source columns, starting fo_horiz columns left of the current column.
      * The sum is rounded by round_0, scaled by 1 << (FILTER_BITS - round_1)
      * and biased by round_offset. The biased value is then either stored in
      * conv_params->dst (do_average == 0), or combined with the value already
      * stored there, un-biased, rounded by round_bits, clipped and written to
      * dst8 (do_average != 0).
      *
      * filter_params_y and subpel_y_q4 are accepted but not used.
      */
     603           0 : void eb_av1_jnt_convolve_x_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
     604             :     int32_t dst8_stride, int32_t w, int32_t h,
     605             :     InterpFilterParams *filter_params_x,
     606             :     InterpFilterParams *filter_params_y,
     607             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     608             :     ConvolveParams *conv_params)
     609             : {
     610           0 :     ConvBufType *dst = conv_params->dst;
     611           0 :     int32_t dst_stride = conv_params->dst_stride;
     612           0 :     const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
     613           0 :     const int32_t bits = FILTER_BITS - conv_params->round_1;
     614           0 :     const int32_t bd = 8;
     615           0 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
                           /* Same constant that is added to res below and subtracted again after averaging. */
     616           0 :     const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
     617           0 :         (1 << (offset_bits - conv_params->round_1 - 1));
     618           0 :     const int32_t round_bits =
     619           0 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
     620             :     (void)filter_params_y;
     621             :     (void)subpel_y_q4;
     622             : 
     623             :     // horizontal filter
     624           0 :     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
     625             :         *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
     626           0 :     for (int32_t y = 0; y < h; ++y) {
     627           0 :         for (int32_t x = 0; x < w; ++x) {
     628           0 :             int32_t res = 0;
     629           0 :             for (int32_t k = 0; k < filter_params_x->taps; ++k)
     630           0 :                 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
                                   /* Round by round_0 first, then scale by 2^bits (bits = FILTER_BITS - round_1). */
     631           0 :             res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
     632           0 :             res += round_offset;
     633             : 
     634           0 :             if (conv_params->do_average) {
                                       /* Combine with the value a previous call left in conv_params->dst. */
     635           0 :                 int32_t tmp = dst[y * dst_stride + x];
     636           0 :                 if (conv_params->use_jnt_comp_avg) {
     637           0 :                     tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
     638           0 :                     tmp = tmp >> DIST_PRECISION_BITS;
     639             :                 }
     640             :                 else {
     641           0 :                     tmp += res;
     642           0 :                     tmp = tmp >> 1;
     643             :                 }
     644           0 :                 tmp -= round_offset;
     645           0 :                 dst8[y * dst8_stride + x] =
     646           0 :                     (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), 8);
     647             :             }
     648             :             else
     649           0 :                 dst[y * dst_stride + x] = (ConvBufType)res;
     650             :         }
     651             :     }
     652           0 : }
     653             : 
     /*
      * eb_av1_jnt_convolve_2d_copy_c - unfiltered variant for 8-bit samples (bd
      * is fixed to 8) with an optional averaging stage.
      *
      * No filter kernel is applied: each source sample is shifted left by
      * bits = 2 * FILTER_BITS - round_1 - round_0 and biased by round_offset.
      * The biased value is then either stored in conv_params->dst
      * (do_average == 0), or combined with the value already stored there,
      * un-biased, rounded by the same bits, clipped and written to dst8
      * (do_average != 0).
      *
      * filter_params_x, filter_params_y, subpel_x_q4 and subpel_y_q4 are
      * accepted but not used.
      */
     654           0 : void eb_av1_jnt_convolve_2d_copy_c(const uint8_t *src, int32_t src_stride,
     655             :     uint8_t *dst8, int32_t dst8_stride, int32_t w, int32_t h,
     656             :     InterpFilterParams *filter_params_x,
     657             :     InterpFilterParams *filter_params_y,
     658             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     659             :     ConvolveParams *conv_params)
     660             : {
     661           0 :     ConvBufType *dst = conv_params->dst;
     662           0 :     int32_t dst_stride = conv_params->dst_stride;
     663           0 :     const int32_t bits =
     664           0 :         FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
     665           0 :     const int32_t bd = 8;
     666           0 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
                           /* Same constant that is added to res below and subtracted again after averaging. */
     667           0 :     const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
     668           0 :         (1 << (offset_bits - conv_params->round_1 - 1));
     669             :     (void)filter_params_x;
     670             :     (void)filter_params_y;
     671             :     (void)subpel_x_q4;
     672             :     (void)subpel_y_q4;
     673             : 
     674           0 :     for (int32_t y = 0; y < h; ++y) {
     675           0 :         for (int32_t x = 0; x < w; ++x) {
                                   /* Scale the raw sample by 2^bits instead of filtering it. */
     676           0 :             ConvBufType res = src[y * src_stride + x] << bits;
     677           0 :             res += (ConvBufType)round_offset;
     678             : 
     679           0 :             if (conv_params->do_average) {
                                       /* Combine with the value a previous call left in conv_params->dst. */
     680           0 :                 int32_t tmp = dst[y * dst_stride + x];
     681           0 :                 if (conv_params->use_jnt_comp_avg) {
     682           0 :                     tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
     683           0 :                     tmp = tmp >> DIST_PRECISION_BITS;
     684             :                 }
     685             :                 else {
     686           0 :                     tmp += res;
     687           0 :                     tmp = tmp >> 1;
     688             :                 }
     689           0 :                 tmp -= round_offset;
     690           0 :                 dst8[y * dst8_stride + x] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), 8);
     691             :             }
     692             :             else
     693           0 :                 dst[y * dst_stride + x] = res;
     694             :         }
     695             :     }
     696           0 : }
     697             : 
     /*
      * eb_av1_highbd_convolve_2d_copy_sr_c - copies a w x h block of 16-bit
      * samples from src to dst, row by row, honouring the two strides.
      *
      * The filter, sub-pixel, conv_params and bd arguments are accepted but not
      * used; no filtering, rounding or clipping is performed.
      */
     698           0 : void eb_av1_highbd_convolve_2d_copy_sr_c(
     699             :     const uint16_t *src, int32_t src_stride, uint16_t *dst, int32_t dst_stride, int32_t w,
     700             :     int32_t h, const InterpFilterParams *filter_params_x,
     701             :     const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
     702             :     const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
     703             :     (void)filter_params_x;
     704             :     (void)filter_params_y;
     705             :     (void)subpel_x_q4;
     706             :     (void)subpel_y_q4;
     707             :     (void)conv_params;
     708             :     (void)bd;
     709             : 
     710           0 :     for (int32_t y = 0; y < h; ++y) {
     711           0 :         for (int32_t x = 0; x < w; ++x)
     712           0 :             dst[y * dst_stride + x] = src[y * src_stride + x];
     713             :     }
     714           0 : }
     715             : 
     /*
      * eb_av1_highbd_convolve_x_sr_c - horizontal-only filter for 16-bit
      * samples, writing final pixels directly to dst.
      *
      * Each output sample is the x_filter-weighted sum of filter_params_x->taps
      * source columns, starting fo_horiz columns left of the current column.
      * The sum is rounded by round_0, rounded again by
      * bits = FILTER_BITS - round_0, and clipped with clip_pixel_highbd() for
      * bit depth bd.
      *
      * filter_params_y and subpel_y_q4 are accepted but not used.
      */
     716           0 : void eb_av1_highbd_convolve_x_sr_c(const uint16_t *src, int32_t src_stride,
     717             :     uint16_t *dst, int32_t dst_stride, int32_t w, int32_t h,
     718             :     const InterpFilterParams *filter_params_x,
     719             :     const InterpFilterParams *filter_params_y,
     720             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     721             :     ConvolveParams *conv_params, int32_t bd) {
     722           0 :     const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
     723           0 :     const int32_t bits = FILTER_BITS - conv_params->round_0;
     724             :     (void)filter_params_y;
     725             :     (void)subpel_y_q4;
     726             : 
                           /* bits is used as a shift count below, so it must not be negative. */
     727           0 :     assert(bits >= 0);
     728           0 :     assert((FILTER_BITS - conv_params->round_1) >= 0 ||
     729             :         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
     730             : 
     731             :     // horizontal filter
     732           0 :     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
     733             :         *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
     734           0 :     for (int32_t y = 0; y < h; ++y) {
     735           0 :         for (int32_t x = 0; x < w; ++x) {
     736           0 :             int32_t res = 0;
     737           0 :             for (int32_t k = 0; k < filter_params_x->taps; ++k)
     738           0 :                 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
                                   /* Two rounding steps: round_0 here, then the remaining bits in the store below. */
     739           0 :             res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
     740           0 :             dst[y * dst_stride + x] =
     741           0 :                 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
     742             :         }
     743             :     }
     744           0 : }
     745             : 
     /*
      * eb_av1_highbd_convolve_y_sr_c - vertical-only filter for 16-bit samples,
      * writing final pixels directly to dst.
      *
      * Each output sample is the y_filter-weighted sum of filter_params_y->taps
      * source rows, starting fo_vert rows above the current row. The sum is
      * rounded by FILTER_BITS and clipped with clip_pixel_highbd() for bit
      * depth bd.
      *
      * filter_params_x and subpel_x_q4 are accepted but not used. conv_params
      * is read only inside the asserts.
      */
     746           0 : void eb_av1_highbd_convolve_y_sr_c(const uint16_t *src, int32_t src_stride,
     747             :     uint16_t *dst, int32_t dst_stride, int32_t w, int32_t h,
     748             :     const InterpFilterParams *filter_params_x,
     749             :     const InterpFilterParams *filter_params_y,
     750             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     751             :     ConvolveParams *conv_params, int32_t bd) {
     752           0 :     assert(filter_params_y != NULL);
     753           0 :     const int32_t fo_vert = filter_params_y->taps / 2 - 1;
     754             :     (void)filter_params_x;
     755             :     (void)subpel_x_q4;
                           /* conv_params is only referenced by the asserts below, which vanish under NDEBUG. */
     756             :     (void)conv_params;
     757             : 
     758           0 :     assert(conv_params->round_0 <= FILTER_BITS);
     759           0 :     assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
     760             :         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
     761             :     // vertical filter
     762           0 :     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
     763             :         *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
     764           0 :     for (int32_t y = 0; y < h; ++y) {
     765           0 :         for (int32_t x = 0; x < w; ++x) {
     766           0 :             int32_t res = 0;
     767           0 :             for (int32_t k = 0; k < filter_params_y->taps; ++k)
     768           0 :                 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
     769           0 :             dst[y * dst_stride + x] =
     770           0 :                 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
     771             :         }
     772             :     }
     773           0 : }
     774             : 
     /*
      * eb_av1_highbd_convolve_2d_sr_c - two-pass separable filter for 16-bit
      * samples, writing final pixels directly to dst.
      *
      * Pass 1 filters h + filter_params_y->taps - 1 source rows horizontally
      * into the local int16_t buffer im_block (row stride w), rounding by
      * round_0. Pass 2 filters im_block vertically, rounds by round_1,
      * subtracts the offset 2^(offset_bits - round_1) +
      * 2^(offset_bits - round_1 - 1), rounds by
      * bits = 2 * FILTER_BITS - round_0 - round_1 and clips for bit depth bd.
      *
      * Unlike the jnt variants, conv_params->dst is not touched here.
      *
      * NOTE(review): nothing here checks that w * im_h fits in im_block;
      * presumably callers keep w and h within MAX_SB_SIZE - confirm.
      */
     775           0 : void eb_av1_highbd_convolve_2d_sr_c(const uint16_t *src, int32_t src_stride,
     776             :     uint16_t *dst, int32_t dst_stride, int32_t w, int32_t h,
     777             :     const InterpFilterParams *filter_params_x,
     778             :     const InterpFilterParams *filter_params_y,
     779             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     780             :     ConvolveParams *conv_params, int32_t bd) {
     781             :     int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
                           /* The vertical taps need taps - 1 extra rows of horizontally filtered data. */
     782           0 :     int32_t im_h = h + filter_params_y->taps - 1;
     783           0 :     int32_t im_stride = w;
     784           0 :     const int32_t fo_vert = filter_params_y->taps / 2 - 1;
     785           0 :     const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
     786           0 :     const int32_t bits =
     787           0 :         FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
     788           0 :     assert(bits >= 0);
     789             : 
     790             :     // horizontal filter
                           /* Start fo_vert rows above the block so pass 2 has the rows it reads. */
     791           0 :     const uint16_t *src_horiz = src - fo_vert * src_stride;
     792           0 :     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
     793             :         *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
     794           0 :     for (int32_t y = 0; y < im_h; ++y) {
     795           0 :         for (int32_t x = 0; x < w; ++x) {
                                   /* Start from a positive bias; the assert below expects the biased sum to be non-negative. */
     796           0 :             int32_t sum = (1 << (bd + FILTER_BITS - 1));
     797           0 :             for (int32_t k = 0; k < filter_params_x->taps; ++k)
     798           0 :                 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
     799           0 :             assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
                                   /* NOTE(review): cast is to ConvBufType although im_block is int16_t; the 8-bit variant casts to int16_t - confirm intent. */
     800           0 :             im_block[y * im_stride + x] = (ConvBufType)
     801           0 :                 ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     802             :         }
     803             :     }
     804             : 
     805             :     // vertical filter
                           /* Point at row fo_vert so that (y - fo_vert + k) indexes im_block from row 0. */
     806           0 :     int16_t *src_vert = im_block + fo_vert * im_stride;
     807           0 :     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
     808             :         *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
     809           0 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
     810           0 :     for (int32_t y = 0; y < h; ++y) {
     811           0 :         for (int32_t x = 0; x < w; ++x) {
     812           0 :             int32_t sum = 1 << offset_bits;
     813           0 :             for (int32_t k = 0; k < filter_params_y->taps; ++k)
     814           0 :                 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
     815           0 :             assert(0 <= sum && sum < (1 << (offset_bits + 2)));
                                   /* Round by round_1, then remove the constant offset before the final rounding. */
     816           0 :             int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
     817           0 :                 ((1 << (offset_bits - conv_params->round_1)) +
     818           0 :                 (1 << (offset_bits - conv_params->round_1 - 1)));
     819           0 :             dst[y * dst_stride + x] =
     820           0 :                 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
     821             :         }
     822             :     }
     823           0 : }
     824             : 
     /*
      * eb_av1_highbd_convolve_2d_scale_c - two-pass separable filter for 16-bit
      * samples in which the source position advances by a caller-supplied step
      * per output sample (x_step_qn horizontally, y_step_qn vertically).
      *
      * Positions are fixed-point: the integer sample index is
      * pos >> SCALE_SUBPEL_BITS and the kernel index is
      * (pos & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS, so the kernel is looked
      * up again for every output sample.
      *
      * Pass 1 writes im_h rows of horizontally filtered data to the local
      * int16_t buffer im_block (row stride w). Pass 2 walks im_block one column
      * at a time and produces h samples per column. Each result is handled
      * according to conv_params:
      *   - is_compound == 0: offset removed, rounded by bits, clipped for bd
      *     and written to dst;
      *   - is_compound != 0, do_average == 0: stored with its offset in
      *     conv_params->dst;
      *   - is_compound != 0, do_average != 0: combined with the value already
      *     in conv_params->dst (fwd_offset/bck_offset weights when
      *     use_dist_wtd_comp_avg is set, otherwise (a + b) >> 1), then offset
      *     removed, rounded, clipped and written to dst.
      *
      * NOTE(review): nothing here checks that w * im_h fits in im_block;
      * presumably callers bound w, h and the step sizes - confirm.
      */
     825           0 : void eb_av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
     826             :     uint16_t *dst, int dst_stride, int w, int h,
     827             :     const InterpFilterParams *filter_params_x,
     828             :     const InterpFilterParams *filter_params_y,
     829             :     const int subpel_x_qn, const int x_step_qn,
     830             :     const int subpel_y_qn, const int y_step_qn,
     831             :     ConvolveParams *conv_params, int bd)
     832             : {
     833             :     int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
                           /* Integer row reached by the last output row, plus the rows covered by the vertical taps. */
     834           0 :     int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
     835           0 :         filter_params_y->taps;
     836           0 :     int im_stride = w;
     837           0 :     const int fo_vert = filter_params_y->taps / 2 - 1;
     838           0 :     const int fo_horiz = filter_params_x->taps / 2 - 1;
     839           0 :     CONV_BUF_TYPE *dst16 = conv_params->dst;
     840           0 :     const int dst16_stride = conv_params->dst_stride;
     841           0 :     const int bits =
     842           0 :         FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
     843           0 :     assert(bits >= 0);
     844             :     // horizontal filter
                           /* Start fo_vert rows above the block so pass 2 has the rows it reads. */
     845           0 :     const uint16_t *src_horiz = src - fo_vert * src_stride;
     846           0 :     for (int y = 0; y < im_h; ++y) {
                               /* Every row restarts at the same horizontal position. */
     847           0 :         int x_qn = subpel_x_qn;
     848           0 :         for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
     849           0 :             const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
     850           0 :             const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
     851           0 :             assert(x_filter_idx < SUBPEL_SHIFTS);
     852             :             const int16_t *x_filter =
     853           0 :                 av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
                                   /* Start from a positive bias; the assert below expects the biased sum to be non-negative. */
     854           0 :             int32_t sum = (1 << (bd + FILTER_BITS - 1));
     855           0 :             for (int k = 0; k < filter_params_x->taps; ++k) {
     856           0 :                 sum += x_filter[k] * src_x[k - fo_horiz];
     857             :             }
     858           0 :             assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
     859           0 :             im_block[y * im_stride + x] =
     860           0 :                 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     861             :         }
     862           0 :         src_horiz += src_stride;
     863             :     }
     864             : 
     865             :     // vertical filter
                           /* Point at row fo_vert so that the (k - fo_vert) tap offsets start at im_block row 0. */
     866           0 :     int16_t *src_vert = im_block + fo_vert * im_stride;
     867           0 :     const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
                           /* Column-major walk: the outer loop is over x, and src_vert advances one column per iteration. */
     868           0 :     for (int x = 0; x < w; ++x) {
     869           0 :         int y_qn = subpel_y_qn;
     870           0 :         for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
     871           0 :             const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
     872           0 :             const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
     873           0 :             assert(y_filter_idx < SUBPEL_SHIFTS);
     874             :             const int16_t *y_filter =
     875           0 :                 av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
     876           0 :             int32_t sum = 1 << offset_bits;
     877           0 :             for (int k = 0; k < filter_params_y->taps; ++k) {
     878           0 :                 sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
     879             :             }
     880           0 :             assert(0 <= sum && sum < (1 << (offset_bits + 2)));
     881           0 :             CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
     882           0 :             if (conv_params->is_compound) {
     883           0 :                 if (conv_params->do_average) {
                                           /* Combine with the value a previous call left in conv_params->dst. */
     884           0 :                     int32_t tmp = dst16[y * dst16_stride + x];
     885           0 :                     if (conv_params->use_dist_wtd_comp_avg) {
     886           0 :                         tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
     887           0 :                         tmp = tmp >> DIST_PRECISION_BITS;
     888             :                     }
     889             :                     else {
     890           0 :                         tmp += res;
     891           0 :                         tmp = tmp >> 1;
     892             :                     }
     893             :                     /* Subtract round offset and convolve round */
     894           0 :                     tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
     895           0 :                         (1 << (offset_bits - conv_params->round_1 - 1)));
     896           0 :                     dst[y * dst_stride + x] =
     897           0 :                         clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
     898             :                 }
     899             :                 else {
                                           /* Keep the offset-carrying value for a later averaging call. */
     900           0 :                     dst16[y * dst16_stride + x] = res;
     901             :                 }
     902             :             }
     903             :             else {
     904             :                 /* Subtract round offset and convolve round */
     905           0 :                 int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
     906           0 :                     (1 << (offset_bits - conv_params->round_1 - 1)));
     907           0 :                 dst[y * dst_stride + x] =
     908           0 :                     clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
     909             :             }
     910             :         }
     911           0 :         src_vert++;
     912             :     }
     913           0 : }
     914             : 
     915             : 
     /*
      * eb_av1_highbd_jnt_convolve_x_c - horizontal-only filter for 16-bit
      * samples with an optional averaging stage.
      *
      * Each output sample is the x_filter-weighted sum of filter_params_x->taps
      * source columns, starting fo_horiz columns left of the current column.
      * The sum is rounded by round_0, scaled by 1 << (FILTER_BITS - round_1)
      * and biased by round_offset. The biased value is then either stored in
      * conv_params->dst (do_average == 0), or combined with the value already
      * stored there, un-biased, rounded by round_bits, clipped for bit depth bd
      * and written to dst16 (do_average != 0).
      *
      * filter_params_y and subpel_y_q4 are accepted but not used.
      */
     916           0 : void eb_av1_highbd_jnt_convolve_x_c(const uint16_t *src, int32_t src_stride,
     917             :     uint16_t *dst16, int32_t dst16_stride, int32_t w,
     918             :     int32_t h, const InterpFilterParams *filter_params_x,
     919             :     const InterpFilterParams *filter_params_y,
     920             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     921             :     ConvolveParams *conv_params, int32_t bd) {
     922           0 :     ConvBufType *dst = conv_params->dst;
     923           0 :     int32_t dst_stride = conv_params->dst_stride;
     924           0 :     const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
     925           0 :     const int32_t bits = FILTER_BITS - conv_params->round_1;
     926           0 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
                           /* Same constant that is added to res below and subtracted again after averaging. */
     927           0 :     const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
     928           0 :         (1 << (offset_bits - conv_params->round_1 - 1));
     929           0 :     const int32_t round_bits =
     930           0 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
     931           0 :     assert(round_bits >= 0);
     932             :     (void)filter_params_y;
     933             :     (void)subpel_y_q4;
     934           0 :     assert(bits >= 0);
     935             :     // horizontal filter
     936           0 :     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
     937             :         *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
     938           0 :     for (int32_t y = 0; y < h; ++y) {
     939           0 :         for (int32_t x = 0; x < w; ++x) {
     940           0 :             int32_t res = 0;
     941           0 :             for (int32_t k = 0; k < filter_params_x->taps; ++k)
     942           0 :                 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
                                   /* Round by round_0 first, then scale by 2^bits (bits = FILTER_BITS - round_1). */
     943           0 :             res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
     944           0 :             res += round_offset;
     945             : 
     946           0 :             if (conv_params->do_average) {
                                       /* Combine with the value a previous call left in conv_params->dst. */
     947           0 :                 int32_t tmp = dst[y * dst_stride + x];
     948           0 :                 if (conv_params->use_jnt_comp_avg) {
     949           0 :                     tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
     950           0 :                     tmp = tmp >> DIST_PRECISION_BITS;
     951             :                 }
     952             :                 else {
     953           0 :                     tmp += res;
     954           0 :                     tmp = tmp >> 1;
     955             :                 }
     956           0 :                 tmp -= round_offset;
     957           0 :                 dst16[y * dst16_stride + x] =
     958           0 :                     clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
     959             :             }
     960             :             else
     961           0 :                 dst[y * dst_stride + x] = (ConvBufType)res;
     962             :         }
     963             :     }
     964           0 : }
     965             : 
     /*
      * eb_av1_highbd_jnt_convolve_y_c - vertical-only filter for 16-bit samples
      * with an optional averaging stage.
      *
      * Each output sample is the y_filter-weighted sum of filter_params_y->taps
      * source rows, starting fo_vert rows above the current row. The sum is
      * scaled by 1 << (FILTER_BITS - round_0), rounded by round_1 and biased by
      * round_offset. The biased value is then either stored in conv_params->dst
      * (do_average == 0), or combined with the value already stored there,
      * un-biased, rounded by round_bits, clipped for bit depth bd and written
      * to dst16 (do_average != 0).
      *
      * filter_params_x and subpel_x_q4 are accepted but not used.
      */
     966           0 : void eb_av1_highbd_jnt_convolve_y_c(const uint16_t *src, int32_t src_stride,
     967             :     uint16_t *dst16, int32_t dst16_stride, int32_t w,
     968             :     int32_t h, const InterpFilterParams *filter_params_x,
     969             :     const InterpFilterParams *filter_params_y,
     970             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
     971             :     ConvolveParams *conv_params, int32_t bd) {
     972           0 :     ConvBufType *dst = conv_params->dst;
     973           0 :     int32_t dst_stride = conv_params->dst_stride;
     974           0 :     const int32_t fo_vert = filter_params_y->taps / 2 - 1;
     975           0 :     const int32_t bits = FILTER_BITS - conv_params->round_0;
     976           0 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
                           /* Same constant that is added to res below and subtracted again after averaging. */
     977           0 :     const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
     978           0 :         (1 << (offset_bits - conv_params->round_1 - 1));
     979           0 :     const int32_t round_bits =
     980           0 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
     981           0 :     assert(round_bits >= 0);
     982             :     (void)filter_params_x;
     983             :     (void)subpel_x_q4;
     984           0 :     assert(bits >= 0);
     985             :     // vertical filter
     986           0 :     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
     987             :         *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
     988           0 :     for (int32_t y = 0; y < h; ++y) {
     989           0 :         for (int32_t x = 0; x < w; ++x) {
     990           0 :             int32_t res = 0;
     991           0 :             for (int32_t k = 0; k < filter_params_y->taps; ++k)
     992           0 :                 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
                                   /* Scale by 2^bits, where bits = FILTER_BITS - round_0, before the round_1 rounding. */
     993           0 :             res *= (1 << bits);
     994           0 :             res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
     995             : 
     996           0 :             if (conv_params->do_average) {
                                       /* Combine with the value a previous call left in conv_params->dst. */
     997           0 :                 int32_t tmp = dst[y * dst_stride + x];
     998           0 :                 if (conv_params->use_jnt_comp_avg) {
     999           0 :                     tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    1000           0 :                     tmp = tmp >> DIST_PRECISION_BITS;
    1001             :                 }
    1002             :                 else {
    1003           0 :                     tmp += res;
    1004           0 :                     tmp = tmp >> 1;
    1005             :                 }
    1006           0 :                 tmp -= round_offset;
    1007           0 :                 dst16[y * dst16_stride + x] =
    1008           0 :                     clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    1009             :             }
    1010             :             else
    1011           0 :                 dst[y * dst_stride + x] = (ConvBufType)res;
    1012             :         }
    1013             :     }
    1014           0 : }
    1015             : 
    1016           0 : void eb_av1_highbd_jnt_convolve_2d_copy_c(
    1017             :     const uint16_t *src, int32_t src_stride, uint16_t *dst16, int32_t dst16_stride,
    1018             :     int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    1019             :     const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
    1020             :     const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd) {
                           // Unfiltered variant: no kernel is applied, and both filter
                           // parameter sets and both sub-pel arguments are ignored.
                           // Each source sample is shifted left by `bits` and offset by
                           // round_offset.
                           // Output depends on conv_params->do_average:
                           //   0: the intermediate is stored in conv_params->dst;
                           //   1: it is blended with the value already in conv_params->dst and
                           //      the clipped pixel is written to dst16.
    1021           0 :     ConvBufType *dst = conv_params->dst;
    1022           0 :     int32_t dst_stride = conv_params->dst_stride;
    1023           0 :     const int32_t bits =
    1024           0 :         FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
    1025           0 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
                           // round_offset = 1.5 * 2^(offset_bits - round_1).
    1026           0 :     const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
    1027           0 :         (1 << (offset_bits - conv_params->round_1 - 1));
    1028           0 :     assert(bits >= 0);
    1029             :     (void)filter_params_x;
    1030             :     (void)filter_params_y;
    1031             :     (void)subpel_x_q4;
    1032             :     (void)subpel_y_q4;
    1033             : 
    1034           0 :     for (int32_t y = 0; y < h; ++y) {
    1035           0 :         for (int32_t x = 0; x < w; ++x) {
    1036           0 :             ConvBufType res = src[y * src_stride + x] << bits;
    1037           0 :             res += (ConvBufType)round_offset;
    1038           0 :             if (conv_params->do_average) {
    1039           0 :                 int32_t tmp = dst[y * dst_stride + x];
    1040           0 :                 if (conv_params->use_jnt_comp_avg) {
                                           // Weighted blend: fwd_offset scales the stored value,
                                           // bck_offset scales the value just computed.
    1041           0 :                     tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    1042           0 :                     tmp = tmp >> DIST_PRECISION_BITS;
    1043             :                 }
    1044             :                 else {
                                           // Plain average of the two intermediates.
    1045           0 :                     tmp += res;
    1046           0 :                     tmp = tmp >> 1;
    1047             :                 }
                                       // Remove the offset, round by `bits` and clip to bd bits.
    1048           0 :                 tmp -= round_offset;
    1049           0 :                 dst16[y * dst16_stride + x] =
    1050           0 :                     clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
    1051             :             }
    1052             :             else
                                       // Not averaging: keep the intermediate, offset still included.
    1053           0 :                 dst[y * dst_stride + x] = res;
    1054             :         }
    1055             :     }
    1056           0 : }
    1057             : 
    1058           0 : void eb_av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int32_t src_stride,
    1059             :     uint16_t *dst16, int32_t dst16_stride, int32_t w,
    1060             :     int32_t h, const InterpFilterParams *filter_params_x,
    1061             :     const InterpFilterParams *filter_params_y,
    1062             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
    1063             :     ConvolveParams *conv_params, int32_t bd)
    1064             : 
    1065             : {
                           // Separable two-pass convolution of a w x h block of 16-bit samples.
                           // Pass 1 filters horizontally into the stack buffer im_block.
                           // Pass 2 filters im_block vertically.
                           // Output depends on conv_params->do_average:
                           //   0: the offset intermediate is stored in conv_params->dst;
                           //   1: it is blended with the value already in conv_params->dst and
                           //      the clipped pixel is written to dst16.
                           // Pass 1 writes im_h rows of w samples, so im_h * w must not exceed
                           // the im_block array size; this is not checked here.
    1066             :     int32_t x, y, k;
    1067             :     int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    1068           0 :     ConvBufType *dst = conv_params->dst;
    1069           0 :     int32_t dst_stride = conv_params->dst_stride;
                           // The vertical pass needs taps - 1 extra rows around the block.
    1070           0 :     int32_t im_h = h + filter_params_y->taps - 1;
    1071           0 :     int32_t im_stride = w;
    1072           0 :     const int32_t fo_vert = filter_params_y->taps / 2 - 1;
    1073           0 :     const int32_t fo_horiz = filter_params_x->taps / 2 - 1;
    1074             : 
    1075           0 :     const int32_t round_bits =
    1076           0 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    1077           0 :     assert(round_bits >= 0);
    1078             : 
    1079             :     // horizontal filter
                           // Start fo_vert rows above the block so the vertical taps have data.
    1080           0 :     const uint16_t *src_horiz = src - fo_vert * src_stride;
    1081           0 :     const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    1082             :         *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    1083           0 :     for (y = 0; y < im_h; ++y) {
    1084           0 :         for (x = 0; x < w; ++x) {
                                   // The accumulator starts at 2^(bd + FILTER_BITS - 1); the assert
                                   // below checks that the resulting sum is non-negative.
    1085           0 :             int32_t sum = (1 << (bd + FILTER_BITS - 1));
    1086           0 :             for (k = 0; k < filter_params_x->taps; ++k)
    1087           0 :                 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
    1088           0 :             assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
                                   // NOTE(review): bd is also used outside the assert (accumulator
                                   // seed above, offset_bits below), so this cast looks redundant.
    1089             :             (void)bd;
    1090           0 :             im_block[y * im_stride + x] =
    1091           0 :                 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    1092             :         }
    1093             :     }
    1094             : 
    1095             :     // vertical filter
                           // src_vert points at the im_block row that lines up with output row 0.
    1096           0 :     int16_t *src_vert = im_block + fo_vert * im_stride;
    1097           0 :     const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    1098           0 :     const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    1099             :         *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    1100           0 :     for (y = 0; y < h; ++y) {
    1101           0 :         for (x = 0; x < w; ++x) {
    1102           0 :             int32_t sum = 1 << offset_bits;
    1103           0 :             for (k = 0; k < filter_params_y->taps; ++k)
    1104           0 :                 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
    1105           0 :             assert(0 <= sum && sum < (1 << (offset_bits + 2)));
    1106           0 :             ConvBufType res = (ConvBufType)ROUND_POWER_OF_TWO(sum, conv_params->round_1);
    1107           0 :             if (conv_params->do_average) {
    1108           0 :                 int32_t tmp = dst[y * dst_stride + x];
    1109           0 :                 if (conv_params->use_jnt_comp_avg) {
                                           // Weighted blend: fwd_offset scales the stored value,
                                           // bck_offset scales the value just computed.
    1110           0 :                     tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    1111           0 :                     tmp = tmp >> DIST_PRECISION_BITS;
    1112             :                 }
    1113             :                 else {
                                           // Plain average of the two intermediates.
    1114           0 :                     tmp += res;
    1115           0 :                     tmp = tmp >> 1;
    1116             :                 }
                                       // Subtract 1.5 * 2^(offset_bits - round_1), then round by
                                       // round_bits and clip to bd bits.
    1117           0 :                 tmp -= (1 << (offset_bits - conv_params->round_1)) +
    1118           0 :                     (1 << (offset_bits - conv_params->round_1 - 1));
    1119           0 :                 dst16[y * dst16_stride + x] =
    1120           0 :                     clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    1121             :             }
    1122             :             else
                                       // Not averaging: keep the intermediate, offset still included.
    1123           0 :                 dst[y * dst_stride + x] = res;
    1124             :         }
    1125             :     }
    1126           0 : }
    1127             : 
    1128             : aom_highbd_convolve_fn_t convolveHbd[/*subX*/2][/*subY*/2][/*bi*/2];
                       // Dispatch table for the 16-bit predictor. svt_highbd_inter_predictor
                       // indexes it as [subpel_x != 0][subpel_y != 0][conv_params->is_compound].
    1129           3 : void asmSetConvolveHbdAsmTable(void)
    1130             : {
                           // Fills convolveHbd. The first two indices choose copy / y-only /
                           // x-only / 2d; the last index chooses the *_sr (0) or jnt_* (1) function.
    1131           3 :     convolveHbd[0][0][0] = eb_av1_highbd_convolve_2d_copy_sr;
    1132           3 :     convolveHbd[0][0][1] = eb_av1_highbd_jnt_convolve_2d_copy;
    1133             : 
    1134           3 :     convolveHbd[0][1][0] = eb_av1_highbd_convolve_y_sr;
    1135           3 :     convolveHbd[0][1][1] = eb_av1_highbd_jnt_convolve_y;
    1136             : 
    1137           3 :     convolveHbd[1][0][0] = eb_av1_highbd_convolve_x_sr;
    1138           3 :     convolveHbd[1][0][1] = eb_av1_highbd_jnt_convolve_x;
    1139             : 
    1140           3 :     convolveHbd[1][1][0] = eb_av1_highbd_convolve_2d_sr;
    1141           3 :     convolveHbd[1][1][1] = eb_av1_highbd_jnt_convolve_2d;
    1142           3 : }
    1143             : 
    1144             : aom_convolve_fn_t convolve[/*subX*/2][/*subY*/2][/*bi*/2];
                       // Dispatch table for the uint8_t predictor. svt_inter_predictor indexes
                       // it as [subpel_x != 0][subpel_y != 0][conv_params->is_compound].
    1145           3 : void asmSetConvolveAsmTable(void)
    1146             : {
                           // Fills convolve. The first two indices choose copy / y-only /
                           // x-only / 2d; the last index chooses the *_sr (0) or jnt_* (1) function.
    1147           3 :     convolve[0][0][0] = eb_av1_convolve_2d_copy_sr;
    1148           3 :     convolve[0][0][1] = eb_av1_jnt_convolve_2d_copy;
    1149             : 
    1150           3 :     convolve[0][1][0] = eb_av1_convolve_y_sr;
    1151           3 :     convolve[0][1][1] = eb_av1_jnt_convolve_y;
    1152             : 
    1153           3 :     convolve[1][0][0] = eb_av1_convolve_x_sr;
    1154           3 :     convolve[1][0][1] = eb_av1_jnt_convolve_x;
    1155             : 
    1156           3 :     convolve[1][1][0] = eb_av1_convolve_2d_sr;
    1157           3 :     convolve[1][1][1] = eb_av1_jnt_convolve_2d;
    1158           3 : }
    1159             : 
    1160             : InterpFilterParams av1RegularFilter = { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, EIGHTTAP_REGULAR };
                       // ^ EIGHTTAP_REGULAR parameters backed by the sub_pel_filters_8 table.
    1161             : InterpFilterParams av1RegularFilterW4 = { (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, EIGHTTAP_REGULAR };
                       // ^ Same filter type backed by sub_pel_filters_4; presumably the variant
                       //   for width-4 blocks (TODO confirm against callers).
    1162             : 
    1163             : DECLARE_ALIGNED(256, const InterpKernel,
    1164             : sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
                       // Kernel table referenced by the MULTITAP_SHARP entry of
                       // av1_interp_filter_params_list. It has one 8-coefficient row per
                       // sub-pel position, and the coefficients of every row sum to 128.
                       // Row 0 is the identity kernel.
    1165             : { 0, 0, 0, 128, 0, 0, 0, 0 },         { -2, 2, -6, 126, 8, -2, 2, 0 },
    1166             : { -2, 6, -12, 124, 16, -6, 4, -2 },   { -2, 8, -18, 120, 26, -10, 6, -2 },
    1167             : { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
    1168             : { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
    1169             : { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
    1170             : { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
    1171             : { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
    1172             : { -2, 4, -6, 16, 124, -12, 6, -2 },   { 0, 2, -2, 8, 126, -6, 2, -2 }
    1173             : };
    1174             : 
    1175             : DECLARE_ALIGNED(256, const InterpKernel,
    1176             : sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
                       // Kernel table referenced by the EIGHTTAP_SMOOTH entry of
                       // av1_interp_filter_params_list. It has one 8-coefficient row per
                       // sub-pel position, and the coefficients of every row sum to 128.
                       // The first and last coefficient are zero in every row.
    1177             : { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 2, 28, 62, 34, 2, 0, 0 },
    1178             : { 0, 0, 26, 62, 36, 4, 0, 0 },    { 0, 0, 22, 62, 40, 4, 0, 0 },
    1179             : { 0, 0, 20, 60, 42, 6, 0, 0 },    { 0, 0, 18, 58, 44, 8, 0, 0 },
    1180             : { 0, 0, 16, 56, 46, 10, 0, 0 },   { 0, -2, 16, 54, 48, 12, 0, 0 },
    1181             : { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
    1182             : { 0, 0, 10, 46, 56, 16, 0, 0 },   { 0, 0, 8, 44, 58, 18, 0, 0 },
    1183             : { 0, 0, 6, 42, 60, 20, 0, 0 },    { 0, 0, 4, 40, 62, 22, 0, 0 },
    1184             : { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
    1185             : };
    1186             : DECLARE_ALIGNED(256, const InterpKernel,
    1187             : bilinear_filters[SUBPEL_SHIFTS]) = {
                       // Kernel table referenced by the BILINEAR entry of
                       // av1_interp_filter_params_list. Only coefficients 3 and 4 are non-zero.
                       // From one row to the next, a weight of 8 moves from coefficient 3 to
                       // coefficient 4, and every row sums to 128.
    1188             : { 0, 0, 0, 128, 0, 0, 0, 0 },  { 0, 0, 0, 120, 8, 0, 0, 0 },
    1189             : { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
    1190             : { 0, 0, 0, 96, 32, 0, 0, 0 },  { 0, 0, 0, 88, 40, 0, 0, 0 },
    1191             : { 0, 0, 0, 80, 48, 0, 0, 0 },  { 0, 0, 0, 72, 56, 0, 0, 0 },
    1192             : { 0, 0, 0, 64, 64, 0, 0, 0 },  { 0, 0, 0, 56, 72, 0, 0, 0 },
    1193             : { 0, 0, 0, 48, 80, 0, 0, 0 },  { 0, 0, 0, 40, 88, 0, 0, 0 },
    1194             : { 0, 0, 0, 32, 96, 0, 0, 0 },  { 0, 0, 0, 24, 104, 0, 0, 0 },
    1195             : { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
    1196             : };
    1197             : DECLARE_ALIGNED(256, const InterpKernel,
    1198             : sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
                       // Kernel table referenced by av1_interp_4tap[1] (EIGHTTAP_SMOOTH).
                       // Rows are still 8 coefficients wide, but the two outer coefficients on
                       // each side are zero in every row, leaving at most four non-zero taps.
                       // Every row sums to 128.
    1199             : { 0, 0, 0, 128, 0, 0, 0, 0 },   { 0, 0, 30, 62, 34, 2, 0, 0 },
    1200             : { 0, 0, 26, 62, 36, 4, 0, 0 },  { 0, 0, 22, 62, 40, 4, 0, 0 },
    1201             : { 0, 0, 20, 60, 42, 6, 0, 0 },  { 0, 0, 18, 58, 44, 8, 0, 0 },
    1202             : { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
    1203             : { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
    1204             : { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
    1205             : { 0, 0, 6, 42, 60, 20, 0, 0 },  { 0, 0, 4, 40, 62, 22, 0, 0 },
    1206             : { 0, 0, 4, 36, 62, 26, 0, 0 },  { 0, 0, 2, 34, 62, 30, 0, 0 }
    1207             : };
    1208             : static const InterpFilterParams
    1209             : av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
                         // Default filter parameters, indexed directly by an InterpFilter value
                         // in av1_get_interp_filter_params_with_block_size. The entry order
                         // (regular, smooth, sharp, bilinear) therefore has to match the
                         // numeric order of those enum values.
    1210             :   { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
    1211             :     EIGHTTAP_REGULAR },
    1212             :   { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
    1213             :     EIGHTTAP_SMOOTH },
    1214             :   { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
    1215             :     MULTITAP_SHARP },
    1216             :   { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
    1217             :     BILINEAR }
    1218             : };
    1219             : static const InterpFilterParams av1_interp_4tap[2] = {
                         // Filter parameters that
                         // av1_get_interp_filter_params_with_block_size returns when the block
                         // dimension is <= 4: [0] for regular/sharp, [1] for smooth.
                         // The tap-count field is still SUBPEL_TAPS; the kernels remain 8 wide.
    1220             :   { (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
    1221             :     EIGHTTAP_REGULAR },
    1222             :   { (const int16_t *)sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
    1223             :     EIGHTTAP_SMOOTH },
    1224             : };
    1225   879893000 : InterpFilterParams av1_get_interp_filter_params_with_block_size(
    1226             :     const InterpFilter interp_filter, const int32_t w) {
                           // Returns, by value, the filter parameters for interp_filter given
                           // one block dimension w.
                           //   w <= 4 and MULTITAP_SHARP or EIGHTTAP_REGULAR -> av1_interp_4tap[0]
                           //   w <= 4 and EIGHTTAP_SMOOTH                    -> av1_interp_4tap[1]
                           //   anything else, including w <= 4 with BILINEAR ->
                           //       av1_interp_filter_params_list[interp_filter]
                           // interp_filter is used as an array index without a range check.
    1227   879893000 :     if (w <= 4 &&
    1228    47716100 :         (interp_filter == MULTITAP_SHARP || interp_filter == EIGHTTAP_REGULAR))
    1229    42334000 :         return av1_interp_4tap[0];
    1230   837559000 :     else if (w <= 4 && interp_filter == EIGHTTAP_SMOOTH)
    1231       80053 :         return av1_interp_4tap[1];
    1232             : 
    1233   837479000 :     return av1_interp_filter_params_list[interp_filter];
    1234             : }
    1235             : 
    1236   444419000 : void av1_get_convolve_filter_params( uint32_t interp_filters,
    1237             :     InterpFilterParams *params_x, InterpFilterParams *params_y,
    1238             :     int32_t w, int32_t h)
    1239             : {
                           // Splits the packed interp_filters value into its two filters and
                           // writes the matching parameters to *params_x and *params_y.
                           // The x filter is extracted with selector 1 and looked up for
                           // width w; the y filter uses selector 0 and height h.
    1240   444419000 :     InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
    1241   442871000 :     InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
    1242   441514000 :     *params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
    1243   442093000 :     *params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
    1244   442438000 : }
    1245             : 
    1246             : int32_t is_inter_block(const BlockModeInfo *mbmi);
    1247             : BlockSize scale_chroma_bsize(BlockSize bsize, int32_t subsampling_x,
    1248             :     int32_t subsampling_y);
                       // ^ Prototypes only; the definitions of these two functions are not
                       //   visible in this part of the file.
    1249             : 
    1250             : // A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel
    1251             : // MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV.
                       // The two taps are equal (64, 64).
    1252             : DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = {
    1253             :   64,
    1254             :   64,
    1255             : };
    1256             : 
    1257             : static const InterpFilterParams av1_intrabc_filter_params = {
                         // Initialiser order matches the tables above: kernel pointer, then 2
                         // where they pass SUBPEL_TAPS, then 0 where they pass SUBPEL_SHIFTS,
                         // then the filter type.
    1258             :   av1_intrabc_bilinear_filter, 2, 0, BILINEAR
    1259             : };
    1260           0 : static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
    1261             :     uint8_t *dst, int dst_stride, int w, int h,
    1262             :     int subpel_x_q4, int subpel_y_q4,
    1263             :     ConvolveParams *conv_params)
    1264             : {
                           // IntraBC path for uint8_t samples. The 2-tap
                           // av1_intrabc_filter_params is used in each direction whose sub-pel
                           // value is non-zero, and NULL is passed for the other direction.
                           // The sub-pel arguments handed to the C convolve functions are
                           // always 0.
    1265           0 :     const InterpFilterParams *filter_params_x =
    1266           0 :         subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
    1267           0 :     const InterpFilterParams *filter_params_y =
    1268           0 :         subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
                           // The casts below drop const to match the callee parameter types.
    1269           0 :     if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
    1270           0 :         eb_av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
    1271             :             (InterpFilterParams *)filter_params_x, (InterpFilterParams *)filter_params_y, 0, 0, conv_params);
    1272             :     }
    1273           0 :     else if (subpel_x_q4 != 0) {
    1274           0 :         eb_av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, (InterpFilterParams *)filter_params_x,
    1275             :             (InterpFilterParams *)filter_params_y, 0, 0, conv_params);
    1276             :     }
    1277             :     else {
                               // Reached when subpel_x_q4 == 0, so filter_params_x is NULL here.
                               // The callers in this file only enter this function when at least
                               // one sub-pel value is non-zero.
    1278           0 :         eb_av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, (InterpFilterParams *)filter_params_x,
    1279             :             (InterpFilterParams *)filter_params_y, 0, 0, conv_params);
    1280             :     }
    1281           0 : }
    1282           0 : static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
    1283             :     uint16_t *dst, int dst_stride, int w,
    1284             :     int h, int subpel_x_q4,
    1285             :     int subpel_y_q4,
    1286             :     ConvolveParams *conv_params,
    1287             :     int bd) {
                           // IntraBC path for uint16_t samples. The 2-tap
                           // av1_intrabc_filter_params is used in each direction whose sub-pel
                           // value is non-zero, and NULL is passed for the other direction.
                           // The sub-pel arguments handed to the C convolve functions are
                           // always 0; bd is forwarded unchanged.
    1288           0 :     const InterpFilterParams *filter_params_x =
    1289           0 :         subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
    1290           0 :     const InterpFilterParams *filter_params_y =
    1291           0 :         subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
    1292           0 :     if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
    1293           0 :         eb_av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
    1294             :             filter_params_x, filter_params_y, 0, 0,
    1295             :             conv_params, bd);
    1296             :     }
    1297           0 :     else if (subpel_x_q4 != 0) {
    1298           0 :         eb_av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
    1299             :             filter_params_x, filter_params_y, 0, 0,
    1300             :             conv_params, bd);
    1301             :     }
    1302             :     else {
                               // Reached when subpel_x_q4 == 0, so filter_params_x is NULL here.
                               // The callers in this file only enter this function when at least
                               // one sub-pel value is non-zero.
    1303           0 :         eb_av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
    1304             :             filter_params_x, filter_params_y, 0, 0,
    1305             :             conv_params, bd);
    1306             :     }
    1307           0 : }
    1308             : 
    1309       39670 : void svt_inter_predictor(const uint8_t *src, int32_t src_stride,
    1310             :     uint8_t *dst, int32_t dst_stride, const SubpelParams *subpel_params,
    1311             :     const ScaleFactors *sf, int32_t w, int32_t h, ConvolveParams *conv_params,
    1312             :     InterpFilters interp_filters, int32_t is_intrabc)
    1313             : {
                           // Builds a w x h prediction block of uint8_t samples from src into
                           // dst, picking the convolve routine as follows:
                           //   - scaled reference (has_scale() non-zero): eb_av1_convolve_2d_scale;
                           //   - unscaled IntraBC with a non-zero sub-pel part:
                           //     convolve_2d_for_intrabc;
                           //   - otherwise: the convolve[][][] entry selected by whether
                           //     subpel_x / subpel_y are non-zero and by conv_params->is_compound.
                           // sf is only checked by an assert and is otherwise unused.
    1314             :     InterpFilterParams filter_params_x, filter_params_y;
    1315       39670 :     const int32_t is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
    1316             : 
    1317       39670 :     av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    1318             :         &filter_params_y, w, h);
    1319             : 
    1320       39670 :     assert(conv_params->do_average == 0 || conv_params->do_average == 1);
    1321       39670 :     assert(sf);
    1322             :     UNUSED(sf);
    1323       39670 :     assert(IMPLIES(is_intrabc, !is_scaled));
    1324             : 
    1325       39670 :     if (is_scaled) {
                               // NOTE(review): the assert above states that is_intrabc implies
                               // !is_scaled, so this IntraBC branch is only reachable when
                               // asserts are compiled out.
    1326           0 :         if (is_intrabc && (subpel_params->subpel_x != 0 ||
    1327           0 :             subpel_params->subpel_y != 0))
    1328             :         {
    1329           0 :             convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
    1330             :                 subpel_params->subpel_x, subpel_params->subpel_y, conv_params);
    1331           0 :             return;
    1332             :         }
    1333           0 :         if (conv_params->is_compound) {
    1334           0 :             assert(conv_params->dst != NULL);
    1335             :         }
    1336           0 :         eb_av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
    1337             :             &filter_params_x, &filter_params_y, subpel_params->subpel_x,
    1338             :             subpel_params->xs, subpel_params->subpel_y,
    1339             :             subpel_params->ys, conv_params);
    1340             :     }
    1341             :     else {
                               // Work on a local copy so the caller's SubpelParams stay untouched
                               // when revert_scale_extra_bits() rewrites the fields.
    1342       39670 :         SubpelParams sp = *subpel_params;
    1343       39670 :         revert_scale_extra_bits(&sp);
    1344             : 
    1345       39670 :         if (is_intrabc && (sp.subpel_x != 0 || sp.subpel_y != 0)) {
    1346           0 :             convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
    1347             :                 sp.subpel_x, sp.subpel_y, conv_params);
    1348           0 :             return;
    1349             :         }
    1350             : 
    1351       39670 :         convolve[sp.subpel_x != 0][sp.subpel_y != 0][conv_params->is_compound](
    1352             :             src, src_stride, dst, dst_stride, w, h, &filter_params_x,
    1353             :             &filter_params_y, sp.subpel_x, sp.subpel_y, conv_params);
    1354             :     }
    1355             : }
    1356             : 
    1357           0 : void svt_highbd_inter_predictor(const uint16_t *src, int32_t src_stride,
    1358             :     uint16_t *dst, int32_t dst_stride, const SubpelParams *subpel_params,
    1359             :     const ScaleFactors *sf, int32_t w, int32_t h, ConvolveParams *conv_params,
    1360             :     InterpFilters interp_filters, int32_t is_intrabc, int32_t bd)
    1361             : {
                           // Builds a w x h prediction block of uint16_t samples from src into
                           // dst, picking the convolve routine as follows; bd is passed through
                           // to whichever routine is chosen:
                           //   - scaled reference (has_scale() non-zero):
                           //     eb_av1_highbd_convolve_2d_scale;
                           //   - unscaled IntraBC with a non-zero sub-pel part:
                           //     highbd_convolve_2d_for_intrabc;
                           //   - otherwise: the convolveHbd[][][] entry selected by whether
                           //     subpel_x / subpel_y are non-zero and by conv_params->is_compound.
                           // sf is only checked by an assert and is otherwise unused.
    1362             : 
    1363             :     InterpFilterParams filter_params_x, filter_params_y;
    1364           0 :     const int32_t is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
    1365             : 
    1366           0 :     av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    1367             :         &filter_params_y, w, h);
    1368             : 
    1369           0 :     assert(conv_params->do_average == 0 || conv_params->do_average == 1);
    1370           0 :     assert(sf);
    1371             :     UNUSED(sf);
    1372           0 :     assert(IMPLIES(is_intrabc, !is_scaled));
    1373             : 
    1374           0 :     if (is_scaled) {
                               // NOTE(review): the assert above states that is_intrabc implies
                               // !is_scaled, so this IntraBC branch is only reachable when
                               // asserts are compiled out.
    1375           0 :         if (is_intrabc && (subpel_params->subpel_x != 0 ||
    1376           0 :             subpel_params->subpel_y != 0))
    1377             :         {
    1378           0 :             highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride,
    1379             :                 w, h, subpel_params->subpel_x, subpel_params->subpel_y,
    1380             :                 conv_params, bd);
    1381           0 :             return;
    1382             :         }
    1383           0 :         if (conv_params->is_compound) {
    1384           0 :             assert(conv_params->dst != NULL);
    1385             :         }
    1386           0 :         eb_av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
    1387             :             &filter_params_x, &filter_params_y, subpel_params->subpel_x,
    1388             :             subpel_params->xs, subpel_params->subpel_y,
    1389             :             subpel_params->ys, conv_params, bd);
    1390             :     }
    1391             :     else {
                               // Work on a local copy so the caller's SubpelParams stay untouched
                               // when revert_scale_extra_bits() rewrites the fields.
    1392           0 :         SubpelParams sp = *subpel_params;
    1393           0 :         revert_scale_extra_bits(&sp);
    1394             : 
    1395           0 :         if (is_intrabc && (sp.subpel_x != 0 || sp.subpel_y != 0)) {
    1396           0 :             highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, sp.subpel_x,
    1397             :                 sp.subpel_y, conv_params, bd);
    1398           0 :             return;
    1399             :         }
    1400             : 
    1401           0 :         convolveHbd[sp.subpel_x != 0][sp.subpel_y != 0][conv_params->is_compound](
    1402             :             src, src_stride, dst, dst_stride, w, h, &filter_params_x,
    1403             :             &filter_params_y, sp.subpel_x, sp.subpel_y, conv_params, bd);
    1404             :     }
    1405             : }
    1406             : #define USE_PRECOMPUTED_WEDGE_SIGN 1
    1407             : #define USE_PRECOMPUTED_WEDGE_MASK 1
                       // USE_PRECOMPUTED_WEDGE_MASK selects the table-driven master-mask setup in
                       // init_wedge_master_masks(); with 0 that function computes the masks with
                       // tanh() instead. USE_PRECOMPUTED_WEDGE_SIGN is not referenced in this
                       // part of the file.
    1408             : 
    1409             : #if USE_PRECOMPUTED_WEDGE_MASK
                       // One-row master profiles of MASK_MASTER_SIZE weights. Each ramps from 0
                       // up to 64 around the middle of the row. init_wedge_master_masks() copies
                       // them row by row into wedge_mask_obl: the oblique profiles through
                       // shift_copy() with a per-row shift, the vertical one with plain memcpy().
    1410             : static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
    1411             :   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1412             :   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  6,  18,
    1413             :   37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    1414             :   64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    1415             : };
    1416             : static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = {
    1417             :   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1418             :   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  4,  11, 27,
    1419             :   46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    1420             :   64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    1421             : };
    1422             : static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = {
    1423             :   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1424             :   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  21,
    1425             :   43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    1426             :   64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    1427             : };
    1428             : 
    1429             : 
    1430         864 : void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    1431             :     ptrdiff_t dst_stride, const int16_t *filter_x,
    1432             :     int filter_x_stride, const int16_t *filter_y,
    1433             :     int filter_y_stride, int w, int h) {
                           // Copies h rows of w bytes from src to dst, advancing each pointer by
                           // its own stride after every row. The four filter arguments are
                           // ignored. Rows are copied with memcpy(), so a source row and its
                           // destination row must not overlap.
    1434             :     int r;
    1435             : 
    1436             :     (void)filter_x;
    1437             :     (void)filter_x_stride;
    1438             :     (void)filter_y;
    1439             :     (void)filter_y_stride;
    1440             : 
    1441       16992 :     for (r = h; r > 0; --r) {
    1442       16128 :         memcpy(dst, src, w);
    1443       16128 :         src += src_stride;
    1444       16128 :         dst += dst_stride;
    1445             :     }
    1446         864 : }
    1447             : 
    1448         192 : static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
                           // Copies a row of `width` bytes from src to dst, displaced by `shift`
                           // positions, and pads the vacated end with the nearest edge value.
                           //   shift >= 0: dst[shift..width) = src[0..width - shift), and the
                           //               first `shift` bytes of dst are set to src[0].
                           //   shift <  0: with s = -shift, dst[0..width - s) = src[s..width),
                           //               and the last s bytes of dst are set to src[width - 1].
                           // |shift| is not checked against width.
    1449         192 :     if (shift >= 0) {
    1450          99 :         memcpy(dst + shift, src, width - shift);
    1451          99 :         memset(dst, src[0], shift);
    1452             :     }
    1453             :     else {
    1454          93 :         shift = -shift;
    1455          93 :         memcpy(dst, src + shift, width - shift);
    1456          93 :         memset(dst + width - shift, src[width - 1], shift);
    1457             :     }
    1458         192 : }
    1459             : #endif  // USE_PRECOMPUTED_WEDGE_MASK
    1460             : 
    1461             : 
    1462             : // [negative][direction]
                       // Index [0] holds the master masks; init_wedge_master_masks() describes
                       // index [1] as their complement. Each mask is MASK_MASTER_SIZE *
                       // MASK_MASTER_SIZE bytes.
    1463             : DECLARE_ALIGNED(
    1464             : 16, static uint8_t,
    1465             : wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
    1466             : 
    1467             : // 4 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound
    1468             : // on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
    1469             : DECLARE_ALIGNED(16, static uint8_t,
    1470             : wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
    1471             : // Fills wedge_mask_obl for all wedge directions. The WEDGE_OBLIQUE63 and WEDGE_VERTICAL masters are generated first; the remaining directions and the [1] complement planes are then derived from those two.
    1472           3 : static void init_wedge_master_masks() {
    1473             :     int i, j;
    1474           3 :     const int w = MASK_MASTER_SIZE;
    1475           3 :     const int h = MASK_MASTER_SIZE;
    1476           3 :     const int stride = MASK_MASTER_STRIDE;
    1477             :     // Note: index [0] stores the masters, and [1] its complement.
    1478             : #if USE_PRECOMPUTED_WEDGE_MASK
    1479             :   // Generate prototype by shifting the masters: each pass writes an even row with the current shift, decrements the shift, then writes the odd row; both WEDGE_VERTICAL rows are plain copies of wedge_master_vertical.
    1480           3 :     int shift = h / 4;
    1481          99 :     for (i = 0; i < h; i += 2) {
    1482          96 :         shift_copy(wedge_master_oblique_even,
    1483          96 :             &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift,
    1484             :             MASK_MASTER_SIZE);
    1485          96 :         shift--; // the odd row of the pair uses the decremented shift
    1486          96 :         shift_copy(wedge_master_oblique_odd,
    1487          96 :             &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift,
    1488             :             MASK_MASTER_SIZE);
    1489          96 :         memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride],
    1490             :             wedge_master_vertical,
    1491             :             MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
    1492          96 :         memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride],
    1493             :             wedge_master_vertical,
    1494             :             MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
    1495             :     }
    1496             : #else
    1497             :     static const double smoother_param = 2.85;
    1498             :     const int a[2] = { 2, 1 }; // direction vector used for the oblique projection below
    1499             :     const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]); // length of a, used to normalize d
    1500             :     for (i = 0; i < h; i++) {
    1501             :         for (j = 0; j < w; ++j) {
    1502             :             int x = (2 * j + 1 - w); // column position relative to the mask center, in doubled units
    1503             :             int y = (2 * i + 1 - h); // row position relative to the mask center, in doubled units
    1504             :             double d = (a[0] * x + a[1] * y) / asqrt; // projection of (x, y) onto a, normalized by |a|
    1505             :             const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32); // tanh ramp mapped to 0..64
    1506             :             wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk;
    1507             :             const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32); // same ramp, using x only
    1508             :             wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx;
    1509             :         }
    1510             :     }
    1511             : #endif  // USE_PRECOMPUTED_WEDGE_MASK
    1512         195 :     for (i = 0; i < h; ++i) {
    1513       12480 :         for (j = 0; j < w; ++j) {
    1514       12288 :             const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
    1515       12288 :             wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk; // transpose of OBLIQUE63
    1516       12288 :             wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
    1517       12288 :                 wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
    1518       12288 :                 (1 << WEDGE_WEIGHT_BITS) - msk; // complemented weight stored at mirrored positions
    1519       12288 :             wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] =
    1520       12288 :                 wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] =
    1521       12288 :                 (1 << WEDGE_WEIGHT_BITS) - msk; // [1] planes hold the complement of the [0] masters
    1522       12288 :             wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
    1523       12288 :                 wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk; // complement of the complement is msk itself
    1524       12288 :             const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j];
    1525       12288 :             wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx; // transpose of VERTICAL
    1526       12288 :             wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] =
    1527       12288 :                 wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] =
    1528       12288 :                 (1 << WEDGE_WEIGHT_BITS) - mskx;
    1529             :         }
    1530             :     }
    1531           3 : }
    1532             : 
    1533             : #if !USE_PRECOMPUTED_WEDGE_SIGN
    1534             : // Sets the sign-flip flag of every wedge from the average weight along the top row and left column of its master mask.
    1535             : // If the signs for the wedges for various block sizes are inconsistent, flip the sign flag. Do it only once for every wedge codebook.
    1536             : // NOTE(review): this disabled branch uses BLOCK_SIZE / BLOCK_SIZES_ALL / wedge_params_type while init_wedge_masks() uses BlockSize / BlockSizeS_ALL / WedgeParamsType, and it writes signflip through a const local copy; it may not build when USE_PRECOMPUTED_WEDGE_SIGN is 0 - confirm.
    1537             : static void init_wedge_signs() {
    1538             :     BLOCK_SIZE sb_type;
    1539             :     memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
    1540             :     for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES_ALL; ++sb_type) {
    1541             :         const int bw = block_size_wide[sb_type];
    1542             :         const int bh = block_size_high[sb_type];
    1543             :         const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
    1544             :         const int wbits = wedge_params.bits;
    1545             :         const int wtypes = 1 << wbits;
    1546             :         int i, w;
    1547             :         if (wbits) {
    1548             :             for (w = 0; w < wtypes; ++w) {
    1549             :                 // Get the mask master, i.e. index [0]
    1550             :                 const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
    1551             :                 int avg = 0;
    1552             :                 for (i = 0; i < bw; ++i) avg += mask[i]; // sum of the first row
    1553             :                 for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE]; // plus the first column, rows 1..bh-1
    1554             :                 avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1); // rounded mean over the bw + bh - 1 samples
    1555             :                 // Default sign of this wedge is 1 if the average < 32, 0 otherwise.
    1556             :                 // If default sign is 1:
    1557             :                 //   If sign requested is 0, we need to flip the sign and return
    1558             :                 //   the complement i.e. index [1] instead. If sign requested is 1
    1559             :                 //   we need to flip the sign and return index [0] instead.
    1560             :                 // If default sign is 0:
    1561             :                 //   If sign requested is 0, we need to return index [0] the master
    1562             :                 //   if sign requested is 1, we need to return the complement index [1]
    1563             :                 //   instead.
    1564             :                 wedge_params.signflip[w] = (avg < 32);
    1565             :             }
    1566             :         }
    1567             :     }
    1568             : }
    1569             : #endif  // !USE_PRECOMPUTED_WEDGE_SIGN
    1570             : // Returns the 'bits' field of the wedge_params_lookup entry for sb_type; callers in this file use 1 << bits as the number of wedge types.
    1571    15979100 : static INLINE int get_wedge_bits_lookup(BlockSize sb_type) {
    1572    15979100 :     return wedge_params_lookup[sb_type].bits;
    1573             : }
    1574             : // Returns a pointer into wedge_mask_obl (row stride MASK_MASTER_STRIDE) for codebook entry wedge_index of sb_type. 'neg' XOR the entry's signflip selects the [0] master or [1] complement plane; the pointer is positioned so that the master's center falls at (woff, hoff) inside the block.
    1575         864 : static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
    1576             :     BlockSize sb_type) {
    1577             :     const uint8_t *master;
    1578         864 :     const int bh = block_size_high[sb_type];
    1579         864 :     const int bw = block_size_wide[sb_type];
    1580         864 :     const WedgeCodeType *a =
    1581         864 :         wedge_params_lookup[sb_type].codebook + wedge_index;
    1582             :     int woff, hoff;
    1583         864 :     const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index]; // NOTE(review): indexed before the range assert below
    1584             : 
    1585         864 :     assert(wedge_index >= 0 &&
    1586             :         wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
    1587         864 :     woff = (a->x_offset * bw) >> 3; // codebook offsets are scaled by block size / 8
    1588         864 :     hoff = (a->y_offset * bh) >> 3;
    1589         864 :     master = wedge_mask_obl[neg ^ wsignflip][a->direction] +
    1590         864 :         MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
    1591         864 :         MASK_MASTER_SIZE / 2 - woff;
    1592         864 :     return master;
    1593             : }
    1594             : // For every block size with non-zero wedge bits and every wedge index, copies the bw x bh window of the master mask (sign 0, then sign 1) into wedge_mask_buf with stride bw and records each copy's address in wedge_params->masks[sign][w].
    1595           3 : static void init_wedge_masks() {
    1596           3 :     uint8_t *dst = wedge_mask_buf; // write cursor; advances by bw * bh after each copied mask
    1597             :     BlockSize bsize;
    1598           3 :     memset(wedge_masks, 0, sizeof(wedge_masks));
    1599          69 :     for (bsize = BLOCK_4X4; bsize < BlockSizeS_ALL; ++bsize) {
    1600             :         const uint8_t *mask;
    1601          66 :         const int bw = block_size_wide[bsize];
    1602          66 :         const int bh = block_size_high[bsize];
    1603          66 :         const WedgeParamsType *wedge_params = &wedge_params_lookup[bsize];
    1604          66 :         const int wbits = wedge_params->bits;
    1605          66 :         const int wtypes = 1 << wbits;
    1606             :         int w;
    1607          66 :         if (wbits == 0) continue; // no wedge masks are stored for this block size
    1608         459 :         for (w = 0; w < wtypes; ++w) {
    1609         432 :             mask = get_wedge_mask_inplace(w, 0, bsize);
    1610         432 :             aom_convolve_copy_c(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
    1611             :                 bh);
    1612         432 :             wedge_params->masks[0][w] = dst;
    1613         432 :             dst += bw * bh;
    1614             : 
    1615         432 :             mask = get_wedge_mask_inplace(w, 1, bsize);
    1616         432 :             aom_convolve_copy_c(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
    1617             :                 bh);
    1618         432 :             wedge_params->masks[1][w] = dst;
    1619         432 :             dst += bw * bh;
    1620             :         }
    1621          27 :         assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf)); // checked after the copies for this block size have already been written
    1622             :     }
    1623           3 : }
    1624             : 
    1625             : // Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
    1626           3 : void av1_init_wedge_masks() { // Runs the wedge table setup in order: master masks, sign flips (only when not precomputed), then the per-block-size masks.
    1627           3 :     init_wedge_master_masks();
    1628             : #if !USE_PRECOMPUTED_WEDGE_SIGN
    1629             :     init_wedge_signs();
    1630             : #endif  // !USE_PRECOMPUTED_WEDGE_SIGN
    1631           3 :     init_wedge_masks();
    1632           3 : }
    1633             : // Builds an h x w mask (written densely, row stride w) from the absolute difference of two CONV_BUF_TYPE predictions: m = clamp(mask_base + scaled_diff / DIFF_FACTOR, 0, AOM_BLEND_A64_MAX_ALPHA); when which_inverse is non-zero AOM_BLEND_A64_MAX_ALPHA - m is stored instead.
    1634           0 : static void diffwtd_mask_d16(uint8_t *mask, int which_inverse, int mask_base,
    1635             :     const CONV_BUF_TYPE *src0, int src0_stride,
    1636             :     const CONV_BUF_TYPE *src1, int src1_stride, int h,
    1637             :     int w, ConvolveParams *conv_params, int bd) {
    1638           0 :     int round =
    1639           0 :         2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); // shift applied to the difference before it is divided by DIFF_FACTOR
    1640             :     int i, j, m, diff;
    1641           0 :     for (i = 0; i < h; ++i) {
    1642           0 :         for (j = 0; j < w; ++j) {
    1643           0 :             diff = abs(src0[i * src0_stride + j] - src1[i * src1_stride + j]);
    1644           0 :             diff = ROUND_POWER_OF_TWO(diff, round);
    1645           0 :             m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
    1646           0 :             mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
    1647             :         }
    1648             :     }
    1649           0 : }
    1650             : // C implementation that dispatches on mask_type: DIFFWTD_38 builds the mask with base 38, DIFFWTD_38_INV builds the inverted mask with the same base; any other value only trips assert(0).
    1651           0 : void av1_build_compound_diffwtd_mask_d16_c(
    1652             :     uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
    1653             :     int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
    1654             :     ConvolveParams *conv_params, int bd) {
    1655           0 :     switch (mask_type) {
    1656           0 :     case DIFFWTD_38:
    1657           0 :         diffwtd_mask_d16(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w,
    1658             :             conv_params, bd);
    1659           0 :         break;
    1660           0 :     case DIFFWTD_38_INV:
    1661           0 :         diffwtd_mask_d16(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w,
    1662             :             conv_params, bd);
    1663           0 :         break;
    1664           0 :     default: assert(0); // with asserts compiled out the mask is left unwritten
    1665             :     }
    1666           0 : }
    1667             : 
    1668             : int is_masked_compound_type(COMPOUND_TYPE type);
    1669             : 
    1670             : #if II_COMP_FLAG
    1671             : /* clang-format off */
    1672             : static const uint8_t ii_weights1d[MAX_SB_SIZE] = { // 128 weights listed, non-increasing from 60 down to 1; build_smooth_interintra_mask() reads them at (position * size_scale)
    1673             :   60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
    1674             :   31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
    1675             :   16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10,  9,  9,  9,  8,
    1676             :   8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  4,  4,
    1677             :   4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,
    1678             :   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,
    1679             :   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
    1680             : };
    1681             : static uint8_t ii_size_scales[BlockSizeS_ALL] = { // per-block-size step through ii_weights1d, indexed by BlockSize (22 entries listed); NOTE(review): only read in the code visible here, so it could presumably be const - confirm
    1682             :     32, 16, 16, 16, 8, 8, 8, 4,
    1683             :     4,  4,  2,  2,  2, 1, 1, 1,
    1684             :     8,  8,  4,  4,  2, 2
    1685             : };
    1686             : /* clang-format on */
    1687             : // Writes a bw x bh weight mask (rows 'stride' bytes apart) for the given inter-intra mode, using ii_weights1d stepped by ii_size_scales[plane_bsize]; modes other than V/H/SMOOTH get a constant weight of 32.
    1688    24111400 : static void build_smooth_interintra_mask(uint8_t *mask, int stride,
    1689             :                                          BlockSize plane_bsize,
    1690             :                                          INTERINTRA_MODE mode) {
    1691             :   int i, j;
    1692    24111400 :   const int bw = block_size_wide[plane_bsize];
    1693    24111400 :   const int bh = block_size_high[plane_bsize];
    1694    24111400 :   const int size_scale = ii_size_scales[plane_bsize];
    1695             : 
    1696    24111400 :   switch (mode) {
    1697     6223300 :     case II_V_PRED: // weight depends on the row only
    1698    98468000 :       for (i = 0; i < bh; ++i) {
    1699    92244700 :         memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
    1700    92244700 :         mask += stride;
    1701             :       }
    1702     6223300 :       break;
    1703             : 
    1704     7888590 :     case II_H_PRED: // weight depends on the column only
    1705   118706000 :       for (i = 0; i < bh; ++i) {
    1706  1911850000 :         for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
    1707   110817000 :         mask += stride;
    1708             :       }
    1709     7888590 :       break;
    1710             : 
    1711     4990540 :     case II_SMOOTH_PRED: // weight depends on the smaller of row and column
    1712    77351900 :       for (i = 0; i < bh; ++i) {
    1713  1234780000 :         for (j = 0; j < bw; ++j)
    1714  1162420000 :           mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
    1715    72361400 :         mask += stride;
    1716             :       }
    1717     4990540 :       break;
    1718             : 
    1719     5008960 :     case II_DC_PRED:
    1720             :     default: // flat mask: every position gets weight 32
    1721    78322000 :       for (i = 0; i < bh; ++i) {
    1722    73313100 :         memset(mask, 32, bw * sizeof(mask[0]));
    1723    73313100 :         mask += stride;
    1724             :       }
    1725     5008960 :       break;
    1726             :   }
    1727    24111400 : }
    1728             : #endif
    1729   452908000 : static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index, // Returns the mask pointer stored in wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index] (filled by init_wedge_masks()); no range checks are done here.
    1730             :     int wedge_sign,
    1731             :     BlockSize sb_type) {
    1732   452908000 :     return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
    1733             : }
    1734             : 
    1735             : #if COMP_INTERINTRA
    1736           0 : void combine_interintra_highbd( // Blends the intra and inter predictions into comppred8 with aom_highbd_blend_a64_mask, using either the stored wedge mask or a smooth mask built for 'mode'.
    1737             :     InterIntraMode mode, uint8_t use_wedge_interintra, uint8_t wedge_index,
    1738             :     uint8_t wedge_sign, BlockSize bsize, BlockSize plane_bsize,
    1739             :     uint8_t *comppred8, int compstride, const uint8_t *interpred8,
    1740             :     int interstride, const uint8_t *intrapred8, int intrastride, int bd)
    1741             : {
    1742           0 :     const int bw = block_size_wide[plane_bsize];
    1743           0 :     const int bh = block_size_high[plane_bsize];
    1744             : 
    1745           0 :     if (use_wedge_interintra) {
    1746           0 :         if (is_interintra_wedge_used(bsize)) {
    1747             :             const uint8_t *mask =
    1748           0 :                 av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
    1749           0 :             const int subh = 2 * mi_size_high[bsize] == bh; // 1 when the plane height equals 2 * mi_size_high[bsize] (presumably the subsampled case - confirm)
    1750           0 :             const int subw = 2 * mi_size_wide[bsize] == bw; // same test for the width
    1751           0 :             aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8,
    1752             :                 intrastride, interpred8, interstride, mask,
    1753           0 :                 block_size_wide[bsize], bw, bh, subw, subh, bd); // the mask stride is the width of bsize, not of plane_bsize
    1754             :         }
    1755           0 :         return; // NOTE(review): comppred8 is left unwritten when the wedge is requested but not usable for bsize
    1756             :     }
    1757             : 
    1758             :     uint8_t mask[MAX_SB_SQUARE];
    1759           0 :     build_smooth_interintra_mask(mask, bw, plane_bsize, mode); // NOTE(review): that helper is guarded by II_COMP_FLAG and takes INTERINTRA_MODE, while this function is guarded by COMP_INTERINTRA and takes InterIntraMode - confirm both are consistent
    1760           0 :     aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
    1761             :         interpred8, interstride, mask, bw, bw, bh, 0, 0,
    1762             :         bd);
    1763             : }
    1764             : 
    1765             : #endif //comp_interintra
    1766             : // Returns the blend mask for a masked compound: the stored wedge mask for COMPOUND_WEDGE, or the caller-supplied seg_mask for COMPOUND_DIFFWTD; any other type asserts and returns NULL.
    1767    39849500 : const uint8_t *av1_get_compound_type_mask(
    1768             :     const InterInterCompoundData *const comp_data,
    1769             :     uint8_t *seg_mask, BlockSize sb_type)
    1770             : {
    1771    39849500 :     assert(is_masked_compound_type(comp_data->type));
    1772             :     (void)sb_type; // NOTE(review): sb_type is used in the COMPOUND_WEDGE case below, so this cast looks like a leftover
    1773    39872800 :     switch (comp_data->type) {
    1774    13164900 :     case COMPOUND_WEDGE:
    1775    13164900 :         return av1_get_contiguous_soft_mask(comp_data->wedge_index,
    1776    13164900 :             comp_data->wedge_sign, sb_type);
    1777    26707900 :     case COMPOUND_DIFFWTD: return seg_mask;
    1778           0 :     default: assert(0); return NULL;
    1779             :     }
    1780             : }
    1781             : // Blends the two CONV_BUF_TYPE predictions src0 and src1 into dst with the mask selected by comp_data (wedge table entry or seg_mask), calling the high-bit-depth blend when bit_depth > EB_8BIT and the low-bit-depth blend otherwise.
    1782    39851000 : void build_masked_compound_no_round(
    1783             :     uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
    1784             :     const CONV_BUF_TYPE *src1, int src1_stride,
    1785             :     const InterInterCompoundData *const comp_data,
    1786             :     uint8_t *seg_mask,
    1787             :     BlockSize sb_type, int h,
    1788             :     int w, ConvolveParams *conv_params, uint8_t bit_depth)
    1789             : {
    1790             :     // Derive subsampling from h and w passed in. May be refactored to
    1791             :     // pass in subsampling factors directly.
    1792    39851000 :     const int subh = (2 << mi_size_high_log2[sb_type]) == h; // 1 when h equals 2 << mi_size_high_log2[sb_type]
    1793    39851000 :     const int subw = (2 << mi_size_wide_log2[sb_type]) == w; // 1 when w equals 2 << mi_size_wide_log2[sb_type]
    1794    39851000 :     const uint8_t *mask = av1_get_compound_type_mask(comp_data, seg_mask, sb_type);
    1795             : 
    1796    39848200 :     if (bit_depth > EB_8BIT) {
    1797           0 :         aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
    1798           0 :             src1_stride, mask, block_size_wide[sb_type], w,
    1799             :             h, subw, subh, conv_params, bit_depth);
    1800             :     }
    1801             :     else {
    1802    39848200 :         aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
    1803    39848200 :             src1_stride, mask, block_size_wide[sb_type], w,
    1804             :             h, subw, subh, conv_params); // in both branches the mask stride is the width of sb_type, not w
    1805             :     }
    1806    39864000 : }
    1807             : // Masked compound prediction: runs the convolve with conv_params->dst redirected to a local buffer, builds the DIFFWTD mask for plane 0 when required, then blends the prediction previously held in conv_params->dst with the local one into dst_ptr.
    1808    39797900 : void av1_make_masked_inter_predictor(
    1809             :     uint8_t                   *src_ptr,
    1810             :     uint32_t                   src_stride,
    1811             :     uint8_t                   *dst_ptr,
    1812             :     uint32_t                   dst_stride,
    1813             :     const BlockGeom           *blk_geom,
    1814             :     uint8_t                    bwidth,
    1815             :     uint8_t                    bheight,
    1816             :     InterpFilterParams        *filter_params_x,
    1817             :     InterpFilterParams        *filter_params_y,
    1818             :     int32_t                    subpel_x,
    1819             :     int32_t                    subpel_y,
    1820             :     ConvolveParams            *conv_params,
    1821             :     InterInterCompoundData    *comp_data,
    1822             :     uint8_t                    bitdepth,
    1823             :     uint8_t                    plane
    1824             : )
    1825             : {
    1826             :     // On entry conv_params->dst holds the ref0 prediction produced by the regular path.
    1827             :     // The regular path is used again to generate the ref1 prediction into a temporary buffer,
    1828             :     // and that temporary buffer is then blended with the ref0 prediction.
    1829             : 
    1830             :     DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
    1831             : 
    1832             : #define INTER_PRED_BYTES_PER_PIXEL 2
    1833             :     DECLARE_ALIGNED(32, uint8_t,
    1834             :     tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
    1835             : #undef INTER_PRED_BYTES_PER_PIXEL
    1836             :     //uint8_t *tmp_dst =  tmp_buf;
    1837    39797900 :     const int tmp_buf_stride = MAX_SB_SIZE;
    1838             : 
    1839    39797900 :     CONV_BUF_TYPE *org_dst = conv_params->dst;//save the ref0 prediction pointer
    1840    39797900 :     int org_dst_stride = conv_params->dst_stride;
    1841    39797900 :     CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
    1842    39797900 :     conv_params->dst = tmp_buf16; // NOTE(review): dst/dst_stride are not restored before returning, so conv_params keeps pointing at this stack buffer
    1843    39797900 :     conv_params->dst_stride = tmp_buf_stride;
    1844    39797900 :     assert(conv_params->do_average == 0);
    1845             : 
    1846    39797900 :     if (bitdepth == EB_8BIT)
    1847    39806100 :         convolve[subpel_x != 0][subpel_y != 0][1]( // table indexed by whether x / y are sub-pel; last index fixed at 1 (presumably the compound variant - confirm)
    1848             :             src_ptr,
    1849             :             src_stride,
    1850             :             dst_ptr,
    1851             :             dst_stride,
    1852             :             bwidth,
    1853             :             bheight,
    1854             :             filter_params_x,
    1855             :             filter_params_y,
    1856             :             subpel_x,
    1857             :             subpel_y,
    1858             :             conv_params);
    1859             :     else
    1860           0 :         convolveHbd[subpel_x != 0][subpel_y != 0][1](
    1861             :             (uint16_t *)src_ptr,
    1862             :             src_stride,
    1863             :             (uint16_t *)dst_ptr,
    1864             :             dst_stride,
    1865             :             bwidth,
    1866             :             bheight,
    1867             :             filter_params_x,
    1868             :             filter_params_y,
    1869             :             subpel_x,
    1870             :             subpel_y,
    1871             :             conv_params,
    1872             :             bitdepth);
    1873             : 
    1874    39845300 :     if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
    1875             :         // For DIFFWTD the mask is computed here into the local seg_mask from the two predictions org_dst and tmp_buf16;
    1876             :         // for WEDGE the mask is fixed and comes from the table based on wedge_sign/index.
    1877    26355400 :         av1_build_compound_diffwtd_mask_d16(
    1878    26355400 :             seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
    1879             :             tmp_buf16, tmp_buf_stride, bheight, bwidth, conv_params, bitdepth);
    1880             :     }
    1881             : 
    1882    39850600 :     build_masked_compound_no_round(dst_ptr, dst_stride, org_dst, org_dst_stride, // NOTE(review): for plane != 0 with COMPOUND_DIFFWTD the local seg_mask has not been written in this call - confirm intended
    1883             :         tmp_buf16, tmp_buf_stride, comp_data, seg_mask,
    1884    39850600 :         blk_geom->bsize, bheight, bwidth, conv_params, bitdepth);
    1885             : 
    1886    39864500 : }
    1887             : #if INTER_INTER_HBD
    1888           0 : void av1_make_masked_inter_predictor_hbd( // 16-bit sample variant of the masked compound predictor: always calls convolveHbd, then builds the DIFFWTD mask for plane 0 when required and blends into dst_ptr.
    1889             :     uint16_t                  *src_ptr,
    1890             :     uint32_t                   src_stride,
    1891             :     uint16_t                  *dst_ptr,
    1892             :     uint32_t                   dst_stride,
    1893             :     const BlockGeom           *blk_geom,
    1894             :     uint8_t                    bwidth,
    1895             :     uint8_t                    bheight,
    1896             :     InterpFilterParams        *filter_params_x,
    1897             :     InterpFilterParams        *filter_params_y,
    1898             :     int32_t                    subpel_x,
    1899             :     int32_t                    subpel_y,
    1900             :     ConvolveParams            *conv_params,
    1901             :     InterInterCompoundData    *comp_data,
    1902             :     uint8_t                    bitdepth,
    1903             :     uint8_t                    plane
    1904             : )
    1905             : {
    1906             :     // On entry conv_params->dst holds the ref0 prediction produced by the regular path.
    1907             :     // The regular path is used again to generate the ref1 prediction into a temporary buffer,
    1908             :     // and that temporary buffer is then blended with the ref0 prediction.
    1909             : 
    1910             :     DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
    1911             : 
    1912             : #define INTER_PRED_BYTES_PER_PIXEL 2
    1913             :     DECLARE_ALIGNED(32, uint8_t,
    1914             :     tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
    1915             : #undef INTER_PRED_BYTES_PER_PIXEL
    1916             :     //uint8_t *tmp_dst =  tmp_buf;
    1917           0 :     const int tmp_buf_stride = MAX_SB_SIZE;
    1918             : 
    1919           0 :     CONV_BUF_TYPE *org_dst = conv_params->dst;//save the ref0 prediction pointer
    1920           0 :     int org_dst_stride = conv_params->dst_stride;
    1921           0 :     CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
    1922           0 :     conv_params->dst = tmp_buf16; // NOTE(review): dst/dst_stride are not restored before returning, so conv_params keeps pointing at this stack buffer
    1923           0 :     conv_params->dst_stride = tmp_buf_stride;
    1924           0 :     assert(conv_params->do_average == 0);
    1925             : 
    1926           0 :     convolveHbd[subpel_x != 0][subpel_y != 0][1](
    1927             :         src_ptr,
    1928             :         src_stride,
    1929             :         dst_ptr,
    1930             :         dst_stride,
    1931             :         bwidth,
    1932             :         bheight,
    1933             :         filter_params_x,
    1934             :         filter_params_y,
    1935             :         subpel_x,
    1936             :         subpel_y,
    1937             :         conv_params,
    1938             :         EB_10BIT); // NOTE(review): hard-coded here while the mask and blend calls below use the bitdepth parameter - confirm
    1939             : 
    1940           0 :     if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
    1941             :         // For DIFFWTD the mask is computed here into the local seg_mask from the two predictions org_dst and tmp_buf16;
    1942             :         // for WEDGE the mask is fixed and comes from the table based on wedge_sign/index.
    1943           0 :         av1_build_compound_diffwtd_mask_d16(
    1944           0 :             seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
    1945             :             tmp_buf16, tmp_buf_stride, bheight, bwidth, conv_params, bitdepth);
    1946             :     }
    1947             : 
    1948           0 :     build_masked_compound_no_round((uint8_t *)dst_ptr, dst_stride, org_dst, org_dst_stride, // NOTE(review): for plane != 0 with COMPOUND_DIFFWTD the local seg_mask has not been written in this call - confirm intended
    1949             :         tmp_buf16, tmp_buf_stride, comp_data, seg_mask,
    1950           0 :         blk_geom->bsize, bheight, bwidth, conv_params, bitdepth);
    1951             : 
    1952           0 : }
    1953             : #endif
    1954             : // Warped-motion variant of the masked compound predictor: calls eb_av1_warp_plane with conv_params->dst redirected to a local buffer, builds the DIFFWTD mask for plane 0 when required, then blends into dst_ptr.
    1955           0 : void av1_make_masked_warp_inter_predictor(
    1956             :     uint8_t                   *src_ptr,
    1957             :     uint32_t                   src_stride,
    1958             :     uint16_t                   buf_width,
    1959             :     uint16_t                   buf_height,
    1960             :     uint8_t                   *dst_ptr,
    1961             :     uint32_t                   dst_stride,
    1962             :     const BlockGeom           *blk_geom,
    1963             :     uint8_t                    bwidth,
    1964             :     uint8_t                    bheight,
    1965             :     ConvolveParams            *conv_params,
    1966             :     InterInterCompoundData    *comp_data,
    1967             :     uint8_t                    bitdepth,
    1968             :     uint8_t                    plane,
    1969             :     uint16_t                                pu_origin_x,
    1970             :     uint16_t                                pu_origin_y,
    1971             :     EbWarpedMotionParams                   *wm_params_l1
    1972             : )
    1973             : {
    1974           0 :     EbBool is16bit = (EbBool)(bitdepth > EB_8BIT);
    1975             : 
    1976             :     // On entry conv_params->dst holds the ref0 prediction produced by the regular path.
    1977             :     // The warp path is used to generate the ref1 prediction into a temporary buffer,
    1978             :     // and that temporary buffer is then blended with the ref0 prediction.
    1979             : 
    1980             :     DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
    1981             : 
    1982             : #define INTER_PRED_BYTES_PER_PIXEL 2
    1983             :     DECLARE_ALIGNED(32, uint8_t,
    1984             :     tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
    1985             : #undef INTER_PRED_BYTES_PER_PIXEL
    1986           0 :     uint8_t *tmp_dst =  tmp_buf;
    1987           0 :     const int tmp_buf_stride = MAX_SB_SIZE;
    1988             : 
    1989           0 :     CONV_BUF_TYPE *org_dst = conv_params->dst;//save the ref0 prediction pointer
    1990           0 :     int org_dst_stride = conv_params->dst_stride;
    1991           0 :     CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
    1992           0 :     conv_params->dst = tmp_buf16; // NOTE(review): dst/dst_stride are not restored before returning, so conv_params keeps pointing at this stack buffer
    1993           0 :     conv_params->dst_stride = tmp_buf_stride;
    1994           0 :     assert(conv_params->do_average == 0);
    1995             : 
    1996           0 :     uint8_t ss_x = plane == 0 ? 0 : 1; // subsamplings
    1997           0 :     uint8_t ss_y = plane == 0 ? 0 : 1; // both are 1 for every non-zero plane (presumably assumes 4:2:0 - confirm)
    1998             : 
    1999           0 :     eb_av1_warp_plane(
    2000             :         wm_params_l1,
    2001             :         (int) is16bit,
    2002             :         bitdepth,
    2003             :         src_ptr,
    2004             :         (int)buf_width,
    2005             :         (int)buf_height,
    2006             :         src_stride,
    2007             :         tmp_dst,
    2008             :         pu_origin_x,
    2009             :         pu_origin_y,
    2010             :         bwidth,
    2011             :         bheight,
    2012             :         MAX_SB_SQUARE, // NOTE(review): tmp_buf_stride is MAX_SB_SIZE; confirm which value the callee expects in this position
    2013             :         ss_x, //int subsampling_x,
    2014             :         ss_y, //int subsampling_y,
    2015             :         conv_params);
    2016             : 
    2017           0 :     if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
    2018             :         // For DIFFWTD the mask is computed here into the local seg_mask from the two predictions org_dst and tmp_buf16;
    2019             :         // for WEDGE the mask is fixed and comes from the table based on wedge_sign/index.
    2020           0 :         av1_build_compound_diffwtd_mask_d16(
    2021           0 :             seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
    2022             :             tmp_buf16, tmp_buf_stride, bheight, bwidth, conv_params, bitdepth);
    2023             :     }
    2024             : 
    2025           0 :     build_masked_compound_no_round(dst_ptr, dst_stride, org_dst, org_dst_stride, // NOTE(review): for plane != 0 with COMPOUND_DIFFWTD the local seg_mask has not been written in this call - confirm intended
    2026             :         tmp_buf16, tmp_buf_stride, comp_data, seg_mask,
    2027           0 :         blk_geom->bsize, bheight, bwidth, conv_params, bitdepth);
    2028             : 
    2029           0 : }
    2030             : 
    2031             : // Writes diff[c] = src[c] - pred[c] for every pixel of a rows x cols block; diff, src and pred each advance by their own stride after every row.
    2032           0 : void aom_subtract_block_c(int rows, int cols, int16_t *diff,
    2033             :     ptrdiff_t diff_stride, const uint8_t *src,
    2034             :     ptrdiff_t src_stride, const uint8_t *pred,
    2035             :     ptrdiff_t pred_stride) {
    2036             :     int r, c;
    2037             : 
    2038           0 :     for (r = 0; r < rows; r++) {
    2039           0 :         for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; // uint8_t operands promote to int, so the stored difference may be negative
    2040             : 
    2041           0 :         diff += diff_stride;
    2042           0 :         pred += pred_stride;
    2043           0 :         src += src_stride;
    2044             :     }
    2045           0 : }
    2046             : // Fills mask (row pitch w, h rows) with m = clamp(mask_base + |src0 - src1| / DIFF_FACTOR, 0, AOM_BLEND_A64_MAX_ALPHA), or with AOM_BLEND_A64_MAX_ALPHA - m when which_inverse is nonzero.
    2047           0 : static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base,
    2048             :     const uint8_t *src0, int src0_stride,
    2049             :     const uint8_t *src1, int src1_stride, int h, int w) {
    2050             :     int i, j, m, diff;
    2051           0 :     for (i = 0; i < h; ++i) {
    2052           0 :         for (j = 0; j < w; ++j) {
    2053           0 :             diff =
    2054           0 :                 abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
    2055           0 :             m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
    2056           0 :             mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m; // the mask is stored contiguously: its pitch is w, not a source stride
    2057             :         }
    2058             :     }
    2059           0 : }
    2060             : // Builds the difference-weighted compound mask for mask_type: DIFFWTD_38 uses base 38, DIFFWTD_38_INV uses base 38 with the mask inverted; any other value hits assert(0).
    2061           0 : void av1_build_compound_diffwtd_mask_c(uint8_t *mask,
    2062             :     DIFFWTD_MASK_TYPE mask_type,
    2063             :     const uint8_t *src0, int src0_stride,
    2064             :     const uint8_t *src1, int src1_stride,
    2065             :     int h, int w) {
    2066           0 :     switch (mask_type) {
    2067           0 :     case DIFFWTD_38:
    2068           0 :         diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w);
    2069           0 :         break;
    2070           0 :     case DIFFWTD_38_INV:
    2071           0 :         diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w);
    2072           0 :         break;
    2073           0 :     default: assert(0); // with assertions compiled out an unknown mask_type leaves mask untouched
    2074             :     }
    2075           0 : }
    2076             : #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
    2077             : 
    2078             : /**
    2079             :  * Computes SSE of a compound predictor constructed from 2 fundamental
    2080             :  * predictors p0 and p1 using blending with mask.
    2081             :  *
    2082             :  * r1:  Residuals of p1.
    2083             :  *      (source - p1)
    2084             :  * d:   Difference of p1 and p0.
    2085             :  *      (p1 - p0)
    2086             :  * m:   The blending mask
    2087             :  * N:   Number of pixels (the loop body is skipped entirely when N <= 0)
    2088             :  *
    2089             :  * 'r1', 'd', and 'm' are contiguous.
    2090             :  *
    2091             :  * Computes:
    2092             :  *  Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
    2093             :  *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
    2094             :  *    where r0 is (source - p0), and r1 is (source - p1), which in turn
    2095             :  *    is equivalent to:
    2096             :  *  Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
    2097             :  *    which is the SSE of the residuals of the compound predictor scaled up by
    2098             :  *    MAX_MASK_VALUE**2. The returned value has that scaling removed again via ROUND_POWER_OF_TWO(sum, 2 * WEDGE_WEIGHT_BITS).
    2099             :  *
    2100             :  * Note that we clamp the partial term in the loop to 16 bits signed. This is
    2101             :  * to facilitate equivalent SIMD implementation. It should have no effect if
    2102             :  * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
    2103             :  * holds for 8 bit input, and on real input, it should hold practically always,
    2104             :  * as residuals are expected to be small.
    2105             :  */
    2106           0 : uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
    2107             :     const uint8_t *m, int N) {
    2108           0 :     uint64_t csse = 0;
    2109             :     int i;
    2110             : 
    2111           0 :     for (i = 0; i < N; i++) {
    2112           0 :         int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
    2113           0 :         t = clamp(t, INT16_MIN, INT16_MAX);
    2114           0 :         csse += t * t; // |t| <= 32768 after the clamp, so t * t (<= 2^30) cannot overflow int32_t
    2115             :     }
    2116           0 :     return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); // presumably a rounding right shift - confirm against the macro definition
    2117             : }
    2118             : static const uint8_t bsize_curvfit_model_cat_lookup[BlockSizeS_ALL] = { // indexed by BlockSize; the value (0..3) selects the row of interp_rgrid_curv in av1_model_rd_curvfit
    2119             :   0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
    2120             : };
    2121   310486000 : static int sse_norm_curvfit_model_cat_lookup(double sse_norm) { // returns 1 when sse_norm > 16.0, otherwise 0; the result selects the row of interp_dgrid_curv
    2122   310486000 :     return (sse_norm > 16.0);
    2123             : }
    2124             : static const double interp_rgrid_curv[4][65] = { // rate samples read by av1_model_rd_curvfit: row = category from bsize_curvfit_model_cat_lookup, column = grid index xi
    2125             :   {
    2126             :       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
    2127             :       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
    2128             :       0.000000,    23.801499,   28.387688,   33.388795,   42.298282,
    2129             :       41.525408,   51.597692,   49.566271,   54.632979,   60.321507,
    2130             :       67.730678,   75.766165,   85.324032,   96.600012,   120.839562,
    2131             :       173.917577,  255.974908,  354.107573,  458.063476,  562.345966,
    2132             :       668.568424,  772.072881,  878.598490,  982.202274,  1082.708946,
    2133             :       1188.037853, 1287.702240, 1395.588773, 1490.825830, 1584.231230,
    2134             :       1691.386090, 1766.822555, 1869.630904, 1926.743565, 2002.949495,
    2135             :       2047.431137, 2138.486068, 2154.743767, 2209.242472, 2277.593051,
    2136             :       2290.996432, 2307.452938, 2343.567091, 2397.654644, 2469.425868,
    2137             :       2558.591037, 2664.860422, 2787.944296, 2927.552932, 3083.396602,
    2138             :       3255.185579, 3442.630134, 3645.440541, 3863.327072, 4096.000000,
    2139             :   },
    2140             :   {
    2141             :       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
    2142             :       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
    2143             :       0.000000,    8.998436,    9.439592,    9.731837,    10.865931,
    2144             :       11.561347,   12.578139,   14.205101,   16.770584,   19.094853,
    2145             :       21.330863,   23.298907,   26.901921,   34.501017,   57.891733,
    2146             :       112.234763,  194.853189,  288.302032,  380.499422,  472.625309,
    2147             :       560.226809,  647.928463,  734.155122,  817.489721,  906.265783,
    2148             :       999.260562,  1094.489206, 1197.062998, 1293.296825, 1378.926484,
    2149             :       1472.760990, 1552.663779, 1635.196884, 1692.451951, 1759.741063,
    2150             :       1822.162720, 1916.515921, 1966.686071, 2031.647506, 2033.700134,
    2151             :       2087.847688, 2161.688858, 2242.536028, 2334.023491, 2436.337802,
    2152             :       2549.665519, 2674.193198, 2810.107395, 2957.594666, 3116.841567,
    2153             :       3288.034655, 3471.360486, 3667.005616, 3875.156602, 4096.000000,
    2154             :   },
    2155             :   {
    2156             :       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
    2157             :       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
    2158             :       0.000000,    2.377584,    2.557185,    2.732445,    2.851114,
    2159             :       3.281800,    3.765589,    4.342578,    5.145582,    5.611038,
    2160             :       6.642238,    7.945977,    11.800522,   17.346624,   37.501413,
    2161             :       87.216800,   165.860942,  253.865564,  332.039345,  408.518863,
    2162             :       478.120452,  547.268590,  616.067676,  680.022540,  753.863541,
    2163             :       834.529973,  919.489191,  1008.264989, 1092.230318, 1173.971886,
    2164             :       1249.514122, 1330.510941, 1399.523249, 1466.923387, 1530.533471,
    2165             :       1586.515722, 1695.197774, 1746.648696, 1837.136959, 1909.075485,
    2166             :       1975.074651, 2060.159200, 2155.335095, 2259.762505, 2373.710437,
    2167             :       2497.447898, 2631.243895, 2775.367434, 2930.087523, 3095.673170,
    2168             :       3272.393380, 3460.517161, 3660.313520, 3872.051464, 4096.000000,
    2169             :   },
    2170             :   {
    2171             :       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
    2172             :       0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
    2173             :       0.000000,    0.296997,    0.342545,    0.403097,    0.472889,
    2174             :       0.614483,    0.842937,    1.050824,    1.326663,    1.717750,
    2175             :       2.530591,    3.582302,    6.995373,    9.973335,    24.042464,
    2176             :       56.598240,   113.680735,  180.018689,  231.050567,  266.101082,
    2177             :       294.957934,  323.326511,  349.434429,  380.443211,  408.171987,
    2178             :       441.214916,  475.716772,  512.900000,  551.186939,  592.364455,
    2179             :       624.527378,  661.940693,  679.185473,  724.800679,  764.781792,
    2180             :       873.050019,  950.299001,  939.292954,  1052.406153, 1033.893184,
    2181             :       1112.182406, 1219.174326, 1337.296681, 1471.648357, 1622.492809,
    2182             :       1790.093491, 1974.713858, 2176.617364, 2396.067465, 2633.327614,
    2183             :       2888.661266, 3162.331876, 3454.602899, 3765.737789, 4096.000000,
    2184             :   },
    2185             : };
    2186             : // Distortion-per-SSE samples read by av1_model_rd_curvfit: row = result of sse_norm_curvfit_model_cat_lookup (0 or 1), column = grid index xi. Values fall from 16.0 towards 0.
    2187             : static const double interp_dgrid_curv[2][65] = {
    2188             :   {
    2189             :       16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770,
    2190             :       15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870,
    2191             :       15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387,
    2192             :       13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790,
    2193             :       7.487633,  5.688649,  4.267515,  3.196300,  2.434201,  1.834064,
    2194             :       1.369920,  1.035921,  0.775279,  0.574895,  0.427232,  0.314123,
    2195             :       0.233236,  0.171440,  0.128188,  0.092762,  0.067569,  0.049324,
    2196             :       0.036330,  0.027008,  0.019853,  0.015539,  0.011093,  0.008733,
    2197             :       0.007624,  0.008105,  0.005427,  0.004065,  0.003427,  0.002848,
    2198             :       0.002328,  0.001865,  0.001457,  0.001103,  0.000801,  0.000550,
    2199             :       0.000348,  0.000193,  0.000085,  0.000021,  0.000000,
    2200             :   },
    2201             :   {
    2202             :       16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501,
    2203             :       15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967,
    2204             :       15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212,
    2205             :       13.073692, 12.222005, 11.237799, 9.985848,  8.898823,  7.423519,
    2206             :       5.995325,  4.773152,  3.744032,  2.938217,  2.294526,  1.762412,
    2207             :       1.327145,  1.020728,  0.765535,  0.570548,  0.425833,  0.313825,
    2208             :       0.232959,  0.171324,  0.128174,  0.092750,  0.067558,  0.049319,
    2209             :       0.036330,  0.027008,  0.019853,  0.015539,  0.011093,  0.008733,
    2210             :       0.007624,  0.008105,  0.005427,  0.004065,  0.003427,  0.002848,
    2211             :       0.002328,  0.001865,  0.001457,  0.001103,  0.000801,  0.000550,
    2212             :       0.000348,  0.000193,  0.000085,  0.000021,  -0.000000,
    2213             :   },
    2214             : };
    2215             : 
    2216             : 
    2217             : /*
    2218             :   Precalculation factors for interp_cubic()
    2219             :     interp_cubic() OUT is: p[1] + 0.5 * x * (p[2] - p[0] +
    2220             :                       x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
    2221             :                       x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
    2222             :   Precalculation:
    2223             :     interp_cubic() OUT is: D + x * (C + x * (B + x * A))
    2224             :     For precalculated factors:
    2225             :     double A = 0.5 *(3.0 * (p[1] - p[2]) + p[3] - p[0]);
    2226             :     double B = 0.5 *(2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3]);
    2227             :     double C = 0.5 * (p[2] - p[0]);
    2228             :     double D = p[1];
    2229             : 
    2230             :     Precalculated values of array factors:
    2231             :     A is: (0 to sizeof(ARRAY[])-1)
    2232             :     B is: (0 to sizeof(ARRAY[A][])-4)
    2233             :     PRECALC[A][B][0] = 0.5 *(3.0 * (ARRAY[A][B+1] - ARRAY[A][B+2]) + ARRAY[A][B+3] - ARRAY[A][B])
    2234             :     PRECALC[A][B][1] = 0.5 *(2.0 * ARRAY[A][B] - 5.0 * ARRAY[A][B+1] + 4.0 * ARRAY[A][B+2] - ARRAY[A][B+3]);
    2235             :     PRECALC[A][B][2] = 0.5 * (ARRAY[A][B+2] - ARRAY[A][B]);
    2236             :     PRECALC[A][B][3] = ARRAY[A][B+1]
    2237             : */
    2238             : // Looks up the modelled rate (*rate_f) and distortion-per-SSE (*distbysse_f) for feature xqr: xqr is clamped, mapped to grid index xi = floor((xqr - x_start) / x_step), and sample xi of interp_rgrid_curv[rcat] / interp_dgrid_curv[dcat] is returned as-is (no interpolation between samples).
    2239   310536000 : void av1_model_rd_curvfit(BlockSize bsize, double sse_norm, double xqr,
    2240             :     double *rate_f, double *distbysse_f) {
    2241   310536000 :     const double x_start = -15.5;
    2242   310536000 :     const double x_end = 16.5;
    2243   310536000 :     const double x_step = 0.5;
    2244   310536000 :     const double epsilon = 1e-6;
    2245   310536000 :     const int rcat = bsize_curvfit_model_cat_lookup[bsize]; // rate-table row chosen by block size
    2246   310536000 :     const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm); // distortion-table row: 1 when sse_norm > 16.0
    2247             :     (void)x_end; // x_end is nevertheless used by the AOMMIN clamp below
    2248             : 
    2249   310814000 :     xqr = AOMMAX(xqr, x_start + x_step + epsilon); // lower bound -15 + 1e-6
    2250   310814000 :     xqr = AOMMIN(xqr, x_end - x_step - epsilon); // upper bound 16 - 1e-6
    2251   310814000 :     const double x = (xqr - x_start) / x_step;
    2252   310814000 :     const int xi = (int)floor(x); // with the clamp above xi lies in [1, 62], inside the 65-entry rows
    2253   310814000 :     assert(xi > 0);
    2254             : 
    2255   310814000 :     const double *prate = &interp_rgrid_curv[rcat][(xi - 1)];
    2256   310814000 :     *rate_f = prate[1]; // i.e. interp_rgrid_curv[rcat][xi]
    2257   310814000 :     const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)];
    2258   310814000 :     *distbysse_f = pdist[1]; // i.e. interp_dgrid_curv[dcat][xi]
    2259             : 
    2260   310814000 : }
    2261             : 
    2262             : // Fits a curve for rate and distortion using as feature:
    2263             : // log2(sse_norm/qstep^2). *rate and *dist are optional outputs (either may be NULL); both are set to 0 when sse == 0.
    2264   309433000 : static void model_rd_with_curvfit(
    2265             :     PictureControlSet      *picture_control_set_ptr,
    2266             :     BlockSize plane_bsize,
    2267             :     int64_t sse, int num_samples, int *rate,
    2268             :     int64_t *dist,
    2269             :     uint32_t rdmult
    2270             : )
    2271             : {
    2272             :     (void)plane_bsize; // plane_bsize is nevertheless passed to av1_model_rd_curvfit below
    2273   309433000 :     const int dequant_shift = 3;
    2274             : #if 0
    2275             :     int32_t current_q_index = MAX(0, MIN(QINDEX_RANGE - 1, picture_control_set_ptr->parent_pcs_ptr->frm_hdr.quantization_params.base_q_idx));
    2276             : #else
    2277   309433000 :     int32_t current_q_index = picture_control_set_ptr->parent_pcs_ptr->frm_hdr.quantization_params.base_q_idx; // used unclamped as a table index (the disabled branch above clamps it)
    2278             : #endif
    2279   309433000 :     Dequants *const dequants = &picture_control_set_ptr->parent_pcs_ptr->deq;
    2280   309433000 :     int16_t quantizer = dequants->y_dequant_Q3[current_q_index][1];
    2281             : 
    2282   309433000 :     const int qstep = AOMMAX(quantizer >> dequant_shift, 1); // never below 1, so the division below cannot be by zero
    2283             : 
    2284   309433000 :     if (sse == 0) {
    2285         673 :         if (rate) *rate = 0;
    2286         673 :         if (dist) *dist = 0;
    2287         673 :         return;
    2288             :     }
    2289   309432000 :     aom_clear_system_state();
    2290   309449000 :     const double sse_norm = (double)sse / num_samples;
    2291   309449000 :     const double xqr = (double)LOG2F((uint32_t)sse_norm / (qstep * qstep)); // NOTE(review): sse_norm is truncated and divided in integer arithmetic, so the argument is 0 whenever sse_norm < qstep * qstep - confirm LOG2F accepts 0 and that integer precision is intended
    2292             : 
    2293             :     double rate_f, dist_by_sse_norm_f;
    2294   309495000 :     av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f, &dist_by_sse_norm_f);
    2295             : 
    2296   310500000 :     const double dist_f = dist_by_sse_norm_f * sse_norm;
    2297   310500000 :     int rate_i = (int)((rate_f * num_samples) + 0.5); // per-sample model outputs scaled to the whole block and rounded to nearest
    2298   310500000 :     int64_t dist_i = (int64_t)((dist_f * num_samples) + 0.5);
    2299   310500000 :     aom_clear_system_state();
    2300             : 
    2301             :     // Check if skip is better
    2302   311086000 :     if (rate_i == 0) {
    2303           0 :         dist_i = sse << 4; // zero rate: distortion becomes sse * 16, the same scale as the 16.0 ceiling of interp_dgrid_curv
    2304             :     }
    2305   311086000 :     else if (RDCOST(rdmult, rate_i, dist_i) >= RDCOST(rdmult, 0, sse << 4)) {
    2306   233120000 :         rate_i = 0;
    2307   233120000 :         dist_i = sse << 4;
    2308             :     }
    2309             : 
    2310   311086000 :     if (rate) *rate = rate_i;
    2311   311086000 :     if (dist) *dist = dist_i;
    2312             : }
    2313             : 
    2314             : 
    2315             : /**
    2316             :  * Compute the element-wise difference of the squares of 2 arrays.
    2317             :  *
    2318             :  * d: Output: difference of the squares of the inputs, a**2 - b**2
    2319             :  * a: First input array
    2320             :  * b: Second input array
    2321             :  * N: Number of elements (nothing is written when N <= 0)
    2322             :  *
    2323             :  * 'd', 'a', and 'b' are contiguous.
    2324             :  *
    2325             :  * The result is saturated to signed 16 bits.
    2326             :  */
    2327           0 : void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a,
    2328             :     const int16_t *b, int N) {
    2329             :     int i;
    2330             : 
    2331           0 :     for (i = 0; i < N; i++)
    2332           0 :         d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX); // a[i] and b[i] are read before d[i] is written, so d may be the same array as a or b
    2333           0 : }
    2334             : // Returns the sum of src[i] * src[i] over n int16_t values. The do/while executes before n is tested, so n must be >= 1: with n == 0 the counter wraps and the loop reads far past the buffer.
    2335           0 : uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
    2336           0 :     uint64_t ss = 0;
    2337             :     do {
    2338           0 :         const int16_t v = *src++;
    2339           0 :         ss += v * v; // v * v is at most 2^30 and is evaluated in int before being added to the 64-bit total
    2340           0 :     } while (--n);
    2341             : 
    2342           0 :     return ss;
    2343             : }
    2344             : /**
    2345             :  * Choose the mask sign for a compound predictor.
    2346             :  *
    2347             :  * ds:    Difference of the squares of the residuals.
    2348             :  *        r0**2 - r1**2
    2349             :  * m:     The blending mask
    2350             :  * N:     Number of pixels (must be >= 1: the do/while below runs once before N is tested)
    2351             :  * limit: Pre-computed threshold value.
    2352             :  *        MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
    2353             :  *
    2354             :  * 'ds' and 'm' are contiguous.
    2355             :  *
    2356             :  * Returns true if the negated mask has lower SSE compared to the positive
    2357             :  * mask. Computation is based on:
    2358             :  *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
    2359             :  *                                     >
    2360             :  *                                Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
    2361             :  *
    2362             :  *  which can be simplified to:
    2363             :  *
    2364             :  *  Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
    2365             :  *
    2366             :  *  The right hand side does not depend on the mask, and needs to be passed as
    2367             :  *  the 'limit' parameter.
    2368             :  *
    2369             :  *  After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
    2370             :  *  hand side is simply a scalar product between an int16_t and uint8_t vector.
    2371             :  *
    2372             :  *  Note that for efficiency, ds is stored on 16 bits. Real input residuals
    2373             :  *  being small, this should not cause a noticeable issue.
    2374             :  */
    2375           0 : int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m,
    2376             :     int N, int64_t limit) {
    2377           0 :     int64_t acc = 0;
    2378             : 
    2379             :     do {
    2380           0 :         acc += *ds++ * *m++; // each int16_t * uint8_t product is formed in int, then accumulated in 64 bits
    2381           0 :     } while (--N);
    2382             : 
    2383           0 :     return acc > limit; // 1 or 0
    2384             : }
    2385             : // Searches all 1 << get_wedge_bits_lookup(bsize) wedge indices: for each one the sign comes from av1_wedge_sign_from_residuals, rate/distortion are modelled from the masked SSE, and the index/sign with the lowest RDCOST are stored in *best_wedge_index / *best_wedge_sign.
    2386    11529000 : static void pick_wedge(
    2387             :     PictureControlSet                    *picture_control_set_ptr,
    2388             :     ModeDecisionContext                  *context_ptr,
    2389             :     const BlockSize bsize,
    2390             :     const uint8_t *const p0,
    2391             :     const int16_t *const residual1,
    2392             :     const int16_t *const diff10,
    2393             :     int8_t *const best_wedge_sign,
    2394             :     int8_t *const best_wedge_index)
    2395             : {
    2396             : 
    2397    11529000 :     EbPictureBufferDesc   *src_pic = picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
    2398    11529000 :     uint8_t               *src_buf = src_pic->buffer_y + (context_ptr->cu_origin_x + src_pic->origin_x) + (context_ptr->cu_origin_y + src_pic->origin_y) * src_pic->stride_y; // luma source sample at the block origin
    2399             : 
    2400    11529000 :     const int bw = block_size_wide[bsize];
    2401    11529000 :     const int bh = block_size_high[bsize];
    2402    11529000 :     const int N = bw * bh;
    2403    11529000 :     assert(N >= 64);
    2404             :     int rate;
    2405             :     int64_t dist;
    2406    11529000 :     int64_t rd, best_rd = INT64_MAX;
    2407             :     int8_t wedge_index;
    2408             :     int8_t wedge_sign;
    2409    11529000 :     int8_t wedge_types = (1 << get_wedge_bits_lookup(bsize));
    2410             :     const uint8_t *mask;
    2411             :     uint64_t sse;
    2412    11529100 :     const int bd_round = 0; // with 0 the ROUND_POWER_OF_TWO in the loop is presumably a no-op - confirm against the macro
    2413             :     DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]);  // src - pred0
    2414             : 
    2415    11529100 :     aom_subtract_block(bh, bw, residual0, bw, src_buf/*src->buf*/, src_pic->stride_y/*src->stride*/, p0, bw); // residual0 and p0 are both addressed with row pitch bw
    2416             : 
    2417    11528800 :     int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
    2418    11531700 :         (int64_t)aom_sum_squares_i16(residual1, N)) *
    2419             :         (1 << WEDGE_WEIGHT_BITS) / 2;
    2420    11531600 :     int16_t *ds = residual0; // ds aliases residual0: after the next call the buffer holds delta squares, no longer src - pred0
    2421             : 
    2422    11531600 :     av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
    2423             : 
    2424   195232000 :     for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
    2425   183702000 :         mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize); // sign-0 mask, used only to decide the sign
    2426             : 
    2427   183642000 :         wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
    2428             : 
    2429   183767000 :         mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); // mask for the chosen sign, used for the SSE
    2430   183740000 :         sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
    2431   183212000 :         sse = ROUND_POWER_OF_TWO(sse, bd_round);
    2432             : 
    2433   183212000 :         model_rd_with_curvfit(picture_control_set_ptr, bsize, sse, N, &rate, &dist, context_ptr->full_lambda);
    2434             : 
    2435   183628000 :         rd = RDCOST(context_ptr->full_lambda, rate, dist); // unlike pick_wedge_fixed_sign, no wedge-index cost is added to rate here
    2436             : 
    2437   183628000 :         if (rd < best_rd) {
    2438    45607700 :             *best_wedge_index = wedge_index;
    2439    45607700 :             *best_wedge_sign = wedge_sign;
    2440    45607700 :             best_rd = rd;
    2441             :         }
    2442             :     }
    2443    11529700 : }
    2444             : 
    2445             : extern aom_variance_fn_ptr_t mefn_ptr[BlockSizeS_ALL];
    2446             : // Returns 1 when tl + br > 0, else 0. esq[k][q] is whatever fn_ptr->vf writes through its last argument for predictor k at quadrant offset q (presumably an SSE - confirm against aom_variance_fn_ptr_t); tl + br reduces algebraically to (esq[0][0] - esq[1][0]) + (esq[1][3] - esq[0][3]).
    2447           0 : static int8_t estimate_wedge_sign(
    2448             :     PictureControlSet                    *picture_control_set_ptr,
    2449             :     ModeDecisionContext                  *context_ptr,
    2450             :     const BlockSize bsize,
    2451             :     const uint8_t *pred0,
    2452             :     int stride0,
    2453             :     const uint8_t *pred1,
    2454             :     int stride1)
    2455             : {
    2456             :     static const BlockSize split_qtr[BlockSizeS_ALL] = {
    2457             :         //                            4X4
    2458             :         BLOCK_INVALID,
    2459             :         // 4X8,        8X4,           8X8
    2460             :         BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
    2461             :         // 8X16,       16X8,          16X16
    2462             :         BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
    2463             :         // 16X32,      32X16,         32X32
    2464             :         BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
    2465             :         // 32X64,      64X32,         64X64
    2466             :         BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
    2467             :         // 64x128,     128x64,        128x128
    2468             :         BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
    2469             :         // 4X16,       16X4,          8X32
    2470             :         BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
    2471             :         // 32X8,       16X64,         64X16
    2472             :         BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
    2473             :     };
    2474             : 
    2475           0 :     const int bw = block_size_wide[bsize];
    2476           0 :     const int bh = block_size_high[bsize];
    2477             :     uint32_t esq[2][4];
    2478             :     int64_t tl, br;
    2479             : 
    2480           0 :     const BlockSize f_index = split_qtr[bsize]; // block size with half the width and half the height of bsize
    2481           0 :     assert(f_index != BLOCK_INVALID);
    2482             :     (void)f_index; // NOTE(review): f_index is only asserted on; the vf calls below use mefn_ptr[bsize] at the quadrant offsets - confirm that the full-size function (not mefn_ptr[f_index]) is intended
    2483             : 
    2484           0 :     const aom_variance_fn_ptr_t *fn_ptr = &mefn_ptr[bsize];
    2485           0 :     EbPictureBufferDesc   *src_pic = picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
    2486           0 :     uint8_t               *src_buf = src_pic->buffer_y + (context_ptr->cu_origin_x + src_pic->origin_x) + (context_ptr->cu_origin_y + src_pic->origin_y) * src_pic->stride_y; // luma source sample at the block origin
    2487             : 
    2488           0 :     fn_ptr->vf(src_buf, src_pic->stride_y, pred0, stride0, &esq[0][0]); // pred0: origin
    2489           0 :     fn_ptr->vf(src_buf + bw / 2, src_pic->stride_y, pred0 + bw / 2, stride0, &esq[0][1]); // pred0: +bw/2
    2490           0 :     fn_ptr->vf(src_buf + bh / 2 * src_pic->stride_y, src_pic->stride_y, pred0 + bh / 2 * stride0, stride0, &esq[0][2]); // pred0: +bh/2 rows
    2491           0 :     fn_ptr->vf(src_buf + bh / 2 * src_pic->stride_y + bw / 2, src_pic->stride_y, pred0 + bh / 2 * stride0 + bw / 2, stride0, &esq[0][3]); // pred0: +bh/2 rows, +bw/2
    2492           0 :     fn_ptr->vf(src_buf, src_pic->stride_y, pred1, stride1, &esq[1][0]); // pred1: origin
    2493           0 :     fn_ptr->vf(src_buf + bw / 2, src_pic->stride_y, pred1 + bw / 2, stride1, &esq[1][1]); // pred1: +bw/2
    2494           0 :     fn_ptr->vf(src_buf + bh / 2 * src_pic->stride_y, src_pic->stride_y, pred1 + bh / 2 * stride1, stride0, &esq[1][2]); // NOTE(review): pred1 is passed with stride0 here (the offset and the rows above use stride1) - looks like a copy/paste slip; confirm
    2495           0 :     fn_ptr->vf(src_buf + bh / 2 * src_pic->stride_y + bw / 2, src_pic->stride_y, pred1 + bh / 2 * stride1 + bw / 2, stride0, &esq[1][3]); // NOTE(review): same stride0/stride1 mismatch as the previous call
    2496             : 
    2497           0 :     tl = ((int64_t)esq[0][0] + esq[0][1] + esq[0][2]) -
    2498           0 :         ((int64_t)esq[1][0] + esq[1][1] + esq[1][2]);
    2499           0 :     br = ((int64_t)esq[1][3] + esq[1][1] + esq[1][2]) -
    2500           0 :         ((int64_t)esq[0][3] + esq[0][1] + esq[0][2]);
    2501           0 :     return (tl + br > 0);
    2502             : }
    2503             : // Choose the best wedge index for the specified sign.
                       //
                       // For the given wedge_sign, loops over all 1 << get_wedge_bits_lookup(bsize)
                       // wedge indices. For each index it fetches the mask with
                       // av1_get_contiguous_soft_mask(), derives an SSE from residual1/diff10 with
                       // av1_wedge_sse_from_residuals(), converts it to a modelled rate/distortion
                       // with model_rd_with_curvfit(), and scores it with RDCOST using
                       // context_ptr->full_lambda.
                       //
                       // Outputs: *best_wedge_index receives the index with the smallest cost (it is
                       // only written when a candidate beats the running best); the return value is
                       // that smallest cost. When II_COMP_FLAG is set the function has external
                       // linkage, takes candidate_ptr, and adds the wedge_idx_fac_bits rate of each
                       // index; otherwise it is static and no index rate is added.
    2504             : #if II_COMP_FLAG
    2505     4453380 : int64_t pick_wedge_fixed_sign(
    2506             : #else
    2507             : static int64_t pick_wedge_fixed_sign(
    2508             : #endif
    2509             : #if II_COMP_FLAG
    2510             :     ModeDecisionCandidate        *candidate_ptr,
    2511             : #endif
    2512             :     PictureControlSet                    *picture_control_set_ptr,
    2513             :     ModeDecisionContext                  *context_ptr,
    2514             :     //const AV1_COMP *const cpi,
    2515             :     //const MACROBLOCK *const x,
    2516             :     const BlockSize bsize,
    2517             :     const int16_t *const residual1,
    2518             :     const int16_t *const diff10,
    2519             :     const int8_t wedge_sign,
    2520             :     int8_t *const best_wedge_index) {
    2521             :   //const MACROBLOCKD *const xd = &x->e_mbd;
    2522             : 
    2523     4453380 :   const int bw = block_size_wide[bsize];
    2524     4453380 :   const int bh = block_size_high[bsize];
    2525     4453380 :   const int N = bw * bh;
    2526     4453380 :   assert(N >= 64);
    2527             :   int rate;
    2528             :   int64_t dist;
    2529     4453380 :   int64_t rd, best_rd = INT64_MAX;
    2530             :   int8_t wedge_index;
    2531     4453380 :   int8_t wedge_types = (1 << get_wedge_bits_lookup(bsize));
    2532             :   const uint8_t *mask;
    2533             :   uint64_t sse;
    2534             :   //const int hbd = 0;// is_cur_buf_hbd(xd);
                         // bd_round is fixed at 0 here, so the rounding below shifts by zero bits.
    2535     4453370 :   const int bd_round = 0;//hbd ? (xd->bd - 8) * 2 : 0;
    2536    75587200 :   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
    2537    71130600 :     mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
    2538    71123600 :     sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
    2539    71064600 :     sse = ROUND_POWER_OF_TWO(sse, bd_round);
    2540             : 
    2541    71064600 :     model_rd_with_curvfit(picture_control_set_ptr,bsize, /*0,*/ sse, N,    &rate, &dist, context_ptr->full_lambda);
    2542             :    // model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, &rate, &dist);
    2543             : 
    2544             :    // rate += x->wedge_idx_cost[bsize][wedge_index];
    2545             : #if  II_COMP_FLAG
    2546    71133800 :     rate  += candidate_ptr->md_rate_estimation_ptr->wedge_idx_fac_bits[bsize][wedge_index];
    2547             : #endif
    2548    71133800 :     rd = RDCOST(/*x->rdmult*/context_ptr->full_lambda, rate, dist);
    2549             : 
                           // Strict '<' keeps the lowest index among equal-cost candidates.
    2550    71133800 :     if (rd < best_rd) {
    2551    16137100 :       *best_wedge_index = wedge_index;
    2552    16137100 :       best_rd = rd;
    2553             :     }
    2554             :   }
    2555     4456550 :   return best_rd ;//- RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
    2556             : }
    2557             : // Pick the wedge sign and wedge index for an inter-inter COMPOUND_WEDGE candidate.
                       //
                       // Results are written to interinter_comp->wedge_sign and
                       // interinter_comp->wedge_index. candidate_ptr is unused (cast to void).
                       // p1 is only read on the estimate_wedge_sign() path.
                       //
                       // When parent_pcs_ptr->wedge_mode is 2 or 3 only the sign is estimated, via
                       // estimate_wedge_sign(); otherwise pick_wedge() fills in both sign and index.
                       // NOTE(review): on the wedge_mode 2/3 path wedge_index keeps its initial
                       // value of -1 and that value is stored in interinter_comp - confirm callers
                       // handle it (the in-code note says the fast search is still to be added).
    2558    11531000 : static void  pick_interinter_wedge(
    2559             :     ModeDecisionCandidate                *candidate_ptr,
    2560             :     PictureControlSet                    *picture_control_set_ptr,
    2561             :     ModeDecisionContext                  *context_ptr,
    2562             :     InterInterCompoundData               *interinter_comp,
    2563             :     const BlockSize bsize,
    2564             :     const uint8_t *const p0,
    2565             :     const uint8_t *const p1,
    2566             :     const int16_t *const residual1,
    2567             :     const int16_t *const diff10)
    2568             : {
    2569             :     (void)candidate_ptr;
    2570    11531000 :     const int bw = block_size_wide[bsize];
    2571             :     //int64_t rd;
    2572    11531000 :     int8_t wedge_index = -1;
    2573    11531000 :     int8_t wedge_sign = 0;
    2574             : 
    2575    11531000 :     assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
    2576             :     //TODO: OMK+CHKN to check on FIX_RATE_E_WEDGE
    2577             : 
    2578             :     // Two methods
    2579             :     // Fast search method to be added  OMK
    2580    11530800 :     if (picture_control_set_ptr->parent_pcs_ptr->wedge_mode == 2 || picture_control_set_ptr->parent_pcs_ptr->wedge_mode == 3) {
                               // Both predictions are passed with a stride equal to the block width.
    2581           0 :         wedge_sign = estimate_wedge_sign(picture_control_set_ptr, context_ptr, bsize, p0, bw, p1, bw);
    2582             :     }
    2583             :     else {
    2584    11531000 :          pick_wedge(picture_control_set_ptr, context_ptr,
    2585             :             bsize, p0, residual1, diff10, &wedge_sign,
    2586             :             &wedge_index);
    2587             :     }
    2588             : 
    2589    11529500 :     interinter_comp->wedge_sign = wedge_sign;
    2590    11529500 :     interinter_comp->wedge_index = wedge_index;
    2591             : 
    2592    11529500 : }
    2593             : // Pick the difference-weighted (COMPOUND_DIFFWTD) mask type for a candidate.
                       //
                       // For every mask type in [0, DIFFWTD_MASK_TYPES) a mask is built from the two
                       // predictions p0/p1 (both read with a stride of the block width), an SSE is
                       // derived from residual1/diff10 under that mask, and the SSE is converted to
                       // a modelled rate/distortion cost. The mask type with the smallest cost is
                       // written to interinter_comp->mask_type. The best cost itself is tracked
                       // locally and not returned, and no signalling rate is added to the model.
    2594    19870400 : static void  pick_interinter_seg(
    2595             :     PictureControlSet                    *picture_control_set_ptr,
    2596             :     ModeDecisionContext                  *context_ptr,
    2597             :     InterInterCompoundData               *interinter_comp,
    2598             :     const BlockSize bsize,
    2599             :     const uint8_t *const p0,
    2600             :     const uint8_t *const p1,
    2601             :     const int16_t *const residual1,
    2602             :     const int16_t *const diff10)
    2603             : {
    2604    19870400 :     const int bw = block_size_wide[bsize];
    2605    19870400 :     const int bh = block_size_high[bsize];
    2606    19870400 :     const int N = 1 << num_pels_log2_lookup[bsize];
    2607             :     int rate;
    2608             :     int64_t dist;
    2609             :     DIFFWTD_MASK_TYPE cur_mask_type;
    2610    19870400 :     int64_t best_rd = INT64_MAX;
    2611    19870400 :     DIFFWTD_MASK_TYPE best_mask_type = 0;
    2612             :     DECLARE_ALIGNED(16, uint8_t, seg_mask0[2 * MAX_SB_SQUARE]);
    2613             :     DECLARE_ALIGNED(16, uint8_t, seg_mask1[2 * MAX_SB_SQUARE]);
                           // One scratch mask per mask type; tmp_mask is indexed by cur_mask_type
                           // below, so this assumes DIFFWTD_MASK_TYPES <= 2 - TODO confirm.
    2614    19870400 :     uint8_t *tmp_mask[2] = { seg_mask0, seg_mask1 };
    2615             : 
    2616             :     // try each mask type and its inverse
    2617    59594000 :     for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
    2618             : 
    2619             :         // build mask and inverse
    2620    39718900 :         av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
    2621             :             p0, bw, p1, bw, bh, bw);
    2622             :         // compute rd for mask
    2623    39723700 :         uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, tmp_mask[cur_mask_type], N);
    2624             : 
                               // Rounding shift of zero bits.
    2625    39706300 :         sse = ROUND_POWER_OF_TWO(sse, 0 );
    2626             : 
    2627    39706300 :         model_rd_with_curvfit(picture_control_set_ptr, bsize,  sse, N, &rate, &dist, context_ptr->full_lambda);
    2628             : 
    2629    39723600 :         const int64_t rd0 = RDCOST(context_ptr->full_lambda , rate, dist);
    2630             : 
                               // Strict '<' keeps the earlier mask type on a tie.
    2631    39723600 :         if (rd0 < best_rd) {
    2632    28870000 :             best_mask_type = cur_mask_type;
    2633    28870000 :             best_rd = rd0;
    2634             :         }
    2635             :     }
    2636             : 
    2637    19875200 :     interinter_comp->mask_type = best_mask_type;
    2638             : 
    2639    19875200 : }
    2640             : // Dispatch the inter-inter mask search according to interinter_comp->type.
                       //
                       // COMPOUND_WEDGE   -> pick_interinter_wedge()
                       // COMPOUND_DIFFWTD -> pick_interinter_seg()
                       // Any other type triggers assert(0); when asserts are compiled out the
                       // function returns without touching interinter_comp.
    2641    31397900 : void pick_interinter_mask(
    2642             :     ModeDecisionCandidate                *candidate_ptr,
    2643             :     PictureControlSet                    *picture_control_set_ptr,
    2644             :     ModeDecisionContext                  *context_ptr,
    2645             :     InterInterCompoundData               *interinter_comp,
    2646             :     const BlockSize                      bsize,
    2647             :     const uint8_t                        *const p0,
    2648             :     const uint8_t                        *const p1,
    2649             :     const int16_t                        *const residual1,
    2650             :     const int16_t                        *const diff10)
    2651             : {
    2652             : 
    2653    31397900 :     if (interinter_comp->type == COMPOUND_WEDGE)
    2654    11531000 :         pick_interinter_wedge(candidate_ptr, picture_control_set_ptr, context_ptr, interinter_comp, bsize, p0, p1, residual1, diff10);
    2655    19866900 :     else if (interinter_comp->type == COMPOUND_DIFFWTD)
    2656    19876100 :         pick_interinter_seg(picture_control_set_ptr, context_ptr, interinter_comp, bsize, p0, p1, residual1, diff10);
    2657             :     else
    2658           0 :         assert(0);
    2659             : 
    2660    31401700 : }
    2661             : // Prepare the inputs for, and run, the wedge / diff-weighted mask search of a candidate.
                       //
                       // Steps, all on luma only (the "do chroma" argument is 0):
                       //   1. Predict from list 0 into context_ptr->pred0 and from list 1 into
                       //      context_ptr->pred1 through the regular (non-compound) inter prediction
                       //      path, using a local descriptor whose stride is the block width.
                       //   2. Fill context_ptr->residual1 and context_ptr->diff10 with
                       //      aom_subtract_block() (presumably source - pred1 and pred1 - pred0,
                       //      going by the argument order - confirm against aom_subtract_block).
                       //   3. When wedge_mode is 1 or 3, the candidate type is COMPOUND_DIFFWTD and
                       //      context_ptr->variance_ready is 0, compute context_ptr->prediction_mse
                       //      between pred0 and pred1 and set variance_ready.
                       //   4. Call pick_interinter_mask(), which updates candidate_ptr->interinter_comp.
    2662    31390300 : void search_compound_diff_wedge(
    2663             :     PictureControlSet                    *picture_control_set_ptr,
    2664             :     ModeDecisionContext                  *context_ptr,
    2665             :     ModeDecisionCandidate                *candidate_ptr)
    2666             : {
    2667             : 
    2668             :     //if (*calc_pred_masked_compound)
    2669             :     {
    2670    31390300 :         EbPictureBufferDesc   *src_pic = picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
                               // Luma source sample at the block origin, including the picture's own origin offset.
    2671    31390300 :         uint8_t               *src_buf = src_pic->buffer_y + (context_ptr->cu_origin_x + src_pic->origin_x) + (context_ptr->cu_origin_y + src_pic->origin_y) * src_pic->stride_y;
    2672             : 
    2673    31390300 :         uint32_t  bwidth = context_ptr->blk_geom->bwidth;
    2674    31390300 :         uint32_t  bheight = context_ptr->blk_geom->bheight;
                               // Local output descriptor: only origin_x/origin_y, stride_y and buffer_y are set here.
    2675             :         EbPictureBufferDesc  pred_desc;
    2676    31390300 :         pred_desc.origin_x = pred_desc.origin_y = 0;
    2677    31390300 :         pred_desc.stride_y = bwidth;
    2678             : 
    2679    31390300 :         SequenceControlSet* sequence_control_set_ptr = ((SequenceControlSet*)(picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr));
    2680             :         EbPictureBufferDesc  *ref_pic_list0;
    2681    31390300 :         EbPictureBufferDesc  *ref_pic_list1 = NULL;
    2682             :         Mv mv_0;
    2683             :         Mv mv_1;
    2684    31390300 :         mv_0.x = candidate_ptr->motion_vector_xl0;
    2685    31390300 :         mv_0.y = candidate_ptr->motion_vector_yl0;
    2686    31390300 :         mv_1.x = candidate_ptr->motion_vector_xl1;
    2687    31390300 :         mv_1.y = candidate_ptr->motion_vector_yl1;
    2688             :         MvUnit mv_unit;
    2689    31390300 :         mv_unit.mv[0] = mv_0;
    2690    31390300 :         mv_unit.mv[1] = mv_1;
    2691    31390300 :         int8_t ref_idx_l0 = candidate_ptr->ref_frame_index_l0;
    2692    31390300 :         int8_t ref_idx_l1 = candidate_ptr->ref_frame_index_l1;
    2693             :         MvReferenceFrame rf[2];
    2694    31390300 :         av1_set_ref_frame(rf, candidate_ptr->ref_frame_type);
    2695             :         uint8_t list_idx0, list_idx1;
    2696    31403900 :         list_idx0 = get_list_idx(rf[0]);
                               // With no second reference frame, the second list index falls back to rf[0]'s list.
    2697    31400500 :         if (rf[1] == NONE_FRAME)
    2698           0 :             list_idx1 = get_list_idx(rf[0]);
    2699             :         else
    2700    31400500 :             list_idx1 = get_list_idx(rf[1]);
    2701    31400000 :         assert(list_idx0 < MAX_NUM_OF_REF_PIC_LIST);
    2702    31400000 :         assert(list_idx1 < MAX_NUM_OF_REF_PIC_LIST);
                               // A negative reference index leaves the corresponding reference picture NULL.
    2703    31400000 :         if (ref_idx_l0 >= 0)
    2704    31400300 :             ref_pic_list0 = ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx0][ref_idx_l0]->object_ptr)->reference_picture;
    2705             :         else
    2706           0 :             ref_pic_list0 = (EbPictureBufferDesc*)EB_NULL;
    2707    31400000 :         if (ref_idx_l1 >= 0)
    2708    31400600 :             ref_pic_list1 = ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx1][ref_idx_l1]->object_ptr)->reference_picture;
    2709             :         else
    2710           0 :             ref_pic_list1 = (EbPictureBufferDesc*)EB_NULL;
    2711             : 
    2712             :         //CHKN get separate prediction of each ref(Luma only)
    2713             :         //ref0 prediction
    2714    31400000 :         mv_unit.pred_direction = UNI_PRED_LIST_0;
    2715    31400000 :         pred_desc.buffer_y = context_ptr->pred0;
    2716             : 
    2717             :         //we call the regular inter prediction path here(no compound)
    2718    31400000 :         av1_inter_prediction_function_table[context_ptr->hbd_mode_decision > EB_8_BIT_MD](
    2719             :             picture_control_set_ptr,
    2720             :             0,//fixed interpolation filter for compound search
    2721             :             context_ptr->cu_ptr,
    2722    31400000 :             candidate_ptr->ref_frame_type,
    2723             :             &mv_unit,
    2724             :             0,//use_intrabc,
    2725             : #if OBMC_FLAG
    2726             :             SIMPLE_TRANSLATION,
    2727             :             0,
    2728             :             0,
    2729             : #endif
    2730             :             1,//compound_idx not used
    2731             :             NULL,// interinter_comp not used
    2732             : #if II_COMP_FLAG
    2733             :             NULL,
    2734             :             NULL,
    2735             :             NULL,
    2736             :             NULL,
    2737             :             0,
    2738             :             0,
    2739             :             0,
    2740             :             0,
    2741             : #endif
    2742    31400000 :             context_ptr->cu_origin_x,
    2743    31400000 :             context_ptr->cu_origin_y,
    2744             :             bwidth,
    2745             :             bheight,
    2746             :             ref_pic_list0,
    2747             :             ref_pic_list1,
    2748             :             &pred_desc, //output
    2749             :             0,          //output origin_x,
    2750             :             0,          //output origin_y,
    2751             :             0,//do chroma
    2752    31400000 :             (uint8_t)sequence_control_set_ptr->static_config.encoder_bit_depth);
    2753             : 
    2754             :         //ref1 prediction
    2755    31378500 :         mv_unit.pred_direction = UNI_PRED_LIST_1;
    2756    31378500 :         pred_desc.buffer_y = context_ptr->pred1;
    2757             : 
    2758             :         //we call the regular inter prediction path here(no compound)
    2759    31378500 :         av1_inter_prediction_function_table[context_ptr->hbd_mode_decision > EB_8_BIT_MD](
    2760             :             picture_control_set_ptr,
    2761             :             0,//fixed interpolation filter for compound search
    2762             :             context_ptr->cu_ptr,
    2763    31378500 :             candidate_ptr->ref_frame_type,
    2764             :             &mv_unit,
    2765             :             0,//use_intrabc,
    2766             : #if OBMC_FLAG
    2767             :             SIMPLE_TRANSLATION,
    2768             :             0,
    2769             :             0,
    2770             : #endif
    2771             :             1,//compound_idx not used
    2772             :             NULL,// interinter_comp not used
    2773             : #if II_COMP_FLAG
    2774             :             NULL,
    2775             :             NULL,
    2776             :             NULL,
    2777             :             NULL,
    2778             :             0,
    2779             :             0,
    2780             :             0,
    2781             :             0,
    2782             : #endif
    2783    31378500 :             context_ptr->cu_origin_x,
    2784    31378500 :             context_ptr->cu_origin_y,
    2785             :             bwidth,
    2786             :             bheight,
    2787             :             ref_pic_list0,
    2788             :             ref_pic_list1,
    2789             :             &pred_desc, //output
    2790             :             0,          //output origin_x,
    2791             :             0,          //output origin_y,
    2792             :             0,//do chroma
    2793    31378500 :             (uint8_t)sequence_control_set_ptr->static_config.encoder_bit_depth);
    2794             : 
                               // residual1 is built from the source block and pred1; diff10 from pred1 and pred0.
                               // All intermediate buffers use a stride equal to the block width.
    2795    31377100 :         aom_subtract_block(bheight, bwidth, context_ptr->residual1, bwidth, src_buf, src_pic->stride_y, context_ptr->pred1, bwidth);
    2796    31389400 :         aom_subtract_block(bheight, bwidth, context_ptr->diff10, bwidth, context_ptr->pred1, bwidth, context_ptr->pred0, bwidth);
    2797             : 
    2798             :         //*calc_pred_masked_compound = 0;
                               // Computed at most once per variance_ready reset: the SSE between the two
                               // predictions, rounded down by the block's log2 pixel count.
    2799    31395900 :         if (picture_control_set_ptr->parent_pcs_ptr->wedge_mode == 1 || picture_control_set_ptr->parent_pcs_ptr->wedge_mode == 3)
    2800           0 :             if (candidate_ptr->interinter_comp.type == COMPOUND_DIFFWTD && context_ptr->variance_ready == 0) {
    2801           0 :                 const aom_variance_fn_ptr_t *fn_ptr = &mefn_ptr[context_ptr->blk_geom->bsize];
    2802             : 
    2803             :                 unsigned int sse;
                                       // Only the sse output is used; the returned variance is discarded.
    2804           0 :                 (void)fn_ptr->vf(context_ptr->pred0, bwidth, context_ptr->pred1, pred_desc.stride_y, &sse);
    2805             : 
    2806           0 :                context_ptr->prediction_mse = ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[context_ptr->blk_geom->bsize]);
    2807           0 :                 context_ptr->variance_ready = 1;
    2808             :             }
    2809             : 
    2810             :     }
    2811    31395900 :     pick_interinter_mask(
    2812             :         candidate_ptr,
    2813             :         picture_control_set_ptr,
    2814             :         context_ptr,
    2815             :         &candidate_ptr->interinter_comp,
    2816    31395900 :         context_ptr->blk_geom->bsize,
    2817    31395900 :         context_ptr->pred0,
    2818    31395900 :         context_ptr->pred1,
    2819    31395900 :         context_ptr->residual1,
    2820    31395900 :         context_ptr->diff10);
    2821    31386300 : }
    2822             : // Plain C sum of squared differences between two 8-bit blocks.
                       //
                       // a/b point at the top-left sample of each block; a_stride/b_stride are the
                       // distances between consecutive rows. Returns the sum over width x height
                       // samples of (a - b)^2, accumulated in a 64-bit integer.
    2823           0 : int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
    2824             :     int b_stride, int width, int height) {
    2825             :     int y, x;
    2826           0 :     int64_t sse = 0;
    2827             : 
    2828           0 :     for (y = 0; y < height; y++) {
    2829           0 :         for (x = 0; x < width; x++) {
                                   // The absolute value is taken before squaring; the square is the same either way.
    2830           0 :             const int32_t diff = abs(a[x] - b[x]);
    2831           0 :             sse += diff * diff;
    2832             :         }
    2833             : 
    2834           0 :         a += a_stride;
    2835           0 :         b += b_stride;
    2836             :     }
    2837           0 :     return sse;
    2838             : }
    2839             : // Model the rate and distortion of a prediction from its SSE against the source.
                       //
                       // For each plane in [plane_from, plane_to] the SSE between src_buf and pred_buf
                       // is computed over bw x bh samples and passed to model_rd_with_curvfit() with
                       // context_ptr->full_lambda. Rates and distortions are summed over the planes.
                       //
                       // Outputs:
                       //   *out_rate_sum, *out_dist_sum  - always written.
                       //   *skip_txfm_sb                 - set to (total SSE == 0) when non-NULL.
                       //   *skip_sse_sb                  - set to total SSE << 4 when non-NULL.
                       //   plane_rate/plane_sse/plane_dist - per-plane values, written when non-NULL.
                       // mi_row and mi_col are unused. With II_COMP_FLAG the function has external
                       // linkage, otherwise it is static.
                       // NOTE(review): the same src_buf/pred_buf/bw/bh are used for every plane in
                       // the loop; only plane_bsize changes with the plane. The visible caller
                       // requests plane 0 only - confirm before using a wider plane range.
    2840             : #if II_COMP_FLAG
    2841    17810900 : void model_rd_for_sb_with_curvfit(
    2842             : #else
    2843             : static void model_rd_for_sb_with_curvfit(
    2844             : #endif
    2845             :     PictureControlSet      *picture_control_set_ptr,
    2846             :     ModeDecisionContext                  *context_ptr,
    2847             :     BlockSize bsize, int bw, int bh,
    2848             :     uint8_t* src_buf, uint32_t src_stride, uint8_t* pred_buf, uint32_t pred_stride,
    2849             :     int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
    2850             :     int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
    2851             :     int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
    2852             :     (void)mi_row;
    2853             :     (void)mi_col;
    2854             :     // Note our transform coeffs are 8 times an orthogonal transform.
    2855             :     // Hence quantizer step is also 8 times. To get effective quantizer
    2856             :     // we need to divide by 8 before sending to modeling function.
    2857             : 
    2858    17810900 :     int64_t rate_sum = 0;
    2859    17810900 :     int64_t dist_sum = 0;
    2860    17810900 :     int64_t total_sse = 0;
    2861             : 
    2862    35622000 :     for (int plane = plane_from; plane <= plane_to; ++plane) {
                               // Plane 0 is treated as not subsampled; every other plane as subsampled by 1 in both directions.
    2863    17810800 :         int32_t subsampling = plane == 0 ? 0 : 1;
    2864             :         const BlockSize plane_bsize =
    2865    17810800 :             get_plane_block_size(bsize, subsampling, subsampling);
    2866             :         int64_t dist, sse;
    2867             :         int rate;
                               // shift is fixed at 0, so the rounding below shifts by zero bits.
    2868    17809700 :         const int shift = 0;
    2869    17809700 :         sse = aom_sse(src_buf, src_stride, pred_buf, pred_stride, bw, bh);
    2870    17810400 :         sse = ROUND_POWER_OF_TWO(sse, shift * 2);
    2871    17810400 :         model_rd_with_curvfit(picture_control_set_ptr , plane_bsize, sse, bw * bh, &rate, &dist, context_ptr->full_lambda);
    2872             : 
    2873    17811100 :         total_sse += sse;
    2874    17811100 :         rate_sum += rate;
    2875    17811100 :         dist_sum += dist;
    2876             : 
    2877    17811100 :         if (plane_rate) plane_rate[plane] = rate;
    2878    17811100 :         if (plane_sse) plane_sse[plane] = sse;
    2879    17811100 :         if (plane_dist) plane_dist[plane] = dist;
    2880             :     }
    2881             : 
    2882    17811300 :     if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
    2883    17811300 :     if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
                           // The 64-bit rate sum is narrowed to int for the caller.
    2884    17811300 :     *out_rate_sum = (int)rate_sum;
    2885    17811300 :     *out_dist_sum = dist_sum;
    2886    17811300 : }
    2887             : // Forward declaration; the definition is not in this part of the file.
    2888             : int get_comp_index_context_enc(
    2889             :     PictureParentControlSet   *pcs_ptr,
    2890             :     int cur_frame_index,
    2891             :     int bck_frame_index,
    2892             :     int fwd_frame_index,
    2893             :     const MacroBlockD *xd);
                       // Choose between COMPOUND_AVERAGE and COMPOUND_DISTWTD for a bi-pred candidate.
                       //
                       // For each of the two compound types the candidate's interinter_comp.type,
                       // comp_group_idx and compound_idx are set temporarily, a luma BI_PRED
                       // prediction is generated into context_ptr->pred0, and a cost is estimated
                       // with model_rd_for_sb_with_curvfit() plus the comp_idx_fac_bits rate for the
                       // comp-index context. The cheaper type is then written back to the candidate;
                       // a tie selects COMPOUND_AVERAGE.
                       //
                       // Side effects besides the candidate fields: the block's mbmi ref_frame[0..1]
                       // are overwritten from candidate_ptr->ref_frame_type, and context_ptr->pred0
                       // holds the last prediction generated.
    2894           0 : void search_compound_avg_dist(
    2895             :     PictureControlSet                    *picture_control_set_ptr,
    2896             :     ModeDecisionContext                  *context_ptr,
    2897             :     ModeDecisionCandidate                *candidate_ptr)
    2898             : {
                           // Indexed directly by the compound type below - assumes COMPOUND_AVERAGE and
                           // COMPOUND_DISTWTD are the enum values 0 and 1 (TODO confirm).
    2899             :     int64_t est_rd[2];
    2900             : 
    2901           0 :     MbModeInfo *const mbmi = &context_ptr->cu_ptr->av1xd->mi[0]->mbmi;
    2902             :     MvReferenceFrame rf[2];
    2903           0 :     av1_set_ref_frame(rf, candidate_ptr->ref_frame_type);
    2904           0 :     mbmi->block_mi.ref_frame[0] = rf[0];
    2905           0 :     mbmi->block_mi.ref_frame[1] = rf[1];
    2906           0 :     const int comp_index_ctx = get_comp_index_context_enc(
    2907           0 :         picture_control_set_ptr->parent_pcs_ptr,
    2908           0 :         picture_control_set_ptr->parent_pcs_ptr->cur_order_hint,
    2909           0 :         picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[0] - 1],
    2910           0 :         picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[1] - 1],
    2911           0 :         context_ptr->cu_ptr->av1xd);
    2912             : 
    2913             :     //COMPOUND AVERAGE
    2914             :     COMPOUND_TYPE  comp_i;
    2915             : 
    2916           0 :     for (comp_i = COMPOUND_AVERAGE; comp_i <= COMPOUND_DISTWTD; comp_i++)
    2917             :     {
    2918             :         //assign compound type temporary for RD test
    2919           0 :         candidate_ptr->interinter_comp.type = comp_i;
    2920           0 :         candidate_ptr->comp_group_idx = 0;
    2921           0 :         candidate_ptr->compound_idx = (comp_i == COMPOUND_AVERAGE) ? 1 : 0;
    2922             : 
    2923           0 :         EbPictureBufferDesc   *src_pic = picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
    2924           0 :         uint8_t               *src_buf = src_pic->buffer_y + (context_ptr->cu_origin_x + src_pic->origin_x) + (context_ptr->cu_origin_y + src_pic->origin_y) * src_pic->stride_y;
    2925             : 
    2926           0 :         uint32_t  bwidth = context_ptr->blk_geom->bwidth;
    2927           0 :         uint32_t  bheight = context_ptr->blk_geom->bheight;
                               // Local output descriptor writing into context_ptr->pred0 with a stride of the block width.
    2928             :         EbPictureBufferDesc  pred_desc;
    2929           0 :         pred_desc.origin_x = pred_desc.origin_y = 0;
    2930           0 :         pred_desc.stride_y = bwidth;
    2931           0 :         pred_desc.buffer_y = context_ptr->pred0;
    2932             : 
    2933           0 :         SequenceControlSet* sequence_control_set_ptr = ((SequenceControlSet*)(picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr));
    2934             :         EbPictureBufferDesc  *ref_pic_list0;
    2935           0 :         EbPictureBufferDesc  *ref_pic_list1 = NULL;
    2936             :         Mv mv_0;
    2937             :         Mv mv_1;
    2938           0 :         mv_0.x = candidate_ptr->motion_vector_xl0;
    2939           0 :         mv_0.y = candidate_ptr->motion_vector_yl0;
    2940           0 :         mv_1.x = candidate_ptr->motion_vector_xl1;
    2941           0 :         mv_1.y = candidate_ptr->motion_vector_yl1;
    2942             :         MvUnit mv_unit;
    2943           0 :         mv_unit.mv[0] = mv_0;
    2944           0 :         mv_unit.mv[1] = mv_1;
    2945           0 :         mv_unit.pred_direction = BI_PRED;
    2946           0 :         int8_t ref_idx_l0 = candidate_ptr->ref_frame_index_l0;
    2947           0 :         int8_t ref_idx_l1 = candidate_ptr->ref_frame_index_l1;
                               // NOTE(review): this rf shadows the function-scope rf above; both are filled
                               // from the same candidate_ptr->ref_frame_type.
    2948             :         MvReferenceFrame rf[2];
    2949           0 :         av1_set_ref_frame(rf, candidate_ptr->ref_frame_type);
    2950             :         uint8_t list_idx0, list_idx1;
    2951           0 :         list_idx0 = get_list_idx(rf[0]);
    2952           0 :         if (rf[1] == NONE_FRAME)
    2953           0 :             list_idx1 = get_list_idx(rf[0]);
    2954             :         else
    2955           0 :             list_idx1 = get_list_idx(rf[1]);
    2956           0 :         assert(list_idx0 < MAX_NUM_OF_REF_PIC_LIST);
    2957           0 :         assert(list_idx1 < MAX_NUM_OF_REF_PIC_LIST);
                               // A negative reference index leaves the corresponding reference picture NULL.
    2958           0 :         if (ref_idx_l0 >= 0)
    2959           0 :             ref_pic_list0 = ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx0][ref_idx_l0]->object_ptr)->reference_picture;
    2960             :         else
    2961           0 :             ref_pic_list0 = (EbPictureBufferDesc*)EB_NULL;
    2962           0 :         if (ref_idx_l1 >= 0)
    2963           0 :             ref_pic_list1 = ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx1][ref_idx_l1]->object_ptr)->reference_picture;
    2964             :         else
    2965           0 :             ref_pic_list1 = (EbPictureBufferDesc*)EB_NULL;
    2966             : 
    2967             : 
    2968           0 :         av1_inter_prediction_function_table[context_ptr->hbd_mode_decision > EB_8_BIT_MD](
    2969             :             picture_control_set_ptr,
    2970             :             0,//fixed interpolation filter for compound search
    2971             :             context_ptr->cu_ptr,
    2972           0 :             candidate_ptr->ref_frame_type,
    2973             :             &mv_unit,
    2974             :             0,//use_intrabc,
    2975             : #if OBMC_FLAG
    2976             :             SIMPLE_TRANSLATION,
    2977             :             0,
    2978             :             0,
    2979             : #endif
    2980           0 :             candidate_ptr->compound_idx,
    2981             :             &candidate_ptr->interinter_comp,
    2982             : #if II_COMP_FLAG
    2983             :             NULL,
    2984             :             NULL,
    2985             :             NULL,
    2986             :             NULL,
    2987             :             0,
    2988             :             0,
    2989             :             0,
    2990             :             0,
    2991             : #endif
    2992           0 :             context_ptr->cu_origin_x,
    2993           0 :             context_ptr->cu_origin_y,
    2994             :             bwidth,
    2995             :             bheight,
    2996             :             ref_pic_list0,
    2997             :             ref_pic_list1,
    2998             :             &pred_desc, //output
    2999             :             0,          //output origin_x,
    3000             :             0,          //output origin_y,
    3001             :             0,//do chroma
    3002           0 :             (uint8_t)sequence_control_set_ptr->static_config.encoder_bit_depth);
    3003             : 
    3004             :         int32_t est_rate;
    3005             :         int64_t est_dist;
    3006             : 
                               // Luma only (plane range 0..0); the optional skip/per-plane outputs are not requested.
    3007           0 :         model_rd_for_sb_with_curvfit(picture_control_set_ptr , context_ptr, context_ptr->blk_geom->bsize, bwidth, bheight,
    3008           0 :             src_buf, src_pic->stride_y, pred_desc.buffer_y, pred_desc.stride_y,
    3009             :              0, 0, 0, 0, &est_rate,
    3010             :             &est_dist, NULL, NULL, NULL, NULL, NULL);
    3011             : 
    3012           0 :         est_rate += candidate_ptr->md_rate_estimation_ptr->comp_idx_fac_bits[comp_index_ctx][candidate_ptr->compound_idx];
    3013             : 
    3014           0 :         est_rd[comp_i] =
    3015           0 :             RDCOST(context_ptr->full_lambda , est_rate, est_dist);
    3016             :     }
    3017             : 
    3018             :     //assign the best compound type
    3019           0 :     if (est_rd[COMPOUND_AVERAGE] <= est_rd[COMPOUND_DISTWTD]) {
    3020           0 :         candidate_ptr->interinter_comp.type = COMPOUND_AVERAGE;
    3021           0 :         candidate_ptr->comp_group_idx = 0;
    3022           0 :         candidate_ptr->compound_idx = 1;
    3023             :     }
    3024             :     else {
    3025           0 :         candidate_ptr->interinter_comp.type = COMPOUND_DISTWTD;
    3026           0 :         candidate_ptr->comp_group_idx = 0;
    3027           0 :         candidate_ptr->compound_idx = 0;
    3028             :     }
    3029             : 
    3030           0 : }
    3031             : // Blend an intra prediction with an inter prediction into comppred (II_COMP_FLAG builds only).
                       //
                       // use_wedge_interintra != 0: when is_interintra_wedge_used(bsize) holds, the
                       //   blend uses the wedge mask for (wedge_index, wedge_sign, bsize); otherwise
                       //   nothing is written to comppred. The function returns in both cases.
                       // use_wedge_interintra == 0: a smooth mask is built for `mode` at the plane
                       //   block size and used for the blend.
                       // In both blends intrapred is passed as the first source and interpred as the
                       // second. The blended block is block_size_wide/high[plane_bsize] in size.
    3032             : #if II_COMP_FLAG
    3033    29671400 :  void combine_interintra(INTERINTRA_MODE mode,
    3034             :     int8_t use_wedge_interintra, int wedge_index,
    3035             :     int wedge_sign, BlockSize bsize,
    3036             :     BlockSize plane_bsize, uint8_t *comppred,
    3037             :     int compstride, const uint8_t *interpred,
    3038             :     int interstride, const uint8_t *intrapred,
    3039             :     int intrastride)
    3040             : {
    3041    29671400 :     const int bw = block_size_wide[plane_bsize];
    3042    29671400 :     const int bh = block_size_high[plane_bsize];
    3043             : 
    3044    29671400 :     if (use_wedge_interintra) {
    3045     5566590 :         if (is_interintra_wedge_used(bsize)) {
    3046             :             const uint8_t *mask =
    3047     5566560 :                 av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
                                   // subw/subh are 1 when the plane block dimension equals twice the
                                   // mi_size_wide/high entry of bsize; they are passed on to the blend.
    3048     5566550 :             const int subw = 2 * mi_size_wide[bsize] == bw;
    3049     5566550 :             const int subh = 2 * mi_size_high[bsize] == bh;
                                   // The mask stride is the width of the full block (bsize), not of the plane block.
    3050     5566550 :             aom_blend_a64_mask(comppred, compstride, intrapred, intrastride,
    3051     5566550 :                 interpred, interstride, mask, block_size_wide[bsize],
    3052             :                 bw, bh, subw, subh);
    3053             :         }
    3054     5566740 :         return;
    3055             :     }
    3056             :     else {
                               // Scratch mask on the stack, built at plane resolution with stride bw.
    3057             :         uint8_t mask[MAX_SB_SQUARE];
    3058    24104800 :         build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
    3059    24111400 :         aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
    3060             :             interstride, mask, bw, bw, bh, 0, 0);
    3061             :     }
    3062             : }
    3063             : #endif
    3064             : #if II_COMP_FLAG
    // Prototype of the intra block predictor whose neighbor arrays are
    // uint8_t. It is only declared here (extern); the definition is not in
    // this part of the file. The palette_info parameter exists only when
    // PAL_SUP is enabled, so callers must be built with the same setting.
    3065             :  extern void eb_av1_predict_intra_block(
    3066             :     TileInfo * tile,
    3067             :     STAGE       stage,
    3068             :     const BlockGeom            * blk_geom,
    3069             :     const Av1Common *cm,
    3070             :     int32_t wpx,
    3071             :     int32_t hpx,
    3072             :     TxSize tx_size,
    3073             :     PredictionMode mode,
    3074             :     int32_t angle_delta,
    3075             :     int32_t use_palette,
    3076             : #if PAL_SUP
    3077             :      PaletteInfo  *palette_info,
    3078             : #endif
    3079             :     FilterIntraMode filter_intra_mode,
    3080             :     uint8_t* topNeighArray,
    3081             :     uint8_t* leftNeighArray,
    3082             :     EbPictureBufferDesc  *recon_buffer,
    3083             :     int32_t col_off,
    3084             :     int32_t row_off,
    3085             :     int32_t plane,
    3086             :     BlockSize bsize,
    3087             :     uint32_t tu_org_x_pict,
    3088             :     uint32_t tu_org_y_pict,
    3089             :     uint32_t bl_org_x_pict,
    3090             :     uint32_t bl_org_y_pict,
    3091             :     uint32_t bl_org_x_mb,
    3092             :     uint32_t bl_org_y_mb);
    3093             :  #if INTER_INTRA_HBD
    // Prototype of the intra block predictor whose neighbor arrays are
    // uint16_t; the parameter list otherwise matches
    // eb_av1_predict_intra_block. Declared only when INTER_INTRA_HBD is
    // enabled; palette_info exists only when PAL_SUP is enabled.
    3094             : extern void eb_av1_predict_intra_block_16bit(
    3095             :     TileInfo * tile,
    3096             :     STAGE       stage,
    3097             :     const BlockGeom * blk_geom,
    3098             :     const Av1Common *cm,
    3099             :     int32_t wpx,
    3100             :     int32_t hpx,
    3101             :     TxSize tx_size,
    3102             :     PredictionMode mode,
    3103             :     int32_t angle_delta,
    3104             :     int32_t use_palette,
    3105             : #if PAL_SUP
    3106             :     PaletteInfo  *palette_info,
    3107             : #endif
    3108             :     FilterIntraMode filter_intra_mode,
    3109             :     uint16_t* topNeighArray,
    3110             :     uint16_t* leftNeighArray,
    3111             :     EbPictureBufferDesc  *recon_buffer,
    3112             :     int32_t col_off,
    3113             :     int32_t row_off,
    3114             :     int32_t plane,
    3115             :     BlockSize bsize,
    3116             :     uint32_t tu_org_x_pict,
    3117             :     uint32_t tu_org_y_pict,
    3118             :     uint32_t bl_org_x_pict,
    3119             :     uint32_t bl_org_y_pict,
    3120             :     uint32_t bl_org_x_mb,
    3121             :     uint32_t bl_org_y_mb);
    3122             : #endif
    // Fixed value 0, defined only when II_COMP_FLAG is enabled.
    // NOTE(review): presumably the wedge sign used for inter-intra wedge
    // masks; no use is visible in this part of the file - confirm.
    3123             :  #define INTERINTRA_WEDGE_SIGN 0
    3124             : #endif
    3125             : #if OBMC_FLAG
    3126             : 
    // Context structure with the same members as struct
    // build_prediction_ctxt, except that tmp_buf is uint16_t ** instead of
    // uint8_t **.
    // NOTE(review): presumably meant for the 16-bit OBMC path; no user of
    // this struct is visible in this part of the file - confirm.
    3127             : struct build_prediction_hbd_ctxt {
    3128             :     const AV1_COMMON *cm;
    3129             :     int mi_row;
    3130             :     int mi_col;
    3131             :     uint16_t **tmp_buf;
    3132             :     int *tmp_width;
    3133             :     int *tmp_height;
    3134             :     int *tmp_stride;
    3135             :     int mb_to_far_edge;
    3136             : 
    3137             :     PictureControlSet                    *picture_control_set_ptr;
    3138             :     MvUnit                                mv_unit         ;
    3139             :     uint16_t                              pu_origin_x     ;
    3140             :     uint16_t                              pu_origin_y     ;
    3141             :     EbPictureBufferDesc                  *ref_pic_list0   ;
    3142             :     EbPictureBufferDesc                   prediction_ptr  ;
    3143             :     uint16_t                              dst_origin_x    ;
    3144             :     uint16_t                              dst_origin_y    ;
    3145             :     EbBool                                perform_chroma  ;
    3146             : 
    3147             : 
    3148             : };
    3149             : 
    // Context handed to the av1_setup_build_prediction_by_{above,left}_pred
    // helpers while building neighbor predictions.
    //   mi_row / mi_col  - position of the current block; the helpers add the
    //                      neighbor's relative offset to these.
    //   mb_to_far_edge   - base value the helpers use when recomputing
    //                      xd->mb_to_right_edge / xd->mb_to_bottom_edge.
    //   picture_control_set_ptr - source of ref_pic_ptr_array, from which the
    //                      helpers pick the neighbor's reference picture.
    //   mv_unit, ref_pic_list0 - overwritten by the helpers with the
    //                      neighbor's list-0 motion vector and reference.
    // The remaining members are not touched by the code visible here.
    3150             : struct build_prediction_ctxt {
    3151             :     const AV1_COMMON *cm;
    3152             :     int mi_row;
    3153             :     int mi_col;
    3154             :     uint8_t **tmp_buf;
    3155             :     int *tmp_width;
    3156             :     int *tmp_height;
    3157             :     int *tmp_stride;
    3158             :     int mb_to_far_edge;
    3159             : 
    3160             :         PictureControlSet                    *picture_control_set_ptr;
    3161             :         MvUnit                                mv_unit         ;
    3162             :         uint16_t                              pu_origin_x     ;
    3163             :         uint16_t                              pu_origin_y     ;
    3164             :         EbPictureBufferDesc                  *ref_pic_list0   ;
    3165             :         EbPictureBufferDesc                   prediction_ptr  ;
    3166             :         uint16_t                              dst_origin_x    ;
    3167             :         uint16_t                              dst_origin_y    ;
    3168             :         EbBool                                perform_chroma  ;
    3169             : 
    3170             : 
    3171             : };
    // Six-entry lookup table.
    // NOTE(review): presumably the maximum number of neighbors visited per
    // side (the nb_max argument of the foreach_overlappable_nb_* helpers);
    // no use is visible in this part of the file - confirm.
    3172             : // input: log2 of length, 0(4), 1(8), ...
    3173             : static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 };
    3174             : 
    3175             : 
    // Callback type invoked by foreach_overlappable_nb_above/left once per
    // overlappable neighbor.
    //   is16bit    - forwarded unchanged from the iterator's caller.
    //   xd         - the current block's MacroBlockD.
    //   rel_mi_pos - neighbor position minus the current block position
    //                (column offset for above, row offset for left).
    //   nb_mi_size - AOMMIN of the current block's n4_w/n4_h and the
    //                neighbor step computed by the iterator.
    //   nb_mi      - the neighbor's MbModeInfo.
    //   fun_ctxt   - opaque pointer forwarded from the iterator's caller.
    //   num_planes - the iterators always pass 2.
    3176             : typedef void(*overlappable_nb_visitor_t)(
    3177             :     uint8_t is16bit,
    3178             :     MacroBlockD *xd,
    3179             :     int rel_mi_pos,
    3180             :     uint8_t nb_mi_size,
    3181             :     MbModeInfo *nb_mi,
    3182             :     void *fun_ctxt,
    3183             :     const int num_planes);
    3184             : 
    // Walks the mode-info row directly above the current block and calls fun
    // for every neighbor accepted by is_neighbor_overlappable().
    //   is16bit, fun_ctxt - forwarded unchanged to fun.
    //   cm      - only cm->mi_cols is read, to clip the walk at the frame edge.
    //   xd      - supplies up_available, mi, mi_stride and n4_w.
    //   mi_col  - column of the current block in mode-info units.
    //   nb_max  - the walk stops once this many neighbors have been visited.
    // Does nothing when xd->up_available is zero.
    3185    13860500 : static INLINE void foreach_overlappable_nb_above(
    3186             :     uint8_t is16bit ,
    3187             :     const AV1_COMMON *cm,
    3188             :     MacroBlockD *xd,
    3189             :     int mi_col,
    3190             :     int nb_max,
    3191             :     overlappable_nb_visitor_t fun,
    3192             :     void *fun_ctxt) {
    // num_planes is hard-coded to 2 and only forwarded to fun.
    3193    13860500 :     const int num_planes = 2;
    3194    13860500 :     if (!xd->up_available) return;
    3195             : 
    3196    13141500 :     int nb_count = 0;
    3197             : 
    3198             :     // prev_row_mi points into the mi array, starting at the beginning of the
    3199             :     // previous row.
    3200    13141500 :     ModeInfo **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
    3201    13141500 :     const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols);
    // mi_step is assigned at the top of every iteration, before the loop
    // increment reads it.
    3202             :     uint8_t mi_step;
    3203    26891100 :     for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
    3204    13749600 :         above_mi_col += mi_step) {
    3205    13750500 :         ModeInfo /*MbModeInfo*/ **above_mi = prev_row_mi + above_mi_col;
    // Step by the neighbor width, capped at the width of BLOCK_64X64.
    3206    13750500 :         mi_step =
    3207    13750500 :             AOMMIN(mi_size_wide[above_mi[0]->mbmi.block_mi.sb_type], mi_size_wide[BLOCK_64X64]);
    3208             :         // If we're considering a block with width 4, it should be treated as
    3209             :         // half of a pair of blocks with chroma information in the second. Move
    3210             :         // above_mi_col back to the start of the pair if needed, set above_mbmi
    3211             :         // to point at the block with chroma information, and set mi_step to 2 to
    3212             :         // step over the entire pair at the end of the iteration.
    3213    13750500 :         if (mi_step == 1) {
    3214       34256 :             above_mi_col &= ~1;
    3215       34256 :             above_mi = prev_row_mi + above_mi_col + 1;
    3216       34256 :             mi_step = 2;
    3217             :         }
    3218    13750500 :         if (is_neighbor_overlappable( &(*above_mi)->mbmi)) {
    3219    13715000 :             ++nb_count;
    3220             : 
    // rel_mi_pos = column offset from the current block; nb_mi_size is
    // clipped to the current block width.
    3221    13715000 :             fun(
    3222             :                 is16bit,
    3223             :                 xd,
    3224             :                 above_mi_col - mi_col,
    3225    13715000 :                 AOMMIN(xd->n4_w, mi_step),
    3226    13715000 :                 &(*above_mi)->mbmi ,
    3227             :                 fun_ctxt,
    3228             :                 num_planes);
    3229             :         }
    3230             :     }
    3231             : }
    3232             : 
    // Walks the mode-info column directly left of the current block and calls
    // fun for every neighbor accepted by is_neighbor_overlappable().
    //   is16bit, fun_ctxt - forwarded unchanged to fun.
    //   cm      - only cm->mi_rows is read, to clip the walk at the frame edge.
    //   xd      - supplies left_available, mi, mi_stride and n4_h.
    //   mi_row  - row of the current block in mode-info units.
    //   nb_max  - the walk stops once this many neighbors have been visited.
    // Does nothing when xd->left_available is zero.
    3233    13868800 : static INLINE void foreach_overlappable_nb_left(
    3234             :     uint8_t is16bit ,
    3235             :     const AV1_COMMON *cm,
    3236             :     MacroBlockD *xd,
    3237             :     int mi_row,
    3238             :     int nb_max,
    3239             :     overlappable_nb_visitor_t fun,
    3240             :     void *fun_ctxt) {
    // num_planes is hard-coded to 2 and only forwarded to fun.
    3241    13868800 :     const int num_planes = 2;
    3242    13868800 :     if (!xd->left_available) return;
    3243             : 
    3244    13316500 :     int nb_count = 0;
    3245             : 
    3246             :     // prev_col_mi points into the mi array, starting at the top of the
    3247             :     // previous column
    3248             : 
    3249    13316500 :     ModeInfo  **prev_col_mi =  xd->mi - 1 - mi_row * xd->mi_stride;
    3250    13316500 :     const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows);
    // mi_step is assigned at the top of every iteration, before the loop
    // increment reads it.
    3251             :     uint8_t mi_step;
    3252    27783500 :     for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
    3253    14467100 :         left_mi_row += mi_step) {
    3254    14467900 :         ModeInfo **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
    // Step by the neighbor height, capped at the height of BLOCK_64X64.
    3255    14467900 :         mi_step =
    3256    14467900 :             AOMMIN(mi_size_high[left_mi[0]->mbmi.block_mi.sb_type], mi_size_high[BLOCK_64X64]);
    // A step of 1 is handled as a pair: align the row down to an even
    // index, use the second entry of the pair, and step over both rows.
    3257    14467900 :         if (mi_step == 1) {
    3258       34122 :             left_mi_row &= ~1;
    3259       34122 :             left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
    3260       34122 :             mi_step = 2;
    3261             :         }
    3262    14467900 :         if (is_neighbor_overlappable( &(*left_mi)->mbmi)) {
    3263    14429100 :             ++nb_count;
    3264             : 
    // rel_mi_pos = row offset from the current block; nb_mi_size is clipped
    // to the current block height.
    3265    14429100 :             fun(
    3266             :                 is16bit,
    3267             :                 xd,
    3268             :                 left_mi_row - mi_row,
    3269    14429100 :                 AOMMIN(xd->n4_h, mi_step),
    3270    14429100 :                 &(*left_mi)->mbmi ,
    3271             :                 fun_ctxt,
    3272             :                 num_planes);
    3273             :         }
    3274             :     }
    3275             : }
    3276             : // HW does not support < 4x4 prediction. To limit the bandwidth requirement, if
    3277             : // block-size of current plane is smaller than 8x8, always only blend with the
    3278             : // left neighbor(s) (skip blending with the above side).
    3279             : #define DISABLE_CHROMA_U8X8_OBMC 0  // 0: one-sided obmc; 1: disable
    3280             : 
    // Returns non-zero when the caller should skip the OBMC prediction for a
    // plane of the given block.
    //   bsize          - block size; asserted to satisfy
    //                    is_motion_variation_allowed_bsize().
    //   dir            - blending direction; with DISABLE_CHROMA_U8X8_OBMC == 0
    //                    only dir == 0 is skipped for small plane sizes.
    //   subsampling_x/y - passed to get_plane_block_size() to obtain the
    //                    plane block size.
    // The plane sizes treated as small are BLOCK_4X4, BLOCK_8X4 and
    // BLOCK_4X8; every other size returns 0.
    3281    29563700 : int av1_skip_u4x4_pred_in_obmc(BlockSize bsize,
    3282             :      int dir, int subsampling_x, int subsampling_y) {
    3283    29563700 :     assert(is_motion_variation_allowed_bsize(bsize));
    3284             : 
    3285             :     const BlockSize bsize_plane =
    3286    29562400 :         get_plane_block_size(bsize,subsampling_x,subsampling_y);
    3287    29559900 :     switch (bsize_plane) {
    3288             : #if DISABLE_CHROMA_U8X8_OBMC
    3289             :     case BLOCK_4X4:
    3290             :     case BLOCK_8X4:
    3291             :     case BLOCK_4X8: return 1; break;
    3292             : #else
    3293     1039730 :     case BLOCK_4X4:
    3294             :     case BLOCK_8X4:
    // The break after return is unreachable.
    3295     1039730 :     case BLOCK_4X8: return dir == 0; break;
    3296             : #endif
    3297    28520200 :     default: return 0;
    3298             :     }
    3299             : }
    3300             : 
    // Prepares ctxt and xd for predicting from one above neighbor.
    //   xd             - mb_to_left_edge and mb_to_right_edge are overwritten.
    //   rel_mi_col     - neighbor column minus ctxt->mi_col.
    //   above_mi_width - neighbor width used in the right-edge computation.
    //   above_mbmi     - neighbor mode info; only mv[0] and ref_frame[0] of
    //                    its block_mi are read.
    //   ctxt           - mv_unit and ref_pic_list0 are overwritten.
    //   num_planes     - unused.
    //   is16bit        - non-zero selects reference_picture16bit, otherwise
    //                    reference_picture.
    3301      396686 : void av1_setup_build_prediction_by_above_pred(
    3302             :     MacroBlockD *xd, int rel_mi_col, uint8_t above_mi_width,
    3303             :     MbModeInfo *above_mbmi, struct build_prediction_ctxt *ctxt,
    3304             :     const int num_planes,uint8_t is16bit)
    3305             : {
    3306             :     (void)num_planes;
    3307      396686 :     const int above_mi_col = ctxt->mi_col + rel_mi_col;
    3308             : 
    3309             :     //use above mbmi  to set up the reference object from where to read
    3310             : 
    // The neighbor is always treated as a list-0 uni-prediction using its
    // first motion vector, whatever the neighbor itself used.
    3311      396686 :     ctxt->mv_unit.mv[0].x = above_mbmi->block_mi.mv[0].as_mv.col;
    3312      396686 :     ctxt->mv_unit.mv[0].y = above_mbmi->block_mi.mv[0].as_mv.row;
    3313      396686 :     ctxt->mv_unit.pred_direction = UNI_PRED_LIST_0;
    3314             : 
    3315      396686 :     uint8_t ref_idx_l0 = get_ref_frame_idx(above_mbmi->block_mi.ref_frame[0]);
    3316      396682 :     uint8_t list_idx0  = get_list_idx(above_mbmi->block_mi.ref_frame[0]);
    3317             : 
    3318      396687 :     if (is16bit)
    3319           0 :         ctxt->ref_pic_list0 = ((EbReferenceObject*)ctxt->picture_control_set_ptr->ref_pic_ptr_array[list_idx0][ref_idx_l0]->object_ptr)->reference_picture16bit;
    3320             :     else
    3321      396687 :         ctxt->ref_pic_list0 = ((EbReferenceObject*)ctxt->picture_control_set_ptr->ref_pic_ptr_array[list_idx0][ref_idx_l0]->object_ptr)->reference_picture;
    3322             : 
    // Horizontal edge distances are recomputed for the neighbor's column span.
    // NOTE(review): the 8 * MI_SIZE scale presumably means 1/8-pel units -
    // confirm.
    3323      396687 :     xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
    3324      396687 :     xd->mb_to_right_edge = ctxt->mb_to_far_edge +
    3325      396687 :         (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
    3326      396687 : }
    // Prepares ctxt and xd for predicting from one left neighbor.
    //   xd             - mb_to_top_edge and mb_to_bottom_edge are overwritten.
    //   rel_mi_row     - neighbor row minus ctxt->mi_row.
    //   left_mi_height - neighbor height used in the bottom-edge computation.
    //   left_mbmi      - neighbor mode info; only mv[0] and ref_frame[0] of
    //                    its block_mi are read.
    //   ctxt           - mv_unit and ref_pic_list0 are overwritten.
    //   num_planes     - unused.
    //   is16bit        - non-zero selects reference_picture16bit, otherwise
    //                    reference_picture.
    3327      417268 : void av1_setup_build_prediction_by_left_pred(MacroBlockD *xd, int rel_mi_row,
    3328             :     uint8_t left_mi_height,
    3329             :     MbModeInfo *left_mbmi,
    3330             :     struct build_prediction_ctxt *ctxt,
    3331             :     const int num_planes,uint8_t is16bit)
    3332             : {
    3333             :     (void)num_planes;
    3334      417268 :     const int left_mi_row = ctxt->mi_row + rel_mi_row;
    3335             : 
    // The neighbor is always treated as a list-0 uni-prediction using its
    // first motion vector, whatever the neighbor itself used.
    3336      417268 :     ctxt->mv_unit.mv[0].x = left_mbmi->block_mi.mv[0].as_mv.col;
    3337      417268 :     ctxt->mv_unit.mv[0].y = left_mbmi->block_mi.mv[0].as_mv.row;
    3338      417268 :     ctxt->mv_unit.pred_direction = UNI_PRED_LIST_0;
    3339             : 
    3340             : 
    3341      417268 :     uint8_t ref_idx_l0 = get_ref_frame_idx(left_mbmi->block_mi.ref_frame[0]);
    3342      417267 :     uint8_t list_idx0 = get_list_idx(left_mbmi->block_mi.ref_frame[0]);
    3343             : 
    3344      417267 :     if (is16bit)
    3345           0 :         ctxt->ref_pic_list0 = ((EbReferenceObject*)ctxt->picture_control_set_ptr->ref_pic_ptr_array[list_idx0][ref_idx_l0]->object_ptr)->reference_picture16bit;
    3346             :     else
    3347      417267 :         ctxt->ref_pic_list0 = ((EbReferenceObject*)ctxt->picture_control_set_ptr->ref_pic_ptr_array[list_idx0][ref_idx_l0]->object_ptr)->reference_picture;
    3348             : 
    // Vertical edge distances are recomputed for the neighbor's row span.
    // NOTE(review): the 8 * MI_SIZE scale presumably means 1/8-pel units -
    // confirm.
    3349      417267 :     xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
    3350      417267 :     xd->mb_to_bottom_edge =
    3351      417267 :         ctxt->mb_to_far_edge +
    3352      417267 :         (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
    3353      417267 : }
    3354             : 
    // Single-reference (non-compound) luma prediction for 16-bit sample
    // buffers, written straight into prediction_ptr->buffer_y.
    //   interp_filters          - filter selection passed to
    //                             av1_get_convolve_filter_params().
    //   xd                      - used to clamp the motion vector.
    //   mv_unit                 - only mv[REF_LIST_0] is read.
    //   pu_origin_x/y           - block position inside the reference picture.
    //   bwidth / bheight        - size of the predicted block.
    //   ref_pic_list0           - reference picture (asserted non-NULL);
    //                             buffer_y is accessed as uint16_t.
    //   prediction_ptr          - destination picture; buffer_y is accessed as
    //                             uint16_t.
    //   dst_origin_x/y          - block position inside the destination.
    // The bit depth is hard-coded (EB_10BIT / 10). Always returns
    // EB_ErrorNone.
    3355           0 : EbErrorType get_single_prediction_for_obmc_luma_hbd(
    3356             :     uint32_t                              interp_filters,
    3357             :     MacroBlockD                          *xd,
    3358             :     MvUnit                               *mv_unit,
    3359             :     uint16_t                              pu_origin_x,
    3360             :     uint16_t                              pu_origin_y,
    3361             :     uint8_t                               bwidth,
    3362             :     uint8_t                               bheight,
    3363             :     EbPictureBufferDesc                  *ref_pic_list0,
    3364             :     EbPictureBufferDesc                  *prediction_ptr,
    3365             :     uint16_t                              dst_origin_x,
    3366             :     uint16_t                              dst_origin_y)
    3367             : {
    3368           0 :     EbErrorType  return_error = EB_ErrorNone;
    3369           0 :     uint8_t         is_compound = 0;
    // Stack scratch buffer handed to get_conv_params_no_round() together
    // with the value 128 (presumably its stride - confirm).
    3370             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstY[128 * 128]);//move this to context if stack does not hold.
    3371             : 
    3372             :     MV  mv, mv_q4;
    3373             :     int32_t subpel_x, subpel_y;
    3374             :     uint16_t * src_ptr;
    3375             :     uint16_t * dst_ptr;
    3376             :     int32_t src_stride;
    3377             :     int32_t dst_stride;
    3378             :     ConvolveParams conv_params;
    3379             :     InterpFilterParams filter_params_x, filter_params_y;
    3380             : 
    3381             :     {
    3382             :         //List0-Y
    3383           0 :         mv.col = mv_unit->mv[REF_LIST_0].x;
    3384           0 :         mv.row = mv_unit->mv[REF_LIST_0].y;
    3385           0 :         assert(ref_pic_list0 != NULL);
    // Start at the block position inside each picture (picture origin offset
    // plus block origin).
    3386           0 :         src_ptr = (uint16_t*)ref_pic_list0->buffer_y + ref_pic_list0->origin_x + pu_origin_x + (ref_pic_list0->origin_y + pu_origin_y) * ref_pic_list0->stride_y;
    3387           0 :         dst_ptr = (uint16_t*)prediction_ptr->buffer_y + prediction_ptr->origin_x + dst_origin_x + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y;
    3388           0 :         src_stride = ref_pic_list0->stride_y;
    3389           0 :         dst_stride = prediction_ptr->stride_y;
    3390             : 
    3391           0 :         mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bwidth, bheight, 0, 0);//mv_q4 has 1 extra fractional bit to accommodate chroma when accessing filter coeffs.
    // Split the clamped vector: the fractional part selects the filter
    // phase, the integer part displaces the source pointer.
    3392           0 :         subpel_x = mv_q4.col & SUBPEL_MASK;
    3393           0 :         subpel_y = mv_q4.row & SUBPEL_MASK;
    3394           0 :         src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    3395           0 :         conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstY, 128, is_compound, EB_10BIT);
    3396           0 :         av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    3397             :             &filter_params_y, bwidth, bheight);
    3398             : 
    // The convolve routine is picked by whether each subpel offset is
    // non-zero; is_compound is always 0 here.
    3399           0 :         convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    3400             :             src_ptr,
    3401             :             src_stride,
    3402             :             dst_ptr,
    3403             :             dst_stride,
    3404             :             bwidth,
    3405             :             bheight,
    3406             :             &filter_params_x,
    3407             :             &filter_params_y,
    3408             :             subpel_x,
    3409             :             subpel_y,
    3410             :             &conv_params,
    3411             :             10);
    3412             : 
    3413             :     }
    3414           0 :     return return_error;
    3415             : }
    // Single-reference (non-compound) chroma prediction for 16-bit sample
    // buffers: predicts Cb and then Cr into prediction_ptr using the same
    // list-0 motion vector.
    //   interp_filters          - filter selection passed to
    //                             av1_get_convolve_filter_params().
    //   xd                      - used to clamp the motion vector.
    //   mv_unit                 - only mv[REF_LIST_0] is read.
    //   pu_origin_x/y           - block position in the reference picture;
    //                             rounded down to a multiple of 8, then halved.
    //   bwidth / bheight        - size passed to the clamp, filter selection
    //                             and convolve calls as-is.
    //   ref_pic_list0           - reference picture (asserted non-NULL);
    //                             buffer_cb/buffer_cr are accessed as uint16_t.
    //   prediction_ptr          - destination picture, same access pattern.
    //   dst_origin_x/y          - block position in the destination; rounded
    //                             and halved like pu_origin_x/y.
    // The bit depth is hard-coded (EB_10BIT / 10). Always returns
    // EB_ErrorNone.
    // NOTE(review): the "/ 2" addressing presumably assumes 4:2:0 chroma and
    // bwidth/bheight presumably arrive already in chroma samples - confirm
    // against the caller.
    3416           0 : EbErrorType get_single_prediction_for_obmc_chroma_hbd(
    3417             :     uint32_t                              interp_filters,
    3418             :     MacroBlockD                          *xd,
    3419             :     MvUnit                               *mv_unit,
    3420             :     uint16_t                              pu_origin_x,
    3421             :     uint16_t                              pu_origin_y,
    3422             :     uint8_t                               bwidth,
    3423             :     uint8_t                               bheight,
    3424             :     EbPictureBufferDesc                  *ref_pic_list0,
    3425             :     EbPictureBufferDesc                  *prediction_ptr,
    3426             :     uint16_t                              dst_origin_x,
    3427             :     uint16_t                              dst_origin_y)
    3428             : {
    3429           0 :     EbErrorType  return_error = EB_ErrorNone;
    3430           0 :     uint8_t         is_compound = 0;
    3431             : 
    // Stack scratch buffers handed to get_conv_params_no_round() together
    // with the value 64 (presumably their stride - confirm).
    3432             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstCb[64 * 64]);
    3433             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstCr[64 * 64]);
    3434             : 
    3435             :     MV  mv, mv_q4;
    3436             :     int32_t subpel_x, subpel_y;
    3437             :     uint16_t * src_ptr;
    3438             :     uint16_t * dst_ptr;
    3439             :     int32_t src_stride;
    3440             :     int32_t dst_stride;
    3441             :     ConvolveParams conv_params;
    3442             :     InterpFilterParams filter_params_x, filter_params_y;
    3443             :     {
    3444             :         //List0 motion vector (shared by Cb and Cr)
    3445           0 :         mv.col = mv_unit->mv[REF_LIST_0].x;
    3446           0 :         mv.row = mv_unit->mv[REF_LIST_0].y;
    3447           0 :         assert(ref_pic_list0 != NULL);
    3448             : 
    3449             :        {
    3450             :             //List0-Cb
    3451           0 :             src_ptr = (uint16_t*)ref_pic_list0->buffer_cb + (ref_pic_list0->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list0->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list0->stride_cb;
    3452           0 :             dst_ptr = (uint16_t*)prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    3453           0 :             src_stride = ref_pic_list0->stride_cb;
    3454           0 :             dst_stride = prediction_ptr->stride_cb;
    3455             : 
    // The clamp is called with 1, 1 as its last two arguments (the luma
    // variant passes 0, 0).
    3456           0 :             mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bwidth, bheight, 1, 1);
    3457           0 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    3458           0 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    3459           0 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    3460           0 :             conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCb, 64, is_compound, EB_10BIT);
    3461             : 
    3462           0 :             av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    3463             :                 &filter_params_y, bwidth, bheight);
    3464             : 
    3465           0 :             convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    3466             :                 src_ptr,
    3467             :                 src_stride,
    3468             :                 dst_ptr,
    3469             :                 dst_stride,
    3470             :                 bwidth,
    3471             :                 bheight,
    3472             :                 &filter_params_x,
    3473             :                 &filter_params_y,
    3474             :                 subpel_x,
    3475             :                 subpel_y,
    3476             :                 &conv_params,
    3477             :                 10);
    3478             : 
    3479             :             //List0-Cr
    3480           0 :             src_ptr = (uint16_t*)ref_pic_list0->buffer_cr + (ref_pic_list0->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list0->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list0->stride_cr;
    3481           0 :             dst_ptr = (uint16_t*) prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    3482           0 :             src_stride = ref_pic_list0->stride_cr;
    3483           0 :             dst_stride = prediction_ptr->stride_cr;
    3484             : 
    // The clamp and the filter parameters are recomputed here with the same
    // inputs as for Cb.
    3485           0 :             mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bwidth, bheight, 1, 1);
    3486           0 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    3487           0 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    3488           0 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    3489           0 :             conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCr, 64, is_compound, EB_10BIT);
    3490           0 :             av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    3491             :                 &filter_params_y, bwidth, bheight);
    3492             : 
    3493           0 :             convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    3494             :                 src_ptr,
    3495             :                 src_stride,
    3496             :                 dst_ptr,
    3497             :                 dst_stride,
    3498             :                 bwidth,
    3499             :                 bheight,
    3500             :                 &filter_params_x,
    3501             :                 &filter_params_y,
    3502             :                 subpel_x,
    3503             :                 subpel_y,
    3504             :                 &conv_params,
    3505             :                 10);
    3506             :         }
    3507             :     }
    3508             : 
    3509           0 :     return return_error;
    3510             : }
    3511             : 
    // Single-reference (non-compound) luma prediction for 8-bit sample
    // buffers, written straight into prediction_ptr->buffer_y.
    //   interp_filters          - filter selection passed to
    //                             av1_get_convolve_filter_params().
    //   xd                      - used to clamp the motion vector.
    //   mv_unit                 - only mv[REF_LIST_0] is read.
    //   pu_origin_x/y           - block position inside the reference picture.
    //   bwidth / bheight        - size of the predicted block.
    //   ref_pic_list0           - reference picture (asserted non-NULL).
    //   prediction_ptr          - destination picture.
    //   dst_origin_x/y          - block position inside the destination.
    // Always returns EB_ErrorNone.
    3512      813937 : EbErrorType get_single_prediction_for_obmc_luma(
    3513             :     uint32_t                              interp_filters,
    3514             :     MacroBlockD                          *xd,
    3515             :     MvUnit                               *mv_unit,
    3516             :     uint16_t                              pu_origin_x,
    3517             :     uint16_t                              pu_origin_y,
    3518             :     uint8_t                               bwidth,
    3519             :     uint8_t                               bheight,
    3520             :     EbPictureBufferDesc                  *ref_pic_list0,
    3521             :     EbPictureBufferDesc                  *prediction_ptr,
    3522             :     uint16_t                              dst_origin_x,
    3523             :     uint16_t                              dst_origin_y)
    3524             : {
    3525      813937 :     EbErrorType  return_error = EB_ErrorNone;
    3526      813937 :     uint8_t         is_compound = 0;
    // Stack scratch buffer handed to get_conv_params_no_round() together
    // with the value 128 (presumably its stride - confirm).
    3527             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstY[128 * 128]);//move this to context if stack does not hold.
    3528             : 
    3529             :     MV  mv, mv_q4;
    3530             :     int32_t subpel_x, subpel_y;
    3531             :     uint8_t * src_ptr;
    3532             :     uint8_t * dst_ptr;
    3533             :     int32_t src_stride;
    3534             :     int32_t dst_stride;
    3535             :     ConvolveParams conv_params;
    3536             :     InterpFilterParams filter_params_x, filter_params_y;
    3537             : 
    3538             :     {
    3539             :         //List0-Y
    3540      813937 :         mv.col = mv_unit->mv[REF_LIST_0].x;
    3541      813937 :         mv.row = mv_unit->mv[REF_LIST_0].y;
    3542      813937 :         assert(ref_pic_list0 != NULL);
    // Start at the block position inside each picture (picture origin offset
    // plus block origin).
    3543      813937 :         src_ptr = ref_pic_list0->buffer_y + ref_pic_list0->origin_x + pu_origin_x + (ref_pic_list0->origin_y + pu_origin_y) * ref_pic_list0->stride_y;
    3544      813937 :         dst_ptr = prediction_ptr->buffer_y + prediction_ptr->origin_x + dst_origin_x + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y;
    3545      813937 :         src_stride = ref_pic_list0->stride_y;
    3546      813937 :         dst_stride = prediction_ptr->stride_y;
    3547             : 
    3548      813937 :         mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bwidth, bheight, 0, 0);//mv_q4 has 1 extra fractional bit to accommodate chroma when accessing filter coeffs.
    // Split the clamped vector: the fractional part selects the filter
    // phase, the integer part displaces the source pointer.
    3549      813955 :         subpel_x = mv_q4.col & SUBPEL_MASK;
    3550      813955 :         subpel_y = mv_q4.row & SUBPEL_MASK;
    3551      813955 :         src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    3552      813955 :         conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstY, 128, is_compound, EB_8BIT);
    3553      813952 :         av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    3554             :             &filter_params_y, bwidth, bheight);
    3555             : 
    // The convolve routine is picked by whether each subpel offset is
    // non-zero; is_compound is always 0 here.
    3556      813947 :         convolve[subpel_x != 0][subpel_y != 0][is_compound](
    3557             :             src_ptr,
    3558             :             src_stride,
    3559             :             dst_ptr,
    3560             :             dst_stride,
    3561             :             bwidth,
    3562             :             bheight,
    3563             :             &filter_params_x,
    3564             :             &filter_params_y,
    3565             :             subpel_x,
    3566             :             subpel_y,
    3567             :             &conv_params);
    3568             : 
    3569             :     }
    3570      813954 :     return return_error;
    3571             : }
    3572             : // Builds the 8-bit Cb and Cr predictions from the list-0 reference picture with a single (non-compound) MV and writes them into prediction_ptr; always returns EB_ErrorNone.
    3573      614604 : EbErrorType get_single_prediction_for_obmc_chroma(
    3574             :     uint32_t                              interp_filters,
    3575             :     MacroBlockD                          *xd,
    3576             :     MvUnit                               *mv_unit,
    3577             :     uint16_t                              pu_origin_x,
    3578             :     uint16_t                              pu_origin_y,
    3579             :     uint8_t                               bwidth,
    3580             :     uint8_t                               bheight,
    3581             :     EbPictureBufferDesc                  *ref_pic_list0,
    3582             :     EbPictureBufferDesc                  *prediction_ptr,
    3583             :     uint16_t                              dst_origin_x,
    3584             :     uint16_t                              dst_origin_y)
    3585             : {
    3586      614604 :     EbErrorType  return_error = EB_ErrorNone;
    3587      614604 :     uint8_t         is_compound = 0;
    3588             :     // is_compound stays 0, so only the single-reference entries of convolve[][][] are used below.
    3589             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstCb[64 * 64]);
    3590             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstCr[64 * 64]);
    3591             :     // tmp_dstCb / tmp_dstCr are only handed to get_conv_params_no_round together with the value 64 (presumably scratch buffer and its stride - confirm).
    3592             :     MV  mv, mv_q4;
    3593             :     int32_t subpel_x, subpel_y;
    3594             :     uint8_t * src_ptr;
    3595             :     uint8_t * dst_ptr;
    3596             :     int32_t src_stride;
    3597             :     int32_t dst_stride;
    3598             :     ConvolveParams conv_params;
    3599             :     InterpFilterParams filter_params_x, filter_params_y;
    3600             : 
    3601             : 
    3602             :     {
    3603             :         //List0 motion vector (shared by the Cb and Cr passes below)
    3604             : 
    3605      614604 :         mv.col = mv_unit->mv[REF_LIST_0].x;
    3606      614604 :         mv.row = mv_unit->mv[REF_LIST_0].y;
    3607      614604 :         assert(ref_pic_list0 != NULL);
    3608             :         // Origins are rounded down to a multiple of 8 and, after adding the picture origin, halved to address the chroma planes (presumably 4:2:0 - confirm).
    3609             :        {
    3610             :             //List0-Cb
    3611      614604 :             src_ptr = ref_pic_list0->buffer_cb + (ref_pic_list0->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list0->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list0->stride_cb;
    3612      614604 :             dst_ptr = prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    3613      614604 :             src_stride = ref_pic_list0->stride_cb;
    3614      614604 :             dst_stride = prediction_ptr->stride_cb;
    3615             :             // Clamp the MV, then split it: low SUBPEL_BITS bits select the filter phase, the remaining bits move src_ptr by whole samples.
    3616      614604 :             mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bwidth, bheight, 1, 1);
    3617      614612 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    3618      614612 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    3619      614612 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    3620      614612 :             conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCb, 64, is_compound, EB_8BIT);
    3621             : 
    3622      614611 :             av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    3623             :                 &filter_params_y, bwidth, bheight);
    3624             :             // The kernel is picked by: horizontal phase non-zero, vertical phase non-zero, compound flag.
    3625      614602 :             convolve[subpel_x != 0][subpel_y != 0][is_compound](
    3626             :                 src_ptr,
    3627             :                 src_stride,
    3628             :                 dst_ptr,
    3629             :                 dst_stride,
    3630             :                 bwidth,
    3631             :                 bheight,
    3632             :                 &filter_params_x,
    3633             :                 &filter_params_y,
    3634             :                 subpel_x,
    3635             :                 subpel_y,
    3636             :                 &conv_params);
    3637             :             // The Cr pass repeats the Cb steps; mv_q4, the phases and the filter params are recomputed from the same inputs.
    3638             :             //List0-Cr
    3639      614603 :             src_ptr = ref_pic_list0->buffer_cr + (ref_pic_list0->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list0->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list0->stride_cr;
    3640      614603 :             dst_ptr = prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    3641      614603 :             src_stride = ref_pic_list0->stride_cr;
    3642      614603 :             dst_stride = prediction_ptr->stride_cr;
    3643             : 
    3644      614603 :             mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bwidth, bheight, 1, 1);
    3645      614607 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    3646      614607 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    3647      614607 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    3648      614607 :             conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCr, 64, is_compound, EB_8BIT);
    3649      614610 :             av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    3650             :                 &filter_params_y, bwidth, bheight);
    3651             : 
    3652      614604 :             convolve[subpel_x != 0][subpel_y != 0][is_compound](
    3653             :                 src_ptr,
    3654             :                 src_stride,
    3655             :                 dst_ptr,
    3656             :                 dst_stride,
    3657             :                 bwidth,
    3658             :                 bheight,
    3659             :                 &filter_params_x,
    3660             :                 &filter_params_y,
    3661             :                 subpel_x,
    3662             :                 subpel_y,
    3663             :                 &conv_params);
    3664             : 
    3665             :         }
    3666             :     }
    3667             :     // return_error is never reassigned, so the result is always EB_ErrorNone.
    3668      614604 :     return return_error;
    3669             : }
    3670      396680 : static INLINE void build_prediction_by_above_pred( // Per-neighbor callback handed to foreach_overlappable_nb_above: predicts the overlap area of one above neighbor into ctxt->tmp_buf.
    3671             :     uint8_t is16bit,
    3672             :     MacroBlockD *xd,
    3673             :     int rel_mi_col,
    3674             :     uint8_t above_mi_width,
    3675             :     MbModeInfo *above_mbmi,
    3676             :     void *fun_ctxt,
    3677             :     const int num_planes)
    3678             : {
    3679      396680 :     struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
    3680      396680 :     const int above_mi_col = ctxt->mi_col + rel_mi_col;
    3681             :     int mi_x, mi_y;
    3682      396680 :     MbModeInfo backup_mbmi = *above_mbmi;
    3683             :     // NOTE(review): the *_hbd callers in this file pass a struct build_prediction_hbd_ctxt as fun_ctxt; confirm it shares its layout with struct build_prediction_ctxt.
    3684      396680 :     av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
    3685             :         &backup_mbmi, ctxt, num_planes,is16bit);
    3686             :     // Point the prediction descriptor at the caller-supplied temporary planes, with a zero origin.
    3687      396688 :     ctxt->prediction_ptr.origin_x  = ctxt->prediction_ptr.origin_y = 0;
    3688      396688 :     ctxt->prediction_ptr.buffer_y  = ctxt->tmp_buf[0];
    3689      396688 :     ctxt->prediction_ptr.buffer_cb = ctxt->tmp_buf[1];
    3690      396688 :     ctxt->prediction_ptr.buffer_cr = ctxt->tmp_buf[2];
    3691      396688 :     ctxt->prediction_ptr.stride_y  = ctxt->tmp_stride[0];
    3692      396688 :     ctxt->prediction_ptr.stride_cb = ctxt->tmp_stride[1];
    3693      396688 :     ctxt->prediction_ptr.stride_cr = ctxt->tmp_stride[2];
    3694             :     // Destination offset inside the temporary planes: the neighbor's column offset relative to the current block, row 0.
    3695      396688 :     ctxt->dst_origin_x = rel_mi_col << MI_SIZE_LOG2;
    3696      396688 :     ctxt->dst_origin_y = 0;
    3697             :     // Source position: neighbor column and current row, converted from mi units by << MI_SIZE_LOG2.
    3698      396688 :     mi_x = above_mi_col << MI_SIZE_LOG2;
    3699      396688 :     mi_y = ctxt->mi_row << MI_SIZE_LOG2;
    3700             : 
    3701      396688 :     const BlockSize bsize = xd->sb_type;
    3702             : 
    3703     1190060 :     for (int j = 0; j < num_planes; ++j) {
    3704             :         // Every plane with j > 0 is treated as subsampled by 2 in both directions.
    3705      793368 :         int subsampling_x =  j > 0 ? 1 : 0;
    3706      793368 :         int subsampling_y =  j > 0 ? 1 : 0;
    3707             :         // bw spans the neighbor's width; bh is half the current block height, clamped between 4 and half the BLOCK_64X64 height (both bounds of the shift include the subsampling).
    3708      793368 :         int bw = (above_mi_width * MI_SIZE) >> subsampling_x;
    3709      793368 :         int bh = clamp(block_size_high[bsize] >> (subsampling_y + 1), 4,
    3710      793368 :             block_size_high[BLOCK_64X64] >> (subsampling_y + 1));
    3711             : 
    3712             : 
    3713      793358 :         if (av1_skip_u4x4_pred_in_obmc(bsize, 0, subsampling_x, subsampling_y)) continue;
    3714             :         // Dispatch: j == 0 -> luma, otherwise chroma; is16bit picks the _hbd variant. Each `else` pairs with the nearest unmatched `if`.
    3715      594017 :         if(j==0)
    3716      396677 :             if (is16bit)
    3717           0 :                 get_single_prediction_for_obmc_luma_hbd(
    3718             :                     above_mbmi->block_mi.interp_filters,
    3719             :                     xd,
    3720             :                     &ctxt->mv_unit,
    3721             :                     mi_x,
    3722             :                     mi_y,
    3723             :                     bw,
    3724             :                     bh,
    3725             :                     ctxt->ref_pic_list0,
    3726             :                     &ctxt->prediction_ptr,
    3727           0 :                     ctxt->dst_origin_x,
    3728           0 :                     ctxt->dst_origin_y);
    3729             :             else
    3730      396677 :                 get_single_prediction_for_obmc_luma(
    3731             :                     above_mbmi->block_mi.interp_filters,
    3732             :                     xd,
    3733             :                     &ctxt->mv_unit,
    3734             :                     mi_x,
    3735             :                     mi_y,
    3736             :                     bw,
    3737             :                     bh,
    3738             :                     ctxt->ref_pic_list0,
    3739             :                     &ctxt->prediction_ptr,
    3740      396677 :                     ctxt->dst_origin_x,
    3741      396677 :                     ctxt->dst_origin_y);
    3742             :         else
    3743      197340 :             if (is16bit)
    3744           0 :                 get_single_prediction_for_obmc_chroma_hbd(
    3745             :                     above_mbmi->block_mi.interp_filters,
    3746             :                     xd,
    3747             :                     &ctxt->mv_unit,
    3748             :                     mi_x,
    3749             :                     mi_y,
    3750             :                     bw,
    3751             :                     bh,
    3752             :                     ctxt->ref_pic_list0,
    3753             :                     &ctxt->prediction_ptr,
    3754           0 :                     ctxt->dst_origin_x,
    3755           0 :                     ctxt->dst_origin_y);
    3756             :             else
    3757      197340 :                 get_single_prediction_for_obmc_chroma(
    3758             :                     above_mbmi->block_mi.interp_filters,
    3759             :                     xd,
    3760             :                     &ctxt->mv_unit,
    3761             :                     mi_x,
    3762             :                     mi_y,
    3763             :                     bw,
    3764             :                     bh,
    3765             :                     ctxt->ref_pic_list0,
    3766             :                     &ctxt->prediction_ptr,
    3767      197340 :                     ctxt->dst_origin_x,
    3768      197340 :                     ctxt->dst_origin_y);
    3769             :         // Note: get_single_prediction_for_obmc_chroma fills both Cb and Cr in one call, and it is invoked for every j > 0.
    3770             :     }
    3771      396691 : }
    3772      417265 : static INLINE void build_prediction_by_left_pred( // Per-neighbor callback handed to foreach_overlappable_nb_left: predicts the overlap area of one left neighbor into ctxt->tmp_buf.
    3773             :     uint8_t is16bit,
    3774             :     MacroBlockD *xd,
    3775             :     int rel_mi_row,
    3776             :     uint8_t left_mi_height,
    3777             :     MbModeInfo *left_mbmi,
    3778             :     void *fun_ctxt,
    3779             :     const int num_planes)
    3780             : {
    3781      417265 :     struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
    3782      417265 :     const int left_mi_row = ctxt->mi_row + rel_mi_row;
    3783             :     int mi_x, mi_y;
    3784      417265 :     MbModeInfo backup_mbmi = *left_mbmi;
    3785             :     // NOTE(review): the *_hbd callers in this file pass a struct build_prediction_hbd_ctxt as fun_ctxt; confirm it shares its layout with struct build_prediction_ctxt.
    3786      417265 :     av1_setup_build_prediction_by_left_pred(xd, rel_mi_row,
    3787             :         left_mi_height,
    3788             :         &backup_mbmi, ctxt, num_planes,is16bit);
    3789             :     // Source position: current column and neighbor row, converted from mi units by << MI_SIZE_LOG2.
    3790      417267 :     mi_x = ctxt->mi_col << MI_SIZE_LOG2;
    3791      417267 :     mi_y = left_mi_row << MI_SIZE_LOG2;
    3792             :     // Point the prediction descriptor at the caller-supplied temporary planes, with a zero origin.
    3793      417267 :     ctxt->prediction_ptr.origin_x = ctxt->prediction_ptr.origin_y = 0;
    3794      417267 :     ctxt->prediction_ptr.buffer_y = ctxt->tmp_buf[0];
    3795      417267 :     ctxt->prediction_ptr.buffer_cb = ctxt->tmp_buf[1];
    3796      417267 :     ctxt->prediction_ptr.buffer_cr = ctxt->tmp_buf[2];
    3797      417267 :     ctxt->prediction_ptr.stride_y = ctxt->tmp_stride[0];
    3798      417267 :     ctxt->prediction_ptr.stride_cb = ctxt->tmp_stride[1];
    3799      417267 :     ctxt->prediction_ptr.stride_cr = ctxt->tmp_stride[2];
    3800             :     // Destination offset inside the temporary planes: column 0, the neighbor's row offset relative to the current block.
    3801      417267 :     ctxt->dst_origin_x = 0;
    3802      417267 :     ctxt->dst_origin_y = rel_mi_row << MI_SIZE_LOG2;
    3803             : 
    3804      417267 :     const BlockSize bsize = xd->sb_type;
    3805             : 
    3806     1251790 :     for (int j = 0; j < num_planes; ++j)
    3807             :     {
    3808      834525 :         int subsampling_x = j > 0 ? 1 : 0;
    3809      834525 :         int subsampling_y = j > 0 ? 1 : 0;
    3810             :         // bw is half the current block width, clamped between 4 and half the BLOCK_64X64 width (shift includes the subsampling); bh spans the neighbor's height.
    3811      834525 :         int bw = clamp(block_size_wide[bsize] >> (subsampling_x + 1), 4,  block_size_wide[BLOCK_64X64] >> (subsampling_x + 1));
    3812      834526 :         int bh = (left_mi_height << MI_SIZE_LOG2) >> subsampling_y;
    3813             : 
    3814      834526 :         if (av1_skip_u4x4_pred_in_obmc(bsize, 1,subsampling_x, subsampling_y)) continue;
    3815             :         // Dispatch: j == 0 -> luma, otherwise chroma; is16bit picks the _hbd variant. Each `else` pairs with the nearest unmatched `if`.
    3816      834532 :         if (j == 0)
    3817      417267 :             if (is16bit)
    3818           0 :                 get_single_prediction_for_obmc_luma_hbd(
    3819             :                     left_mbmi->block_mi.interp_filters,
    3820             :                     xd,
    3821             :                     &ctxt->mv_unit,
    3822             :                     mi_x,
    3823             :                     mi_y,
    3824             :                     bw,
    3825             :                     bh,
    3826             :                     ctxt->ref_pic_list0,
    3827             :                     &ctxt->prediction_ptr,
    3828           0 :                     ctxt->dst_origin_x,
    3829           0 :                     ctxt->dst_origin_y);
    3830             :             else
    3831      417267 :                 get_single_prediction_for_obmc_luma(
    3832             :                     left_mbmi->block_mi.interp_filters,
    3833             :                     xd,
    3834             :                     &ctxt->mv_unit,
    3835             :                     mi_x,
    3836             :                     mi_y,
    3837             :                     bw,
    3838             :                     bh,
    3839             :                     ctxt->ref_pic_list0,
    3840             :                     &ctxt->prediction_ptr,
    3841      417267 :                     ctxt->dst_origin_x,
    3842      417267 :                     ctxt->dst_origin_y);
    3843             :         else
    3844      417265 :             if (is16bit)
    3845           0 :                 get_single_prediction_for_obmc_chroma_hbd(
    3846             :                     left_mbmi->block_mi.interp_filters,
    3847             :                     xd,
    3848             :                     &ctxt->mv_unit,
    3849             :                     mi_x,
    3850             :                     mi_y,
    3851             :                     bw,
    3852             :                     bh,
    3853             :                     ctxt->ref_pic_list0,
    3854             :                     &ctxt->prediction_ptr,
    3855           0 :                     ctxt->dst_origin_x,
    3856           0 :                     ctxt->dst_origin_y);
    3857             :             else
    3858      417265 :                 get_single_prediction_for_obmc_chroma(
    3859             :                     left_mbmi->block_mi.interp_filters,
    3860             :                     xd,
    3861             :                     &ctxt->mv_unit,
    3862             :                     mi_x,
    3863             :                     mi_y,
    3864             :                     bw,
    3865             :                     bh,
    3866             :                     ctxt->ref_pic_list0,
    3867             :                     &ctxt->prediction_ptr,
    3868      417265 :                     ctxt->dst_origin_x,
    3869      417265 :                     ctxt->dst_origin_y);
    3870             :     }
    3871      417266 : }
    3872             : // 16-bit variant: runs build_prediction_by_above_pred over the overlappable above neighbors, writing into tmp_buf; returns without doing anything when xd->up_available is 0.
    3873           0 : static void build_prediction_by_above_preds_hbd(
    3874             :     EbBool                  perform_chroma,
    3875             :     BlockSize              bsize,
    3876             :     PictureControlSet      *picture_control_set_ptr,
    3877             :     MacroBlockD            *xd,
    3878             :     int                     mi_row,
    3879             :     int                     mi_col,
    3880             :     uint16_t                *tmp_buf[MAX_MB_PLANE],
    3881             :     int                     tmp_stride[MAX_MB_PLANE] )
    3882             : {
    3883           0 :     if (!xd->up_available) return;
    3884             : 
    3885           0 :     uint8_t is16bit = 1;
    3886             :     // Adjust mb_to_bottom_edge to have the correct value for the OBMC
    3887             :     // prediction block. This is half the height of the original block,
    3888             :     // except for 128-wide blocks, where we only use a height of 32.
    3889           0 :     int this_height = xd->n4_h * MI_SIZE;
    3890           0 :     int pred_height = AOMMIN(this_height / 2, 32);
    3891           0 :     xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
    3892             :     // Context read by the per-neighbor callback; mb_to_far_edge saves the right edge so it can be restored after the loop.
    3893             :     struct build_prediction_hbd_ctxt ctxt ;
    3894             : 
    3895           0 :     ctxt.cm = picture_control_set_ptr->parent_pcs_ptr->av1_cm;
    3896           0 :     ctxt.mi_row=  mi_row;
    3897           0 :     ctxt.mi_col=  mi_col;
    3898           0 :     ctxt.tmp_buf=  tmp_buf;
    3899           0 :     ctxt.tmp_width=  0;
    3900           0 :     ctxt.tmp_height=  0;
    3901           0 :     ctxt.tmp_stride=  tmp_stride;
    3902           0 :     ctxt.mb_to_far_edge=  xd->mb_to_right_edge;
    3903             : 
    3904           0 :     ctxt.picture_control_set_ptr = picture_control_set_ptr;
    3905           0 :     ctxt.perform_chroma          = perform_chroma;
    3906           0 :     xd->sb_type = bsize;
    3907             :     // NOTE(review): build_prediction_by_above_pred casts this context to struct build_prediction_ctxt; confirm both context structs share a layout.
    3908           0 :     foreach_overlappable_nb_above(is16bit,picture_control_set_ptr->parent_pcs_ptr->av1_cm, xd, mi_col,
    3909           0 :         max_neighbor_obmc[mi_size_wide_log2[bsize]],
    3910             :         build_prediction_by_above_pred, &ctxt);
    3911             :     // Restore the edge fields: left is recomputed from mi_col, right comes from the saved value, bottom undoes the adjustment made above.
    3912           0 :     xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
    3913           0 :     xd->mb_to_right_edge = ctxt.mb_to_far_edge;
    3914           0 :     xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
    3915             : }
    3916             : // 8-bit variant: runs build_prediction_by_above_pred over the overlappable above neighbors, writing into tmp_buf; returns without doing anything when xd->up_available is 0.
    3917      401382 : static void build_prediction_by_above_preds(
    3918             :     EbBool                  perform_chroma,
    3919             :     BlockSize              bsize,
    3920             :     PictureControlSet      *picture_control_set_ptr,
    3921             :     MacroBlockD            *xd,
    3922             :     int                     mi_row,
    3923             :     int                     mi_col,
    3924             :     uint8_t                *tmp_buf[MAX_MB_PLANE],
    3925             :     int                     tmp_stride[MAX_MB_PLANE] )
    3926             : {
    3927      401382 :     if (!xd->up_available) return;
    3928             : 
    3929      379757 :     uint8_t is16bit = 0;
    3930             :     // Adjust mb_to_bottom_edge to have the correct value for the OBMC
    3931             :     // prediction block. This is half the height of the original block,
    3932             :     // except for 128-wide blocks, where we only use a height of 32.
    3933      379757 :     int this_height = xd->n4_h * MI_SIZE;
    3934      379757 :     int pred_height = AOMMIN(this_height / 2, 32);
    3935      379757 :     xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
    3936             :     // Context read by the per-neighbor callback; mb_to_far_edge saves the right edge so it can be restored after the loop.
    3937             :     struct build_prediction_ctxt ctxt ;
    3938             : 
    3939      379757 :     ctxt.cm = picture_control_set_ptr->parent_pcs_ptr->av1_cm;
    3940      379757 :     ctxt.mi_row=  mi_row;
    3941      379757 :     ctxt.mi_col=  mi_col;
    3942      379757 :     ctxt.tmp_buf=  tmp_buf;
    3943      379757 :     ctxt.tmp_width=  0;
    3944      379757 :     ctxt.tmp_height=  0;
    3945      379757 :     ctxt.tmp_stride=  tmp_stride;
    3946      379757 :     ctxt.mb_to_far_edge=  xd->mb_to_right_edge;
    3947             : 
    3948      379757 :     ctxt.picture_control_set_ptr = picture_control_set_ptr;
    3949      379757 :     ctxt.perform_chroma = perform_chroma;
    3950      379757 :     xd->sb_type = bsize;
    3951             :     // The neighbor count limit is looked up from max_neighbor_obmc by the log2 block width.
    3952      379757 :     foreach_overlappable_nb_above(
    3953             :         is16bit,
    3954      379757 :         picture_control_set_ptr->parent_pcs_ptr->av1_cm,
    3955             :         xd,
    3956             :         mi_col,
    3957      379757 :         max_neighbor_obmc[mi_size_wide_log2[bsize]],
    3958             :         build_prediction_by_above_pred,
    3959             :         &ctxt);
    3960             :     // Restore the edge fields: left is recomputed from mi_col, right comes from the saved value, bottom undoes the adjustment made above.
    3961      379776 :     xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
    3962      379776 :     xd->mb_to_right_edge = ctxt.mb_to_far_edge;
    3963      379776 :     xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
    3964             : }
    3965             : // 16-bit variant: runs build_prediction_by_left_pred over the overlappable left neighbors, writing into tmp_buf; returns without doing anything when xd->left_available is 0.
    3966           0 : static void build_prediction_by_left_preds_hbd(
    3967             :     EbBool                                perform_chroma,
    3968             :     BlockSize                            bsize,
    3969             :     PictureControlSet                    *picture_control_set_ptr,
    3970             :     MacroBlockD                          *xd,
    3971             :     int                                   mi_row,
    3972             :     int                                   mi_col,
    3973             :     uint16_t                             *tmp_buf[MAX_MB_PLANE],
    3974             :     int                                   tmp_stride[MAX_MB_PLANE])
    3975             : {
    3976           0 :     if (!xd->left_available) return;
    3977             : 
    3978           0 :      uint8_t is16bit = 1;
    3979             :     // Adjust mb_to_right_edge to have the correct value for the OBMC
    3980             :     // prediction block. This is half the width of the original block,
    3981             :     // except for 128-wide blocks, where we only use a width of 32.
    3982           0 :     int this_width = xd->n4_w * MI_SIZE;
    3983           0 :     int pred_width = AOMMIN(this_width / 2, 32);
    3984           0 :     xd->mb_to_right_edge += (this_width - pred_width) * 8;
    3985             :     // Context read by the per-neighbor callback; mb_to_far_edge saves the bottom edge so it can be restored after the loop.
    3986             :     struct build_prediction_hbd_ctxt ctxt ;
    3987             : 
    3988           0 :     ctxt.cm = picture_control_set_ptr->parent_pcs_ptr->av1_cm;
    3989           0 :     ctxt.mi_row=  mi_row;
    3990           0 :     ctxt.mi_col=  mi_col;
    3991           0 :     ctxt.tmp_buf=  tmp_buf;
    3992           0 :     ctxt.tmp_width=  0;
    3993           0 :     ctxt.tmp_height=  0;
    3994           0 :     ctxt.tmp_stride=  tmp_stride;
    3995           0 :     ctxt.mb_to_far_edge=  xd->mb_to_bottom_edge;
    3996             : 
    3997           0 :     ctxt.picture_control_set_ptr = picture_control_set_ptr;
    3998           0 :     ctxt.perform_chroma = perform_chroma;
    3999             : 
    4000           0 :     xd->sb_type = bsize;
    4001             :     // NOTE(review): build_prediction_by_left_pred casts this context to struct build_prediction_ctxt; confirm both context structs share a layout.
    4002           0 :     foreach_overlappable_nb_left(
    4003             :         is16bit,
    4004           0 :         picture_control_set_ptr->parent_pcs_ptr->av1_cm,
    4005             :         xd,
    4006             :         mi_row,
    4007           0 :         max_neighbor_obmc[mi_size_high_log2[bsize]],
    4008             :         build_prediction_by_left_pred,
    4009             :         &ctxt);
    4010             :     // Restore the edge fields: top is recomputed from mi_row, right undoes the adjustment made above, bottom comes from the saved value.
    4011           0 :     xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
    4012           0 :     xd->mb_to_right_edge -= (this_width - pred_width) * 8;
    4013           0 :     xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
    4014             : }
    4015             : // 8-bit variant: runs build_prediction_by_left_pred over the overlappable left neighbors, writing into tmp_buf; returns without doing anything when xd->left_available is 0.
    4016      401400 : static void build_prediction_by_left_preds(
    4017             :     EbBool                                perform_chroma,
    4018             :     BlockSize                            bsize,
    4019             :     PictureControlSet                    *picture_control_set_ptr,
    4020             :     MacroBlockD                          *xd,
    4021             :     int                                   mi_row,
    4022             :     int                                   mi_col,
    4023             :     uint8_t                             *tmp_buf[MAX_MB_PLANE],
    4024             :     int                                   tmp_stride[MAX_MB_PLANE])
    4025             : {
    4026      401400 :     if (!xd->left_available) return;
    4027             : 
    4028      384532 :      uint8_t is16bit =0;
    4029             :     // Adjust mb_to_right_edge to have the correct value for the OBMC
    4030             :     // prediction block. This is half the width of the original block,
    4031             :     // except for 128-wide blocks, where we only use a width of 32.
    4032      384532 :     int this_width = xd->n4_w * MI_SIZE;
    4033      384532 :     int pred_width = AOMMIN(this_width / 2, 32);
    4034      384532 :     xd->mb_to_right_edge += (this_width - pred_width) * 8;
    4035             :     // Context read by the per-neighbor callback; mb_to_far_edge saves the bottom edge so it can be restored after the loop.
    4036             :     struct build_prediction_ctxt ctxt ;
    4037             : 
    4038      384532 :     ctxt.cm = picture_control_set_ptr->parent_pcs_ptr->av1_cm;
    4039      384532 :     ctxt.mi_row=  mi_row;
    4040      384532 :     ctxt.mi_col=  mi_col;
    4041      384532 :     ctxt.tmp_buf=  tmp_buf;
    4042      384532 :     ctxt.tmp_width=  0;
    4043      384532 :     ctxt.tmp_height=  0;
    4044      384532 :     ctxt.tmp_stride=  tmp_stride;
    4045      384532 :     ctxt.mb_to_far_edge=  xd->mb_to_bottom_edge;
    4046             : 
    4047      384532 :     ctxt.picture_control_set_ptr = picture_control_set_ptr;
    4048      384532 :     ctxt.perform_chroma = perform_chroma;
    4049             : 
    4050      384532 :     xd->sb_type = bsize;
    4051             :     // The neighbor count limit is looked up from max_neighbor_obmc by the log2 block height.
    4052      384532 :     foreach_overlappable_nb_left(
    4053             :         is16bit,
    4054      384532 :         picture_control_set_ptr->parent_pcs_ptr->av1_cm,
    4055             :         xd,
    4056             :         mi_row,
    4057      384532 :         max_neighbor_obmc[mi_size_high_log2[bsize]],
    4058             :         build_prediction_by_left_pred, &ctxt);
    4059             :     // Restore the edge fields: top is recomputed from mi_row, right undoes the adjustment made above, bottom comes from the saved value.
    4060      384533 :     xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
    4061      384533 :     xd->mb_to_right_edge -= (this_width - pred_width) * 8;
    4062      384533 :     xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
    4063             : }
    4064             : 
    4065             : // Context for the OBMC blending step. It is not used within this part of the file, so the field roles noted below are inferred from names and types - confirm against the users.
    4066             : struct obmc_inter_pred_ctxt {
    4067             :     uint8_t **adjacent;              // 8-bit plane pointers (presumably the neighbor predictions - confirm)
    4068             :     uint16_t **adjacent_hbd;         // 16-bit counterpart of `adjacent`
    4069             :     int *adjacent_stride;            // presumably one stride per entry of adjacent / adjacent_hbd - confirm
    4070             :     uint8_t *final_dst_ptr_y;        // Y destination, 8-bit
    4071             :     uint16_t *final_dst_ptr_y_hbd;   // Y destination, 16-bit
    4072             :     uint16_t final_dst_stride_y;
    4073             :     uint8_t *final_dst_ptr_u;        // U destination, 8-bit
    4074             :     uint16_t *final_dst_ptr_u_hbd;   // U destination, 16-bit
    4075             :     uint16_t final_dst_stride_u;
    4076             :     uint8_t *final_dst_ptr_v;        // V destination, 8-bit
    4077             :     uint16_t *final_dst_ptr_v_hbd;   // V destination, 16-bit
    4078             :     uint16_t final_dst_stride_v;
    4079             :     EbBool   perform_chroma;         // presumably gates the U/V work - confirm
    4080             : };
    4081             : // obmc_mask_N[overlap_position]: mask table for an overlap of N samples; av1_get_obmc_mask(N) returns these tables.
    4082             : static const uint8_t obmc_mask_1[1] = { 64 };
    4083             : DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = { 45, 64 };
    4084             : // Every table is non-decreasing and ends at 64 (presumably the full-weight value of AOM_BLEND_A64 - confirm).
    4085             : DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = { 39, 50, 59, 64 };
    4086             : 
    4087             : static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 };
    4088             : 
    4089             : static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54,
    4090             :                                           56, 58, 60, 61, 64, 64, 64, 64 };
    4091             : 
    4092             : static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44,
    4093             :                                           45, 47, 48, 50, 51, 52, 53, 55,
    4094             :                                           56, 57, 58, 59, 60, 60, 61, 62,
    4095             :                                           64, 64, 64, 64, 64, 64, 64, 64 };
    4096             : 
    4097             : static const uint8_t obmc_mask_64[64] = {
    4098             :   33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
    4099             :   45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
    4100             :   56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
    4101             :   62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    4102             : };
    4103             : // Returns the obmc_mask_N table for length N in {1, 2, 4, 8, 16, 32, 64}; any other length trips assert(0) and, when asserts are compiled out, yields NULL.
    4104    28436900 : const uint8_t *av1_get_obmc_mask(int length) {
    4105    28436900 :     switch (length) {
    4106           0 :     case 1: return obmc_mask_1;
    4107      342460 :     case 2: return obmc_mask_2;
    4108    12491300 :     case 4: return obmc_mask_4;
    4109     9055840 :     case 8: return obmc_mask_8;
    4110     5033190 :     case 16: return obmc_mask_16;
    4111     1534580 :     case 32: return obmc_mask_32;
    4112           0 :     case 64: return obmc_mask_64;
    4113           0 :     default: assert(0); return NULL; // unsupported length: callers must not dereference the result
    4114             :     }
    4115             : }
    4116             : // Blends two 16-bit sample blocks of w x h into dst with AOM_BLEND_A64, using one mask value per column (mask[j]); the same mask row is applied to every line.
    4117             : // Strides are in uint16_t elements (they index the uint16_t pointers directly). bd is only checked by an assert.
    4118           0 : void eb_aom_highbd_blend_a64_hmask_c(uint16_t *dst, uint32_t dst_stride,
    4119             :                                   const uint16_t *src0, uint32_t src0_stride,
    4120             :                                   const uint16_t *src1, uint32_t src1_stride,
    4121             :                                   const uint8_t *mask, int w, int h, int bd) {
    4122             :   (void)bd;
    4123             :   int i, j;
    4124             :   // In-place use (a source aliasing dst) is only accepted when that source's stride equals dst_stride.
    4125           0 :   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
    4126           0 :   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
    4127             : 
    4128           0 :   assert(h >= 1);
    4129           0 :   assert(w >= 1);
    4130           0 :   assert(IS_POWER_OF_TWO(h));
    4131           0 :   assert(IS_POWER_OF_TWO(w));
    4132             : 
    4133           0 :   assert(bd == 8 || bd == 10 || bd == 12);
    4134             :   // mask is read at indices 0..w-1, so it must hold at least w entries.
    4135           0 :   for (i = 0; i < h; ++i) {
    4136           0 :     for (j = 0; j < w; ++j) {
    4137           0 :       dst[i * dst_stride + j] = AOM_BLEND_A64(
    4138             :           mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
    4139             :     }
    4140             :   }
    4141           0 : }
    4142           0 : void eb_aom_highbd_blend_a64_vmask_c(uint16_t *dst, uint32_t dst_stride,
    4143             :                                   const uint16_t *src0, uint32_t src0_stride,
    4144             :                                   const uint16_t *src1, uint32_t src1_stride,
    4145             :                                   const uint8_t *mask, int w, int h, int bd) {
    4146             :   (void)bd;
    4147             :   int i, j;
    4148             : 
    4149           0 :   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
    4150           0 :   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
    4151             : 
    4152           0 :   assert(h >= 1);
    4153           0 :   assert(w >= 1);
    4154           0 :   assert(IS_POWER_OF_TWO(h));
    4155           0 :   assert(IS_POWER_OF_TWO(w));
    4156             : 
    4157           0 :   assert(bd == 8 || bd == 10 || bd == 12);
    4158             : 
    4159           0 :   for (i = 0; i < h; ++i) {
    4160           0 :     const int m = mask[i];
    4161           0 :     for (j = 0; j < w; ++j) {
    4162           0 :       dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
    4163             :                                               src1[i * src1_stride + j]);
    4164             :     }
    4165             :   }
    4166           0 : }
    4167             : 
    4168           0 : static INLINE void build_obmc_inter_pred_above_hbd(
    4169             :     uint8_t is16bit ,MacroBlockD *xd, int rel_mi_col,
    4170             :     uint8_t above_mi_width,
    4171             :     MbModeInfo *above_mi,
    4172             :     void *fun_ctxt,
    4173             :     const int num_planes)
    4174             : {
    4175             :     (void)above_mi;
    4176             :     (void)is16bit;
    4177             :     (void)num_planes;
    4178           0 :     struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
    4179           0 :     const BlockSize bsize = xd->sb_type;
    4180             : 
    4181             : 
    4182           0 :     const int overlap =
    4183           0 :         AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
    4184             : 
    4185           0 :     int32_t tot_planes = (ctxt->perform_chroma ? 3 : 1);
    4186             : 
    4187           0 :     for (int plane = 0; plane < tot_planes; ++plane)
    4188             :     {
    4189           0 :         int subsampling_x = plane > 0 ? 1 : 0;
    4190           0 :         int subsampling_y = plane > 0 ? 1 : 0;
    4191             : 
    4192           0 :         const int bw = (above_mi_width * MI_SIZE) >> subsampling_x;
    4193           0 :         const int bh = overlap >> subsampling_y;
    4194           0 :         const int plane_col = (rel_mi_col * MI_SIZE) >> subsampling_x;
    4195             : 
    4196           0 :         if (av1_skip_u4x4_pred_in_obmc(bsize, 0, subsampling_x, subsampling_y)) continue;
    4197             : 
    4198             : 
    4199           0 :         const int dst_stride = plane == 0 ? ctxt->final_dst_stride_y : plane == 1 ? ctxt->final_dst_stride_u : ctxt->final_dst_stride_v;
    4200           0 :         uint16_t *const dst = plane == 0 ? &ctxt->final_dst_ptr_y_hbd[plane_col] : plane == 1 ? &ctxt->final_dst_ptr_u_hbd[plane_col] : &ctxt->final_dst_ptr_v_hbd[plane_col];
    4201             : 
    4202           0 :         const int tmp_stride = ctxt->adjacent_stride[plane];
    4203           0 :         const uint16_t *const tmp = &ctxt->adjacent_hbd[plane][plane_col];
    4204           0 :         const uint8_t *const mask = av1_get_obmc_mask(bh);
    4205             : 
    4206           0 :             eb_aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
    4207             :                 tmp_stride, mask, bw, bh, 10);
    4208             : 
    4209             :     }
    4210           0 : }
    4211             : 
    4212             : 
    4213    12923900 : static INLINE void build_obmc_inter_pred_above(
    4214             :     uint8_t is16bit ,
    4215             :     MacroBlockD *xd,
    4216             :     int rel_mi_col,
    4217             :     uint8_t above_mi_width,
    4218             :     MbModeInfo *above_mi,
    4219             :     void *fun_ctxt,
    4220             :     const int num_planes)
    4221             : {
    4222             :     (void)above_mi;
    4223             :     (void)is16bit;
    4224             :     (void)num_planes;
    4225    12923900 :     struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
    4226    12923900 :     const BlockSize bsize = xd->sb_type;
    4227             : 
    4228    12923900 :     const int overlap =
    4229    12923900 :         AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
    4230             : 
    4231    12923900 :     int32_t tot_planes = (ctxt->perform_chroma ? 3 : 1);
    4232             : 
    4233    26544800 :     for (int plane = 0; plane < tot_planes; ++plane)
    4234             :     {
    4235    13621400 :         int subsampling_x = plane > 0 ? 1 : 0;
    4236    13621400 :         int subsampling_y = plane > 0 ? 1 : 0;
    4237             : 
    4238    13621400 :         const int bw = (above_mi_width * MI_SIZE) >> subsampling_x;
    4239    13621400 :         const int bh = overlap >> subsampling_y;
    4240    13621400 :         const int plane_col = (rel_mi_col * MI_SIZE) >> subsampling_x;
    4241             : 
    4242    13621400 :         if (av1_skip_u4x4_pred_in_obmc(bsize, 0, subsampling_x, subsampling_y)) continue;
    4243             : 
    4244             : 
    4245    13325500 :         const int dst_stride = plane == 0 ? ctxt->final_dst_stride_y : plane == 1 ? ctxt->final_dst_stride_u : ctxt->final_dst_stride_v;
    4246    13325500 :         uint8_t *const dst = plane == 0 ? &ctxt->final_dst_ptr_y[plane_col] : plane == 1 ? &ctxt->final_dst_ptr_u[plane_col] : &ctxt->final_dst_ptr_v[plane_col];
    4247             : 
    4248    13325500 :         const int tmp_stride = ctxt->adjacent_stride[plane];
    4249    13325500 :         const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col];
    4250    13325500 :         const uint8_t *const mask = av1_get_obmc_mask(bh);
    4251             : 
    4252    13325800 :         aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
    4253             :             mask, bw, bh);
    4254             :     }
    4255    12923400 : }
    4256             : 
    4257           0 : static INLINE void build_obmc_inter_pred_left_hbd(
    4258             :     uint8_t         is16bit ,
    4259             :     MacroBlockD     *xd,
    4260             :     int             rel_mi_row,
    4261             :     uint8_t         left_mi_height,
    4262             :     MbModeInfo      *left_mi,
    4263             :     void            *fun_ctxt,
    4264             :     const int       num_planes)
    4265             : {
    4266             :     (void)left_mi;
    4267             :     (void)is16bit;
    4268             :     (void)num_planes;
    4269           0 :     struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
    4270           0 :     const BlockSize bsize = xd->sb_type;
    4271           0 :     const int overlap =  AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
    4272             : 
    4273           0 :     int32_t tot_planes = (ctxt->perform_chroma ? 3 : 1);
    4274             : 
    4275           0 :     for (int plane = 0; plane < tot_planes ; ++plane)
    4276             :     {
    4277           0 :         int subsampling_x = plane > 0 ? 1 : 0;
    4278           0 :         int subsampling_y = plane > 0 ? 1 : 0;
    4279             : 
    4280             :         //const struct macroblockd_plane *pd = &xd->plane[plane];
    4281           0 :         const int bw = overlap >> subsampling_x;
    4282           0 :         const int bh = (left_mi_height * MI_SIZE) >> subsampling_y;
    4283           0 :         const int plane_row = (rel_mi_row * MI_SIZE) >> subsampling_y;
    4284             : 
    4285           0 :         if (av1_skip_u4x4_pred_in_obmc(bsize,1,subsampling_x, subsampling_y)) continue;
    4286             : 
    4287           0 :         const int dst_stride = plane == 0  ? ctxt->final_dst_stride_y                       : plane == 1 ? ctxt->final_dst_stride_u : ctxt->final_dst_stride_v;
    4288           0 :         uint16_t *const dst   = plane == 0  ? &ctxt->final_dst_ptr_y_hbd[plane_row * dst_stride] : plane == 1 ? &ctxt->final_dst_ptr_u_hbd[plane_row * dst_stride] : &ctxt->final_dst_ptr_v_hbd[plane_row * dst_stride];
    4289           0 :         const int tmp_stride = ctxt->adjacent_stride[plane];
    4290           0 :         const uint16_t *const tmp = &ctxt->adjacent_hbd[plane][plane_row * tmp_stride];
    4291           0 :         const uint8_t *const mask = av1_get_obmc_mask(bw);
    4292             : 
    4293             : 
    4294           0 :             eb_aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
    4295             :                 tmp_stride, mask, bw, bh, 10);
    4296             : 
    4297             :     }
    4298           0 : }
    4299             : 
    4300    13596700 : static INLINE void build_obmc_inter_pred_left(
    4301             :     uint8_t         is16bit ,
    4302             :     MacroBlockD     *xd,
    4303             :     int             rel_mi_row,
    4304             :     uint8_t         left_mi_height,
    4305             :     MbModeInfo      *left_mi,
    4306             :     void            *fun_ctxt,
    4307             :     const int       num_planes)
    4308             : {
    4309             :     (void)left_mi;
    4310             :     (void)is16bit;
    4311             :     (void)num_planes;
    4312    13596700 :     struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
    4313    13596700 :     const BlockSize bsize = xd->sb_type;
    4314    13596700 :     const int overlap =  AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
    4315             : 
    4316    13596700 :     int32_t tot_planes = (ctxt->perform_chroma ? 3 : 1);
    4317             : 
    4318    27909000 :     for (int plane = 0; plane < tot_planes ; ++plane)
    4319             :     {
    4320    14313300 :         int subsampling_x = plane > 0 ? 1 : 0;
    4321    14313300 :         int subsampling_y = plane > 0 ? 1 : 0;
    4322             : 
    4323             :         //const struct macroblockd_plane *pd = &xd->plane[plane];
    4324    14313300 :         const int bw = overlap >> subsampling_x;
    4325    14313300 :         const int bh = (left_mi_height * MI_SIZE) >> subsampling_y;
    4326    14313300 :         const int plane_row = (rel_mi_row * MI_SIZE) >> subsampling_y;
    4327             : 
    4328    14313300 :         if (av1_skip_u4x4_pred_in_obmc(bsize,1,subsampling_x, subsampling_y)) continue;
    4329             : 
    4330    14312800 :         const int dst_stride = plane == 0  ? ctxt->final_dst_stride_y                       : plane == 1 ? ctxt->final_dst_stride_u : ctxt->final_dst_stride_v;
    4331    14312800 :         uint8_t *const dst   = plane == 0  ? &ctxt->final_dst_ptr_y[plane_row * dst_stride] : plane == 1 ? &ctxt->final_dst_ptr_u[plane_row * dst_stride] : &ctxt->final_dst_ptr_v[plane_row * dst_stride];
    4332    14312800 :         const int tmp_stride = ctxt->adjacent_stride[plane];
    4333    14312800 :         const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride];
    4334    14312800 :         const uint8_t *const mask = av1_get_obmc_mask(bw);
    4335             : 
    4336             : 
    4337    14312600 :             aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
    4338             :                 mask, bw, bh);
    4339             :     }
    4340    13595800 : }
    4341             : 
    4342             : 
    4343             : // This function combines motion compensated predictions that are generated by
    4344             : // top/left neighboring blocks' inter predictors with the regular inter
    4345             : // prediction. The regular prediction (bmc) must already be stored in the
    4346             : // 16-bit final_dst_ptr_y/u/v buffers; the blend is done in place on them.
    4347           0 : void av1_build_obmc_inter_prediction_hbd(
    4348             :     uint16_t     *final_dst_ptr_y,
    4349             :     uint16_t     final_dst_stride_y,
    4350             :     uint16_t     *final_dst_ptr_u,
    4351             :     uint16_t     final_dst_stride_u,
    4352             :     uint16_t     *final_dst_ptr_v,
    4353             :     uint16_t     final_dst_stride_v,
    4354             :     EbBool      perform_chroma,
    4355             :     BlockSize   bsize,
    4356             :     PictureControlSet  *picture_control_set_ptr,
    4357             :     MacroBlockD    *xd,
    4358             :     int          mi_row,
    4359             :     int          mi_col,
    4360             :     uint16_t     *above[MAX_MB_PLANE],
    4361             :     int          above_stride[MAX_MB_PLANE],
    4362             :     uint16_t     *left[MAX_MB_PLANE],
    4363             :     int        left_stride[MAX_MB_PLANE])
    4364             : {
    4365           0 :     uint8_t is16bit = 1;
    4366             :     // handle above row
    4367             :     struct obmc_inter_pred_ctxt ctxt_above ;
    4368             : 
    4369           0 :     ctxt_above.adjacent =(uint8_t**)above;
    4370           0 :     ctxt_above.adjacent_hbd = above;
    4371           0 :     ctxt_above.adjacent_stride = above_stride;
    4372             : 
    4373           0 :     ctxt_above.final_dst_ptr_y = (uint8_t*)final_dst_ptr_y;
    4374           0 :     ctxt_above.final_dst_ptr_y_hbd = final_dst_ptr_y;
    4375           0 :     ctxt_above.final_dst_stride_y = final_dst_stride_y;
    4376           0 :     ctxt_above.final_dst_ptr_u = (uint8_t*)final_dst_ptr_u;
    4377           0 :     ctxt_above.final_dst_ptr_u_hbd = final_dst_ptr_u;
    4378           0 :     ctxt_above.final_dst_stride_u = final_dst_stride_u;
    4379           0 :     ctxt_above.final_dst_ptr_v = (uint8_t*)final_dst_ptr_v;
    4380           0 :     ctxt_above.final_dst_ptr_v_hbd = final_dst_ptr_v;
    4381           0 :     ctxt_above.final_dst_stride_v = final_dst_stride_v;
    4382           0 :     ctxt_above.perform_chroma =  perform_chroma;
    4383             : 
    4384           0 :     foreach_overlappable_nb_above(
    4385             :         is16bit,
    4386           0 :         picture_control_set_ptr->parent_pcs_ptr->av1_cm,
    4387             :         xd,
    4388             :         mi_col,
    4389           0 :         max_neighbor_obmc[mi_size_wide_log2[bsize]],
    4390             :         build_obmc_inter_pred_above_hbd,
    4391             :         &ctxt_above);
    4392             : 
    4393             :     // handle left column
    4394             :     struct obmc_inter_pred_ctxt ctxt_left ;
    4395             : 
    4396           0 :     ctxt_left.adjacent = (uint8_t**)left;
    4397           0 :     ctxt_left.adjacent_hbd = left;
    4398           0 :     ctxt_left.adjacent_stride = left_stride;
    4399             : 
    4400           0 :     ctxt_left.final_dst_ptr_y = (uint8_t*)final_dst_ptr_y;
    4401           0 :     ctxt_left.final_dst_ptr_y_hbd = final_dst_ptr_y;
    4402           0 :     ctxt_left.final_dst_stride_y = final_dst_stride_y;
    4403           0 :     ctxt_left.final_dst_ptr_u = (uint8_t*)final_dst_ptr_u;
    4404           0 :     ctxt_left.final_dst_ptr_u_hbd = final_dst_ptr_u;
    4405           0 :     ctxt_left.final_dst_stride_u = final_dst_stride_u;
    4406           0 :     ctxt_left.final_dst_ptr_v = (uint8_t*)final_dst_ptr_v;
    4407           0 :     ctxt_left.final_dst_ptr_v_hbd = final_dst_ptr_v;
    4408           0 :     ctxt_left.final_dst_stride_v = final_dst_stride_v;
    4409           0 :     ctxt_left.perform_chroma =  perform_chroma;
    4410             : 
    4411           0 :     foreach_overlappable_nb_left(
    4412             :         is16bit,
    4413           0 :         picture_control_set_ptr->parent_pcs_ptr->av1_cm,
    4414             :         xd,
    4415             :         mi_row,
    4416           0 :         max_neighbor_obmc[mi_size_high_log2[bsize]],
    4417             :         build_obmc_inter_pred_left_hbd,
    4418             :         &ctxt_left);
    4419           0 : }
    4420             : 
    4421             : // This function combines motion compensated predictions that are generated by
    4422             : // top/left neighboring blocks' inter predictors with the regular inter
    4423             : // prediction. The regular prediction (bmc) must already be stored in the
    4424             : // 8-bit final_dst_ptr_y/u/v buffers; the blend is done in place on them.
    4425    13103800 : void av1_build_obmc_inter_prediction(
    4426             :     uint8_t     *final_dst_ptr_y,
    4427             :     uint16_t     final_dst_stride_y,
    4428             :     uint8_t     *final_dst_ptr_u,
    4429             :     uint16_t     final_dst_stride_u,
    4430             :     uint8_t     *final_dst_ptr_v,
    4431             :     uint16_t     final_dst_stride_v,
    4432             :     EbBool      perform_chroma,
    4433             :     BlockSize   bsize,
    4434             :     PictureControlSet  *picture_control_set_ptr,
    4435             :     MacroBlockD    *xd,
    4436             :     int          mi_row,
    4437             :     int          mi_col,
    4438             :     uint8_t     *above[MAX_MB_PLANE],
    4439             :     int          above_stride[MAX_MB_PLANE],
    4440             :     uint8_t     *left[MAX_MB_PLANE],
    4441             :     int        left_stride[MAX_MB_PLANE])
    4442             : {
    4443    13103800 :     uint8_t is16bit = 0;
    4444             :     // handle above row
    4445             :     struct obmc_inter_pred_ctxt ctxt_above ;
    4446             : 
    4447    13103800 :     ctxt_above.adjacent = above;
    4448    13103800 :     ctxt_above.adjacent_hbd = (uint16_t**)above;
    4449    13103800 :     ctxt_above.adjacent_stride = above_stride;
    4450             : 
    4451    13103800 :     ctxt_above.final_dst_ptr_y = final_dst_ptr_y;
    4452    13103800 :     ctxt_above.final_dst_ptr_y_hbd = (uint16_t*)final_dst_ptr_y;
    4453    13103800 :     ctxt_above.final_dst_stride_y = final_dst_stride_y;
    4454    13103800 :     ctxt_above.final_dst_ptr_u = final_dst_ptr_u;
    4455    13103800 :     ctxt_above.final_dst_ptr_u_hbd = (uint16_t*)final_dst_ptr_u;
    4456    13103800 :     ctxt_above.final_dst_stride_u = final_dst_stride_u;
    4457    13103800 :     ctxt_above.final_dst_ptr_v = final_dst_ptr_v;
    4458    13103800 :     ctxt_above.final_dst_ptr_v_hbd = (uint16_t*)final_dst_ptr_v;
    4459    13103800 :     ctxt_above.final_dst_stride_v = final_dst_stride_v;
    4460    13103800 :     ctxt_above.perform_chroma =  perform_chroma;
    4461             : 
    4462    13103800 :     foreach_overlappable_nb_above(
    4463             :         is16bit,
    4464    13103800 :         picture_control_set_ptr->parent_pcs_ptr->av1_cm,
    4465             :         xd,
    4466             :         mi_col,
    4467    13103800 :         max_neighbor_obmc[mi_size_wide_log2[bsize]],
    4468             :         build_obmc_inter_pred_above,
    4469             :         &ctxt_above);
    4470             : 
    4471             :     // handle left column
    4472             :     struct obmc_inter_pred_ctxt ctxt_left;
    4473             : 
    4474    13102300 :     ctxt_left.adjacent = left;
    4475    13102300 :     ctxt_left.adjacent_hbd = (uint16_t**)left;
    4476    13102300 :     ctxt_left.adjacent_stride = left_stride;
    4477             : 
    4478    13102300 :     ctxt_left.final_dst_ptr_y = final_dst_ptr_y;
    4479    13102300 :     ctxt_left.final_dst_ptr_y_hbd = (uint16_t*)final_dst_ptr_y;
    4480    13102300 :     ctxt_left.final_dst_stride_y = final_dst_stride_y;
    4481    13102300 :     ctxt_left.final_dst_ptr_u = final_dst_ptr_u;
    4482    13102300 :     ctxt_left.final_dst_ptr_u_hbd = (uint16_t*)final_dst_ptr_u;
    4483    13102300 :     ctxt_left.final_dst_stride_u = final_dst_stride_u;
    4484    13102300 :     ctxt_left.final_dst_ptr_v = final_dst_ptr_v;
    4485    13102300 :     ctxt_left.final_dst_ptr_v_hbd = (uint16_t*)final_dst_ptr_v;
    4486    13102300 :     ctxt_left.final_dst_stride_v = final_dst_stride_v;
    4487    13102300 :     ctxt_left.perform_chroma =  perform_chroma;
    4488             : 
    4489    13102300 :     foreach_overlappable_nb_left(
    4490             :         is16bit,
    4491    13102300 :         picture_control_set_ptr->parent_pcs_ptr->av1_cm,
    4492             :         xd,
    4493             :         mi_row,
    4494    13102300 :         max_neighbor_obmc[mi_size_high_log2[bsize]],
    4495             :         build_obmc_inter_pred_left,
    4496             :         &ctxt_left);
    4497    13103000 : }
    4498             : struct calc_target_weighted_pred_ctxt {
    4499             :     int32_t *mask_buf;
    4500             :     int32_t *wsrc_buf;
    4501             :     const uint8_t *tmp;
    4502             :     int tmp_stride;
    4503             :     int overlap;
    4504             : };
    4505             : 
    4506      395029 : static INLINE void calc_target_weighted_pred_above(
    4507             :     uint8_t         is16bit,
    4508             :     MacroBlockD     *xd,
    4509             :     int             rel_mi_col,
    4510             :     uint8_t         nb_mi_width,
    4511             :     MbModeInfo      *nb_mi,
    4512             :     void            *fun_ctxt,
    4513             :     const int       num_planes)
    4514             : {
    4515             :     (void)nb_mi;
    4516             :     (void)num_planes;
    4517             :     (void)is16bit;
    4518      395029 :     struct calc_target_weighted_pred_ctxt *ctxt =
    4519             :         (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
    4520             : 
    4521      395029 :     const int bw = xd->n4_w << MI_SIZE_LOG2;
    4522      395029 :     const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
    4523             : 
    4524      395031 :     int32_t *wsrc = ctxt->wsrc_buf + (rel_mi_col * MI_SIZE);
    4525      395031 :     int32_t *mask = ctxt->mask_buf + (rel_mi_col * MI_SIZE);
    4526      395031 :     const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
    4527             : 
    4528             :     {
    4529     3971690 :         for (int row = 0; row < ctxt->overlap; ++row) {
    4530     3576660 :             const uint8_t m0 = mask1d[row];
    4531     3576660 :             const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
    4532    71445200 :             for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
    4533    67868600 :                 wsrc[col] = m1 * tmp[col];
    4534    67868600 :                 mask[col] = m0;
    4535             :             }
    4536     3576660 :             wsrc += bw;
    4537     3576660 :             mask += bw;
    4538     3576660 :             tmp += ctxt->tmp_stride;
    4539             :         }
    4540             :     }
    4541      395031 : }
    4542             : 
    4543      415741 : static INLINE void calc_target_weighted_pred_left(
    4544             :     uint8_t         is16bit,
    4545             :     MacroBlockD     *xd,
    4546             :     int             rel_mi_row,
    4547             :     uint8_t         nb_mi_height,
    4548             :     MbModeInfo      *nb_mi,
    4549             :     void            *fun_ctxt,
    4550             :     const int       num_planes)
    4551             : {
    4552             :     (void)nb_mi;
    4553             :     (void)num_planes;
    4554             :     (void)is16bit;
    4555             : 
    4556      415741 :     struct calc_target_weighted_pred_ctxt *ctxt =
    4557             :         (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
    4558             : 
    4559      415741 :     const int bw = xd->n4_w << MI_SIZE_LOG2;
    4560      415741 :     const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
    4561             : 
    4562      415740 :     int32_t *wsrc = ctxt->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
    4563      415740 :     int32_t *mask = ctxt->mask_buf + (rel_mi_row * MI_SIZE * bw);
    4564      415740 :     const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
    4565             : 
    4566             :     {
    4567     7434320 :         for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
    4568    77138500 :             for (int col = 0; col < ctxt->overlap; ++col) {
    4569    70120000 :                 const uint8_t m0 = mask1d[col];
    4570    70120000 :                 const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
    4571    70120000 :                 wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
    4572    70120000 :                     (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
    4573    70120000 :                 mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
    4574             :             }
    4575     7018580 :             wsrc += bw;
    4576     7018580 :             mask += bw;
    4577     7018580 :             tmp += ctxt->tmp_stride;
    4578             :         }
    4579             :     }
    4580             : 
    4581      415740 : }
    4582             : // This function has a structure similar to av1_build_obmc_inter_prediction
    4583             : //
    4584             : // The OBMC predictor is computed as:
    4585             : //
    4586             : //  PObmc(x,y) =
    4587             : //    AOM_BLEND_A64(Mh(x),
    4588             : //                  AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
    4589             : //                  PLeft(x, y))
    4590             : //
    4591             : // Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
    4592             : // rounding, this can be written as:
    4593             : //
    4594             : //  AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) =
    4595             : //    Mh(x) * Mv(y) * P(x,y) +
    4596             : //      Mh(x) * Cv(y) * Pabove(x,y) +
    4597             : //      AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
    4598             : //
    4599             : // Where :
    4600             : //
    4601             : //  Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
    4602             : //  Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
    4603             : //
    4604             : // This function computes 'wsrc' and 'mask' as:
    4605             : //
    4606             : //  wsrc(x, y) =
    4607             : //    AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
    4608             : //      (Mh(x) * Cv(y) * Pabove(x,y) +
    4609             : //       AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y))
    4610             : //
    4611             : //  mask(x, y) = Mh(x) * Mv(y)
    4612             : //
    4613             : // These can then be used to efficiently approximate the error for any
    4614             : // predictor P in the context of the provided neighbouring predictors by
    4615             : // computing:
    4616             : //
    4617             : //  error(x, y) =
    4618             : //    wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
    4619             : //
    4620      399602 : static void calc_target_weighted_pred(
    4621             :     PictureControlSet       *picture_control_set_ptr,
    4622             :     ModeDecisionContext     *context_ptr,
    4623             :     const AV1_COMMON        *cm,
    4624             :     const MacroBlockD       *xd,
    4625             :     int                     mi_row,
    4626             :     int                     mi_col,
    4627             :     const uint8_t           *above,
    4628             :     int                     above_stride,
    4629             :     const uint8_t           *left,
    4630             :     int                     left_stride)
    4631             : {
    4632      399602 :     uint8_t is16bit =0;
    4633      399602 :     const BlockSize bsize = context_ptr->blk_geom->bsize;
    4634      399602 :     const int bw = xd->n4_w << MI_SIZE_LOG2;
    4635      399602 :     const int bh = xd->n4_h << MI_SIZE_LOG2;
    4636      399602 :     int32_t *mask_buf = context_ptr->mask_buf;
    4637      399602 :     int32_t *wsrc_buf = context_ptr->wsrc_buf;
    4638             : 
    4639      399602 :     const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
    4640             : 
    4641      399602 :     memset(wsrc_buf,0, sizeof(int32_t)*bw * bh);
    4642   150056000 :     for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
    4643             : 
    4644             :     // handle above row
    4645      399602 :     if (xd->up_available) {
    4646      378202 :         const int overlap =
    4647      378202 :             AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
    4648      378202 :         struct calc_target_weighted_pred_ctxt ctxt = {
    4649             :             mask_buf,
    4650             :             wsrc_buf,
    4651             :             above,
    4652             :             above_stride,
    4653             :             overlap };
    4654             : 
    4655      378202 :         foreach_overlappable_nb_above(
    4656             :             is16bit,
    4657             :             cm,
    4658             :             (MacroBlockD *)xd,
    4659             :             mi_col,
    4660      378202 :             max_neighbor_obmc[mi_size_wide_log2[bsize]],
    4661             :             calc_target_weighted_pred_above,
    4662             :             &ctxt);
    4663             :     }
    4664             : 
    4665   150037000 :     for (int i = 0; i < bw * bh; ++i) {
    4666   149638000 :         wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
    4667   149638000 :         mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
    4668             :     }
    4669             : 
    4670             :     // handle left column
    4671      399601 :     if (xd->left_available) {
    4672      383089 :         const int overlap =
    4673      383089 :             AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
    4674      383089 :         struct calc_target_weighted_pred_ctxt ctxt = { mask_buf,
    4675             :                     wsrc_buf, left, left_stride,
    4676             :                                                        overlap };
    4677             : 
    4678      383089 :         foreach_overlappable_nb_left(
    4679             :             is16bit,
    4680             :             cm,
    4681             :             (MacroBlockD *)xd,
    4682             :             mi_row,
    4683      383089 :             max_neighbor_obmc[mi_size_high_log2[bsize]],
    4684             :             calc_target_weighted_pred_left,
    4685             :             &ctxt);
    4686             :     }
    4687             : 
    4688      399602 :     EbPictureBufferDesc   *src_pic = picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
    4689      399602 :     const uint8_t         *src = src_pic->buffer_y + (context_ptr->cu_origin_x + src_pic->origin_x) + (context_ptr->cu_origin_y + src_pic->origin_y) * src_pic->stride_y;
    4690             : 
    4691     7744220 :     for (int row = 0; row < bh; ++row) {
    4692   156973000 :         for (int col = 0; col < bw; ++col) {
    4693   149629000 :             wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
    4694             :         }
    4695     7344620 :         wsrc_buf += bw;
    4696     7344620 :         src += src_pic->stride_y;
    4697             :     }
    4698             : 
    4699      399602 : }
    4700             : /* Perform all neighbor (above and left) predictions and get the weighted source to be used for OBMC
    4701             : motion refinement. Predictions are written into context_ptr->obmc_buff_0 (above) and context_ptr->obmc_buff_1 (left); their luma planes are then passed to calc_target_weighted_pred().
    4702             : */
    4703      399589 : void precompute_obmc_data(
    4704             :     PictureControlSet            *picture_control_set_ptr,
    4705             :     ModeDecisionContext          *context_ptr)
    4706             : {
    4707             :     // Scratch buffers taken from the mode-decision context: [0] receives the above-neighbor prediction, [1] the left-neighbor prediction.
    4708             :     uint8_t * tmp_obmc_bufs[2];
    4709             : 
    4710      399589 :     tmp_obmc_bufs[0] = context_ptr->obmc_buff_0;
    4711      399589 :     tmp_obmc_bufs[1] = context_ptr->obmc_buff_1;
    4712             : 
    4713             :     // Per-plane destination pointers and strides for the two prediction sets (1 = above, 2 = left).
    4714             :     uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
    4715      399589 :     int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
    4716      399589 :     int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
    4717             :     // Each scratch buffer holds three planes back to back, MAX_SB_SQUARE elements apart, all using a stride of MAX_SB_SIZE.
    4718             :     {
    4719      399589 :         dst_buf1[0] = tmp_obmc_bufs[0];
    4720      399589 :         dst_buf1[1] = tmp_obmc_bufs[0] + MAX_SB_SQUARE;
    4721      399589 :         dst_buf1[2] = tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
    4722      399589 :         dst_buf2[0] = tmp_obmc_bufs[1];
    4723      399589 :         dst_buf2[1] = tmp_obmc_bufs[1] + MAX_SB_SQUARE;
    4724      399589 :         dst_buf2[2] = tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
    4725             :     }
    4726             :     // Block origin shifted right by 2 -- presumably pixels to 4-pixel mode-info units; TODO confirm against MI_SIZE_LOG2.
    4727      399589 :     int mi_row = context_ptr->cu_origin_y >> 2;
    4728      399589 :     int mi_col = context_ptr->cu_origin_x >> 2;
    4729             :     // Build the prediction from the above neighbors into dst_buf1. NOTE(review): the meaning of the leading argument 1 is not visible from here.
    4730      399589 :     build_prediction_by_above_preds(
    4731             :         1,
    4732      399589 :         context_ptr->blk_geom->bsize, picture_control_set_ptr, context_ptr->cu_ptr->av1xd, mi_row, mi_col, dst_buf1,
    4733             :         dst_stride1);
    4734             :     // Build the prediction from the left neighbors into dst_buf2 (same leading argument as above).
    4735      399606 :     build_prediction_by_left_preds(
    4736             :         1,
    4737      399606 :         context_ptr->blk_geom->bsize, picture_control_set_ptr, context_ptr->cu_ptr->av1xd, mi_row, mi_col, dst_buf2,
    4738             :         dst_stride2);
    4739             :     // Only plane 0 of each prediction (and its stride) is handed on; the chroma planes built above are not used in this function.
    4740             :     // NOTE(review): calc_target_weighted_pred() appears to fill context_ptr->wsrc_buf and context_ptr->mask_buf -- confirm against its definition.
    4741      399602 :     calc_target_weighted_pred(
    4742             :         picture_control_set_ptr,
    4743             :         context_ptr,
    4744      399602 :         picture_control_set_ptr->parent_pcs_ptr->av1_cm, context_ptr->cu_ptr->av1xd, mi_row, mi_col, dst_buf1[0],
    4745      399602 :         dst_stride1[0] , dst_buf2[0]  ,
    4746             :         dst_stride2[0] );
    4747             : 
    4748      399607 : }
    4749             : #endif
    4750   328108000 : EbErrorType av1_inter_prediction(
    4751             :     PictureControlSet              *picture_control_set_ptr,
    4752             :     uint32_t                        interp_filters,
    4753             :     CodingUnit                     *cu_ptr,
    4754             :     uint8_t                         ref_frame_type,
    4755             :     MvUnit                         *mv_unit,
    4756             :     uint8_t                         use_intrabc,
    4757             : #if OBMC_FLAG
    4758             :     MotionMode                      motion_mode,
    4759             :     uint8_t                         use_precomputed_obmc,
    4760             :     struct ModeDecisionContext     *md_context,
    4761             : #endif
    4762             :     uint8_t                         compound_idx,
    4763             :     InterInterCompoundData         *interinter_comp,
    4764             : #if II_COMP_FLAG
    4765             :     TileInfo                       * tile,
    4766             :     NeighborArrayUnit              *luma_recon_neighbor_array,
    4767             :     NeighborArrayUnit              *cb_recon_neighbor_array ,
    4768             :     NeighborArrayUnit              *cr_recon_neighbor_array ,
    4769             :     uint8_t                         is_interintra_used ,
    4770             :     INTERINTRA_MODE                 interintra_mode,
    4771             :     uint8_t                         use_wedge_interintra,
    4772             :     int32_t                         interintra_wedge_index,
    4773             : #endif
    4774             :     uint16_t                        pu_origin_x,
    4775             :     uint16_t                        pu_origin_y,
    4776             :     uint8_t                         bwidth,
    4777             :     uint8_t                         bheight,
    4778             :     EbPictureBufferDesc             *ref_pic_list0,
    4779             :     EbPictureBufferDesc             *ref_pic_list1,
    4780             :     EbPictureBufferDesc             *prediction_ptr,
    4781             :     uint16_t                        dst_origin_x,
    4782             :     uint16_t                        dst_origin_y,
    4783             :     EbBool                          perform_chroma,
    4784             :     uint8_t                         bit_depth)
    4785             : {
    4786             : 
    4787   328108000 :     EbErrorType  return_error = EB_ErrorNone;
    4788   328108000 :     uint8_t         is_compound = (mv_unit->pred_direction == BI_PRED) ? 1 : 0;
    4789             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstY[128 * 128]);//move this to context if stack does not hold.
    4790             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstCb[64 * 64]);
    4791             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstCr[64 * 64]);
    4792             : 
    4793             :     MV  mv, mv_q4;
    4794             : 
    4795             :     int32_t subpel_x, subpel_y;
    4796             :     uint8_t * src_ptr;
    4797             :     uint8_t * dst_ptr;
    4798             :     int32_t src_stride;
    4799             :     int32_t dst_stride;
    4800             :     ConvolveParams conv_params;
    4801             : 
    4802             :     InterpFilterParams filter_params_x, filter_params_y;
    4803             : 
    4804   328108000 :     const BlockGeom * blk_geom = get_blk_geom_mds(cu_ptr->mds_idx);
    4805             : 
    4806             : #if OBMC_FLAG
    4807   327811000 :     if (motion_mode == OBMC_CAUSAL) {
    4808    13100600 :         assert(is_compound == 0);
    4809    13100600 :         assert(blk_geom->bwidth > 4 && blk_geom->bheight > 4);
    4810             :     }
    4811             : #endif
    4812             :     //special treatment for chroma in 4XN/NX4 blocks
    4813             :     //if one of the neighbour blocks of the parent square is intra the chroma prediction will follow the normal path using the luma MV of the current nsq block which is the latest sub8x8.
    4814             :     //for this case: only uniPred is allowed.
    4815             : 
    4816   327811000 :     int32_t sub8x8_inter = 0;
    4817   327811000 :     if(perform_chroma && (blk_geom->has_uv && (blk_geom->bwidth == 4 || blk_geom->bheight == 4)))
    4818             : 
    4819             :     {
    4820             :         //CHKN setup input param
    4821             : 
    4822      235508 :         int32_t bw = blk_geom->bwidth_uv;
    4823      235508 :         int32_t bh = blk_geom->bheight_uv;
    4824             :         UNUSED(bw);
    4825             :         UNUSED(bh);
    4826             : 
    4827      235508 :         uint32_t mi_x = pu_origin_x;       //these are luma picture wise
    4828      235508 :         uint32_t mi_y = pu_origin_y;
    4829             : 
    4830      235508 :         MacroBlockD  *xd = cu_ptr->av1xd;
    4831      235508 :         xd->mi_stride = picture_control_set_ptr->mi_stride;
    4832      235508 :         const int32_t offset = (mi_y >> MI_SIZE_LOG2) * xd->mi_stride + (mi_x >> MI_SIZE_LOG2);
    4833      235508 :         xd->mi = picture_control_set_ptr->mi_grid_base + offset;
    4834             : 
    4835             :         //CHKN fill current mi from current block
    4836             :         {
    4837      235508 :             ModeInfo *miPtr = *xd->mi;
    4838             :             uint8_t  miX, miY;
    4839             :             MvReferenceFrame rf[2];
    4840      235508 :             av1_set_ref_frame(rf, ref_frame_type);
    4841      718475 :             for (miY = 0; miY < (blk_geom->bheight >> MI_SIZE_LOG2); miY++) {
    4842     1216110 :                 for (miX = 0; miX < (blk_geom->bwidth >> MI_SIZE_LOG2); miX++) {
    4843      733162 :                     miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.use_intrabc = use_intrabc;
    4844      733162 :                     miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.ref_frame[0] = rf[0];
    4845      733162 :                     if (mv_unit->pred_direction == UNI_PRED_LIST_0) {
    4846      411065 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.col = mv_unit->mv[REF_LIST_0].x;
    4847      411065 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.row = mv_unit->mv[REF_LIST_0].y;
    4848             :                     }
    4849      322097 :                     else if (mv_unit->pred_direction == UNI_PRED_LIST_1) {
    4850      322097 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.col = mv_unit->mv[REF_LIST_1].x;
    4851      322097 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.row = mv_unit->mv[REF_LIST_1].y;
    4852             :                     }
    4853             :                     else {
    4854           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.col = mv_unit->mv[REF_LIST_0].x;
    4855           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.row = mv_unit->mv[REF_LIST_0].y;
    4856           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[1].as_mv.col = mv_unit->mv[REF_LIST_1].x;
    4857           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[1].as_mv.row = mv_unit->mv[REF_LIST_1].y;
    4858             :                     }
    4859             :                 }
    4860             :             }
    4861             :         }
    4862             : 
    4863      235527 :         int32_t build_for_obmc = 0;
    4864             : 
    4865      235527 :         const BlockSize bsize = blk_geom->bsize;//mi->sb_type;
    4866      235527 :         assert(bsize < BlockSizeS_ALL);
    4867      235527 :         const int32_t ss_x = 1;// pd->subsampling_x;
    4868      235527 :         const int32_t ss_y = 1;//pd->subsampling_y;
    4869      351983 :         sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
    4870      116456 :             (block_size_high[bsize] < 8 && ss_y);
    4871             : 
    4872      235527 :         if (use_intrabc) sub8x8_inter = 0;
    4873             : 
    4874             :         // For sub8x8 chroma blocks, we may be covering more than one luma block's
    4875             :         // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
    4876             :         // the top-left corner of the prediction source - the correct top-left corner
    4877             :         // is at (pre_x, pre_y).
    4878      235527 :         const int32_t row_start =
    4879      235527 :             (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
    4880      235527 :         const int32_t col_start =
    4881      235527 :             (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
    4882             : 
    4883      235527 :         const int32_t pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
    4884      235527 :         const int32_t pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
    4885             :         UNUSED(pre_x);
    4886             :         UNUSED(pre_y);
    4887             : 
    4888      235527 :         sub8x8_inter = sub8x8_inter && !build_for_obmc;
    4889      235527 :         if (sub8x8_inter) {
    4890      585434 :             for (int32_t row = row_start; row <= 0 && sub8x8_inter; ++row) {
    4891      822189 :                 for (int32_t col = col_start; col <= 0; ++col) {
    4892      472282 :                     ModeInfo *miPtr = *xd->mi;
    4893      472282 :                     const MbModeInfo *this_mbmi = &miPtr[row * xd->mi_stride + col].mbmi;
    4894             : 
    4895      472282 :                     if (!is_inter_block(&this_mbmi->block_mi)) sub8x8_inter = 0;
    4896             :                 }
    4897             :             }
    4898             :         }
    4899             : 
    4900      235528 :         if (sub8x8_inter) {
    4901             :             // block size
    4902      223550 :             const int32_t b4_w = block_size_wide[bsize] >> ss_x;
    4903      223550 :             const int32_t b4_h = block_size_high[bsize] >> ss_y;
    4904      223550 :             const BlockSize plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
    4905      223550 :             assert(plane_bsize < BlockSizeS_ALL);
    4906      223550 :             const int32_t b8_w = block_size_wide[plane_bsize] >> ss_x;
    4907      223550 :             const int32_t b8_h = block_size_high[plane_bsize] >> ss_y;
    4908             : 
    4909      223550 :             assert(!is_compound);
    4910             : 
    4911      223550 :             int32_t row = row_start;
    4912             :             int32_t src_stride;
    4913      561323 :             for (int32_t y = 0; y < b8_h; y += b4_h) {
    4914      337774 :                 int32_t col = col_start;
    4915      791175 :                 for (int32_t x = 0; x < b8_w; x += b4_w) {
    4916      453402 :                     ModeInfo *miPtr = *xd->mi;
    4917      453402 :                     const MbModeInfo *this_mbmi = &miPtr[row * xd->mi_stride + col].mbmi;
    4918             : 
    4919      453402 :                     int32_t tmp_dst_stride = 8;
    4920             :                     UNUSED(tmp_dst_stride);
    4921      453402 :                     assert(bw < 8 || bh < 8);
    4922             : 
    4923      453402 :                     conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCb, BLOCK_SIZE_64, is_compound, bit_depth);
    4924      453398 :                     conv_params.use_jnt_comp_avg = 0;
    4925      453398 :                     uint8_t ref_idx = get_ref_frame_idx(this_mbmi->block_mi.ref_frame[0]);
    4926      453402 :                     assert(ref_idx < REF_LIST_MAX_DEPTH);
    4927      906804 :                     EbPictureBufferDesc  *ref_pic = this_mbmi->block_mi.ref_frame[0] ==
    4928      187745 :                         LAST_FRAME || this_mbmi->block_mi.ref_frame[0] == LAST2_FRAME || this_mbmi->block_mi.ref_frame[0] == LAST3_FRAME || this_mbmi->block_mi.ref_frame[0] == GOLDEN_FRAME ?
    4929      454314 :                         ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[REF_LIST_0][ref_idx]->object_ptr)->reference_picture :
    4930      186833 :                         ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[REF_LIST_1][ref_idx]->object_ptr)->reference_picture;
    4931      453402 :                     assert(ref_pic != NULL);
    4932      453402 :                     src_ptr = ref_pic->buffer_cb + (ref_pic->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic->stride_cb;
    4933      453402 :                     dst_ptr = prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    4934      453402 :                     src_stride = ref_pic->stride_cb;
    4935      453402 :                     dst_stride = prediction_ptr->stride_cb;
    4936      453402 :                     src_ptr = src_ptr + x + y * ref_pic->stride_cb;
    4937      453402 :                     dst_ptr = dst_ptr + x + y * prediction_ptr->stride_cb;
    4938             : 
    4939      453402 :                     const MV mv = this_mbmi->block_mi.mv[0].as_mv;
    4940      453402 :                     mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    4941      453402 :                     subpel_x = mv_q4.col & SUBPEL_MASK;
    4942      453402 :                     subpel_y = mv_q4.row & SUBPEL_MASK;
    4943      453402 :                     src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    4944             : 
    4945      453402 :                     av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    4946      453402 :                         &filter_params_y, blk_geom->bwidth_uv, blk_geom->bheight_uv);
    4947             : 
    4948      453397 :                     convolve[subpel_x != 0][subpel_y != 0][is_compound](
    4949             :                         src_ptr,
    4950             :                         src_stride,
    4951             :                         dst_ptr,
    4952             :                         dst_stride,
    4953             :                         b4_w,
    4954             :                         b4_h,
    4955             :                         &filter_params_x,
    4956             :                         &filter_params_y,
    4957             :                         subpel_x,
    4958             :                         subpel_y,
    4959             :                         &conv_params);
    4960             : 
    4961             :                     //Cr
    4962      453394 :                     conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCr, BLOCK_SIZE_64, is_compound, bit_depth);
    4963      453396 :                     conv_params.use_jnt_comp_avg = 0;
    4964             : 
    4965      453396 :                     src_ptr = ref_pic->buffer_cr + (ref_pic->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic->stride_cr;
    4966      453396 :                     dst_ptr = prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    4967             : 
    4968      453396 :                     src_stride = ref_pic->stride_cr;
    4969      453396 :                     dst_stride = prediction_ptr->stride_cr;
    4970      453396 :                     src_ptr = src_ptr + x + y * ref_pic->stride_cr;
    4971      453396 :                     dst_ptr = dst_ptr + x + y * prediction_ptr->stride_cr;
    4972             : 
    4973      453396 :                     mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    4974      453401 :                     subpel_x = mv_q4.col & SUBPEL_MASK;
    4975      453401 :                     subpel_y = mv_q4.row & SUBPEL_MASK;
    4976      453401 :                     src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    4977             : 
    4978      453401 :                     av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    4979      453401 :                         &filter_params_y, blk_geom->bwidth_uv, blk_geom->bheight_uv);
    4980             : 
    4981      453398 :                     convolve[subpel_x != 0][subpel_y != 0][is_compound](
    4982             :                         src_ptr,
    4983             :                         src_stride,
    4984             :                         dst_ptr,
    4985             :                         dst_stride,
    4986             :                         b4_w,
    4987             :                         b4_h,
    4988             :                         &filter_params_x,
    4989             :                         &filter_params_y,
    4990             :                         subpel_x,
    4991             :                         subpel_y,
    4992             :                         &conv_params);
    4993             : 
    4994      453401 :                     ++col;
    4995             :                 }
    4996      337773 :                 ++row;
    4997             :             }
    4998             :         }
    4999             :     }
    5000             : 
    5001             :     MvReferenceFrame rf[2];
    5002   327811000 :     av1_set_ref_frame(rf, ref_frame_type);
    5003   327490000 :     if (mv_unit->pred_direction == UNI_PRED_LIST_0 || mv_unit->pred_direction == BI_PRED) {
    5004             :         //List0-Y
    5005   235061000 :         mv.col = mv_unit->mv[REF_LIST_0].x;
    5006   235061000 :         mv.row = mv_unit->mv[REF_LIST_0].y;
    5007   235061000 :         assert(ref_pic_list0 != NULL);
    5008   235061000 :         src_ptr = ref_pic_list0->buffer_y + ref_pic_list0->origin_x + pu_origin_x + (ref_pic_list0->origin_y + pu_origin_y) * ref_pic_list0->stride_y;
    5009   235061000 :         dst_ptr = prediction_ptr->buffer_y + prediction_ptr->origin_x + dst_origin_x + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y;
    5010   235061000 :         src_stride = ref_pic_list0->stride_y;
    5011   235061000 :         dst_stride = prediction_ptr->stride_y;
    5012             : 
    5013   235061000 :         mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, bwidth, bheight, 0, 0);//mv_q4 has 1 extra bit for fractionnal to accomodate chroma when accessing filter coeffs.
    5014   236556000 :         subpel_x = mv_q4.col & SUBPEL_MASK;
    5015   236556000 :         subpel_y = mv_q4.row & SUBPEL_MASK;
    5016   236556000 :         src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5017   236556000 :         conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstY, 128, is_compound, bit_depth);
    5018   236656000 :         av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    5019             :             &filter_params_y, bwidth, bheight);
    5020             : 
    5021   235348000 :         convolve[subpel_x != 0][subpel_y != 0][is_compound](
    5022             :             src_ptr,
    5023             :             src_stride,
    5024             :             dst_ptr,
    5025             :             dst_stride,
    5026             :             bwidth,
    5027             :             bheight,
    5028             :             &filter_params_x,
    5029             :             &filter_params_y,
    5030             :             subpel_x,
    5031             :             subpel_y,
    5032             :             &conv_params);
    5033   235718000 :         if (perform_chroma && blk_geom->has_uv && sub8x8_inter == 0) {
    5034             :             //List0-Cb
    5035     2097010 :             src_ptr = ref_pic_list0->buffer_cb + (ref_pic_list0->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list0->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list0->stride_cb;
    5036     2097010 :             dst_ptr = prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    5037     2097010 :             src_stride = ref_pic_list0->stride_cb;
    5038     2097010 :             dst_stride = prediction_ptr->stride_cb;
    5039             : 
    5040     2097010 :             mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    5041     2097660 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    5042     2097660 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    5043     2097660 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5044     2097660 :             conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCb, 64, is_compound, bit_depth);
    5045             : 
    5046     2097800 :             av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    5047     2097800 :                 &filter_params_y, blk_geom->bwidth_uv, blk_geom->bheight_uv);
    5048             : 
    5049     2097260 :             if (use_intrabc && (subpel_x != 0 || subpel_y != 0))
    5050           0 :                 convolve_2d_for_intrabc(
    5051             :                     (const uint8_t *)src_ptr,
    5052             :                     src_stride,
    5053             :                     dst_ptr,
    5054             :                     dst_stride,
    5055           0 :                     blk_geom->bwidth_uv,
    5056           0 :                     blk_geom->bheight_uv,
    5057             :                     subpel_x,
    5058             :                     subpel_y,
    5059             :                     &conv_params);
    5060             :             else
    5061     2097260 :                 convolve[subpel_x != 0][subpel_y != 0][is_compound](
    5062             :                     src_ptr,
    5063             :                     src_stride,
    5064             :                     dst_ptr,
    5065             :                     dst_stride,
    5066     2097260 :                     blk_geom->bwidth_uv,
    5067     2097260 :                     blk_geom->bheight_uv,
    5068             :                     &filter_params_x,
    5069             :                     &filter_params_y,
    5070             :                     subpel_x,
    5071             :                     subpel_y,
    5072             :                     &conv_params);
    5073             : 
    5074             :             //List0-Cr
    5075     2097050 :             src_ptr = ref_pic_list0->buffer_cr + (ref_pic_list0->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list0->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list0->stride_cr;
    5076     2097050 :             dst_ptr = prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    5077     2097050 :             src_stride = ref_pic_list0->stride_cr;
    5078     2097050 :             dst_stride = prediction_ptr->stride_cr;
    5079             : 
    5080     2097050 :             mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    5081     2097720 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    5082     2097720 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    5083     2097720 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5084     2097720 :             conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCr, 64, is_compound, bit_depth);
    5085             : 
    5086     2097750 :             if (use_intrabc && (subpel_x != 0 || subpel_y != 0))
    5087           0 :                 convolve_2d_for_intrabc(
    5088             :                     (const uint8_t *)src_ptr,
    5089             :                     src_stride,
    5090             :                     dst_ptr,
    5091             :                     dst_stride,
    5092           0 :                     blk_geom->bwidth_uv,
    5093           0 :                     blk_geom->bheight_uv,
    5094             :                     subpel_x,
    5095             :                     subpel_y,
    5096             :                     &conv_params);
    5097             :             else
    5098     2097750 :                 convolve[subpel_x != 0][subpel_y != 0][is_compound](
    5099             :                     src_ptr,
    5100             :                     src_stride,
    5101             :                     dst_ptr,
    5102             :                     dst_stride,
    5103     2097750 :                     blk_geom->bwidth_uv,
    5104     2097750 :                     blk_geom->bheight_uv,
    5105             :                     &filter_params_x,
    5106             :                     &filter_params_y,
    5107             :                     subpel_x,
    5108             :                     subpel_y,
    5109             :                     &conv_params);
    5110             :         }
    5111             :     }
    5112             : 
    5113   328147000 :     if (mv_unit->pred_direction == UNI_PRED_LIST_1 || mv_unit->pred_direction == BI_PRED) {
    5114             :         //List1-Y
    5115   202852000 :         mv.col = mv_unit->mv[REF_LIST_1].x;
    5116   202852000 :         mv.row = mv_unit->mv[REF_LIST_1].y;
    5117   202852000 :         assert(ref_pic_list1 != NULL);
    5118   202852000 :         src_ptr = ref_pic_list1->buffer_y + ref_pic_list1->origin_x + pu_origin_x + (ref_pic_list1->origin_y + pu_origin_y) * ref_pic_list1->stride_y;
    5119   202852000 :         dst_ptr = prediction_ptr->buffer_y + prediction_ptr->origin_x + dst_origin_x + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y;
    5120   202852000 :         src_stride = ref_pic_list1->stride_y;
    5121   202852000 :         dst_stride = prediction_ptr->stride_y;
    5122             : 
    5123   202852000 :         mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, bwidth, bheight, 0, 0);//mv_q4 has 1 extra bit for fractionnal to accomodate chroma when accessing filter coeffs.
    5124   203993000 :         subpel_x = mv_q4.col & SUBPEL_MASK;
    5125   203993000 :         subpel_y = mv_q4.row & SUBPEL_MASK;
    5126             : 
    5127   203993000 :         src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5128   203993000 :         conv_params = get_conv_params_no_round(0, (mv_unit->pred_direction == BI_PRED) ? 1 : 0, 0, tmp_dstY, 128, is_compound, bit_depth);
    5129             : 
    5130   204348000 :         av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    5131             :             &filter_params_y, bwidth, bheight);
    5132             : 
    5133             :         //the luma data is applied to chroma below
    5134   203190000 :         av1_dist_wtd_comp_weight_assign(
    5135   203190000 :             &picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header,
    5136   203190000 :             picture_control_set_ptr->parent_pcs_ptr->cur_order_hint,// cur_frame_index,
    5137   203190000 :             picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[0] - 1],// bck_frame_index,
    5138   203190000 :             picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[1] - 1],// fwd_frame_index,
    5139             :             compound_idx,
    5140             :             0,// order_idx,
    5141             :             &conv_params.fwd_offset, &conv_params.bck_offset,
    5142             :             &conv_params.use_dist_wtd_comp_avg, is_compound);
    5143             : 
    5144   203254000 :         conv_params.use_jnt_comp_avg =  conv_params.use_dist_wtd_comp_avg;
    5145             : 
    5146   203254000 :         if (is_compound && is_masked_compound_type(interinter_comp->type)) {
    5147    39412400 :             conv_params.do_average = 0;
    5148    39412400 :             av1_make_masked_inter_predictor(
    5149             :                 src_ptr,
    5150             :                 src_stride,
    5151             :                 dst_ptr,
    5152             :                 dst_stride,
    5153             :                 blk_geom,
    5154             :                 bwidth,
    5155             :                 bheight,
    5156             :                 &filter_params_x,
    5157             :                 &filter_params_y,
    5158             :                 subpel_x,
    5159             :                 subpel_y,
    5160             :                 &conv_params,
    5161             :                 interinter_comp,
    5162             :                 bit_depth,
    5163             :                 0//plane=Luma  seg_mask is computed based on luma and used for chroma
    5164             :                 );
    5165             :         }
    5166             :         else
    5167   163802000 :             convolve[subpel_x != 0][subpel_y != 0][is_compound](
    5168             :                 src_ptr,
    5169             :                 src_stride,
    5170             :                 dst_ptr,
    5171             :                 dst_stride,
    5172             :                 bwidth,
    5173             :                 bheight,
    5174             :                 &filter_params_x,
    5175             :                 &filter_params_y,
    5176             :                 subpel_x,
    5177             :                 subpel_y,
    5178             :                 &conv_params);
    5179   203858000 :         if (perform_chroma && blk_geom->has_uv && sub8x8_inter == 0) {
    5180             :             //List1-Cb
    5181     1526100 :             src_ptr = ref_pic_list1->buffer_cb + (ref_pic_list1->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list1->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list1->stride_cb;
    5182     1526100 :             dst_ptr = prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    5183     1526100 :             src_stride = ref_pic_list1->stride_cb;
    5184     1526100 :             dst_stride = prediction_ptr->stride_cb;
    5185             : 
    5186     1526100 :             mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    5187     1526110 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    5188     1526110 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    5189     1526110 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5190     1526110 :             conv_params = get_conv_params_no_round(0, (mv_unit->pred_direction == BI_PRED) ? 1 : 0, 0, tmp_dstCb, 64, is_compound, bit_depth);
    5191             : 
    5192     1526110 :             av1_dist_wtd_comp_weight_assign(
    5193     1526110 :                 &picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header,
    5194     1526110 :                 picture_control_set_ptr->parent_pcs_ptr->cur_order_hint,// cur_frame_index,
    5195     1526110 :                 picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[0] - 1],// bck_frame_index,
    5196     1526110 :                 picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[1] - 1],// fwd_frame_index,
    5197             :                 compound_idx,
    5198             :                 0,// order_idx,
    5199             :                 &conv_params.fwd_offset, &conv_params.bck_offset,
    5200             :                 &conv_params.use_dist_wtd_comp_avg, is_compound);
    5201             : 
    5202     1526090 :             conv_params.use_jnt_comp_avg = conv_params.use_dist_wtd_comp_avg;
    5203             : 
    5204     1526090 :             av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    5205     1526090 :                 &filter_params_y, blk_geom->bwidth_uv, blk_geom->bheight_uv);
    5206             : 
    5207     1526050 :             if (is_compound && is_masked_compound_type(interinter_comp->type)) {
    5208      210174 :                 conv_params.do_average = 0;
    5209      210174 :                 av1_make_masked_inter_predictor(
    5210             :                     src_ptr,
    5211             :                     src_stride,
    5212             :                     dst_ptr,
    5213             :                     dst_stride,
    5214             :                     blk_geom,
    5215      210174 :                     blk_geom->bwidth_uv,
    5216      210174 :                     blk_geom->bheight_uv,
    5217             :                     &filter_params_x,
    5218             :                     &filter_params_y,
    5219             :                     subpel_x,
    5220             :                     subpel_y,
    5221             :                     &conv_params,
    5222             :                     interinter_comp,
    5223             :                     bit_depth,
    5224             :                     1//plane=cb  seg_mask is computed based on luma and used for chroma
    5225             :                 );
    5226             :             }
    5227             :             else
    5228     1315870 :                 convolve[subpel_x != 0][subpel_y != 0][is_compound](
    5229             :                     src_ptr,
    5230             :                     src_stride,
    5231             :                     dst_ptr,
    5232             :                     dst_stride,
    5233     1315870 :                     blk_geom->bwidth_uv,
    5234     1315870 :                     blk_geom->bheight_uv,
    5235             :                     &filter_params_x,
    5236             :                     &filter_params_y,
    5237             :                     subpel_x,
    5238             :                     subpel_y,
    5239             :                     &conv_params);
    5240             : 
    5241             :             //List1-Cr
    5242     1526110 :             src_ptr = ref_pic_list1->buffer_cr + (ref_pic_list1->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list1->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list1->stride_cr;
    5243     1526110 :             dst_ptr = prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    5244     1526110 :             src_stride = ref_pic_list1->stride_cr;
    5245     1526110 :             dst_stride = prediction_ptr->stride_cr;
    5246             : 
    5247     1526110 :             mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    5248     1526110 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    5249     1526110 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    5250     1526110 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5251     1526110 :             conv_params = get_conv_params_no_round(0, (mv_unit->pred_direction == BI_PRED) ? 1 : 0, 0, tmp_dstCr, 64, is_compound, bit_depth);
    5252     1526120 :             av1_dist_wtd_comp_weight_assign(
    5253     1526120 :                 &picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header,
    5254     1526120 :                 picture_control_set_ptr->parent_pcs_ptr->cur_order_hint,// cur_frame_index,
    5255     1526120 :                 picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[0] - 1],// bck_frame_index,
    5256     1526120 :                 picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[1] - 1],// fwd_frame_index,
    5257             :                 compound_idx,
    5258             :                 0,// order_idx,
    5259             :                 &conv_params.fwd_offset, &conv_params.bck_offset,
    5260             :                 &conv_params.use_dist_wtd_comp_avg, is_compound);
    5261             : 
    5262     1526090 :             conv_params.use_jnt_comp_avg = conv_params.use_dist_wtd_comp_avg;
    5263             : 
    5264     1526090 :             if (is_compound && is_masked_compound_type(interinter_comp->type)) {
    5265      210173 :                 conv_params.do_average = 0;
    5266      210173 :                 av1_make_masked_inter_predictor(
    5267             :                     src_ptr,
    5268             :                     src_stride,
    5269             :                     dst_ptr,
    5270             :                     dst_stride,
    5271             :                     blk_geom,
    5272      210173 :                     blk_geom->bwidth_uv,
    5273      210173 :                     blk_geom->bheight_uv,
    5274             :                     &filter_params_x,
    5275             :                     &filter_params_y,
    5276             :                     subpel_x,
    5277             :                     subpel_y,
    5278             :                     &conv_params,
    5279             :                     interinter_comp,
    5280             :                     bit_depth,
    5281             :                     1//plane=Cr  seg_mask is computed based on luma and used for chroma
    5282             :                 );
    5283             :             }
    5284             :             else
    5285     1315910 :             convolve[subpel_x != 0][subpel_y != 0][is_compound](
    5286             :                 src_ptr,
    5287             :                 src_stride,
    5288             :                 dst_ptr,
    5289             :                 dst_stride,
    5290     1315910 :                 blk_geom->bwidth_uv,
    5291     1315910 :                 blk_geom->bheight_uv,
    5292             :                 &filter_params_x,
    5293             :                 &filter_params_y,
    5294             :                 subpel_x,
    5295             :                 subpel_y,
    5296             :                 &conv_params);
    5297             :         }
    5298             :     }
    5299             : #if II_COMP_FLAG
    5300   328722000 :     if ( is_interintra_used ) {
    5301    11711000 :         int32_t start_plane = 0;
    5302    11711000 :         int32_t end_plane = perform_chroma && blk_geom->has_uv ? MAX_MB_PLANE: 1;
    5303             :         // temp buffer for intra pred
    5304             :         DECLARE_ALIGNED(16, uint8_t, intra_pred[MAX_SB_SQUARE]);
    5305             :         DECLARE_ALIGNED(16, uint8_t, intra_pred_cb[MAX_SB_SQUARE]);
    5306             :         DECLARE_ALIGNED(16, uint8_t, intra_pred_cr[MAX_SB_SQUARE]);
    5307             : 
    5308             :         int32_t  intra_stride;
    5309             : 
    5310    23585000 :         for (int32_t plane = start_plane; plane < end_plane; ++plane) {
    5311             : 
    5312             :             EbPictureBufferDesc  intra_pred_desc;
    5313    11871900 :             intra_pred_desc.origin_x     = intra_pred_desc.origin_y  = 0;
    5314    11871900 :             intra_pred_desc.stride_y     = bwidth;
    5315    11871900 :             intra_pred_desc.stride_cb    = bwidth/2;
    5316    11871900 :             intra_pred_desc.stride_cr    = bwidth/2;
    5317    11871900 :             intra_pred_desc.buffer_y     = intra_pred;
    5318    11871900 :             intra_pred_desc.buffer_cb    = intra_pred_cb;
    5319    11871900 :             intra_pred_desc.buffer_cr    = intra_pred_cr;
    5320             : 
    5321    11871900 :             const int ssx = plane ? 1 : 0;
    5322    11871900 :             const int ssy = plane ? 1 : 0;
    5323    11871900 :             const BlockSize plane_bsize = get_plane_block_size(blk_geom->bsize, ssx, ssy);
    5324             :             //av1_build_interintra_predictors_sbp
    5325             :             uint8_t    topNeighArray[64 * 2 + 1];
    5326             :             uint8_t    leftNeighArray[64 * 2 + 1];
    5327             : 
    5328    11871000 :             uint32_t cu_originx_uv = (pu_origin_x >> 3 << 3) >> 1;
    5329    11871000 :             uint32_t cu_originy_uv = (pu_origin_y >> 3 << 3) >> 1;
    5330             : 
    5331    11871000 :             if (plane == 0) {
    5332    11709800 :                 dst_ptr = prediction_ptr->buffer_y + prediction_ptr->origin_x + dst_origin_x + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y;
    5333    11709800 :                 dst_stride = prediction_ptr->stride_y;
    5334    11709800 :                 intra_stride = intra_pred_desc.stride_y;
    5335             : 
    5336    11709800 :                 if (pu_origin_y != 0)
    5337    11184700 :                     memcpy(topNeighArray + 1, luma_recon_neighbor_array->top_array + pu_origin_x, blk_geom->bwidth * 2);
    5338             : 
    5339    11709800 :                 if (pu_origin_x != 0)
    5340    11255400 :                     memcpy(leftNeighArray + 1, luma_recon_neighbor_array->left_array + pu_origin_y, blk_geom->bheight * 2);
    5341             : 
    5342    11709800 :                 if (pu_origin_y != 0 && pu_origin_x != 0)
    5343    10742600 :                     topNeighArray[0] = leftNeighArray[0] = luma_recon_neighbor_array->top_left_array[MAX_PICTURE_HEIGHT_SIZE + pu_origin_x - pu_origin_y];
    5344             : 
    5345             :             }
    5346             : 
    5347      161193 :             else if (plane == 1) {
    5348       80596 :                 dst_ptr = prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    5349       80596 :                  dst_stride = prediction_ptr->stride_cb;
    5350       80596 :                 intra_stride = intra_pred_desc.stride_cb;
    5351             : 
    5352       80596 :                 if (cu_originy_uv != 0)
    5353       75211 :                     memcpy(topNeighArray + 1, cb_recon_neighbor_array->top_array + cu_originx_uv, blk_geom->bwidth_uv * 2);
    5354             : 
    5355       80596 :                 if (cu_originx_uv != 0)
    5356       70003 :                     memcpy(leftNeighArray + 1, cb_recon_neighbor_array->left_array + cu_originy_uv, blk_geom->bheight_uv * 2);
    5357             : 
    5358       80596 :                 if (cu_originy_uv != 0 && cu_originx_uv != 0)
    5359       64833 :                     topNeighArray[0] = leftNeighArray[0] = cb_recon_neighbor_array->top_left_array[MAX_PICTURE_HEIGHT_SIZE / 2 + cu_originx_uv - cu_originy_uv / 2];
    5360             :             }
    5361             :             else {
    5362       80597 :                 dst_ptr = prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    5363       80597 :                  dst_stride = prediction_ptr->stride_cr;
    5364       80597 :                  intra_stride = intra_pred_desc.stride_cr;
    5365             : 
    5366       80597 :                 if (cu_originy_uv != 0)
    5367       75212 :                     memcpy(topNeighArray + 1, cr_recon_neighbor_array->top_array + cu_originx_uv, blk_geom->bwidth_uv * 2);
    5368             : 
    5369       80597 :                 if (cu_originx_uv != 0)
    5370       70003 :                     memcpy(leftNeighArray + 1, cr_recon_neighbor_array->left_array + cu_originy_uv, blk_geom->bheight_uv * 2);
    5371             : 
    5372       80597 :                 if (cu_originy_uv != 0 && cu_originx_uv != 0)
    5373       64833 :                     topNeighArray[0] = leftNeighArray[0] = cr_recon_neighbor_array->top_left_array[MAX_PICTURE_HEIGHT_SIZE / 2 + cu_originx_uv - cu_originy_uv / 2];
    5374             :             }
    5375    11871000 :             TxSize  tx_size = blk_geom->txsize[0][0];               // Nader - Intra 128x128 not supported
    5376    11871000 :             TxSize  tx_size_Chroma = blk_geom->txsize_uv[0][0];     //Nader - Intra 128x128 not supported
    5377             : 
    5378    23742100 :             eb_av1_predict_intra_block(
    5379             :                 tile,
    5380             :                 !ED_STAGE,
    5381             :                 blk_geom,
    5382    11871000 :                 picture_control_set_ptr->parent_pcs_ptr->av1_cm,    //const Av1Common *cm,
    5383    11871000 :                 plane ? blk_geom->bwidth_uv : blk_geom->bwidth,     //int32_t wpx,
    5384    11871000 :                 plane ? blk_geom->bheight_uv : blk_geom->bheight,   //int32_t hpx,
    5385             :                 plane ? tx_size_Chroma : tx_size,                   //TxSize tx_size,
    5386    11871000 :                 interintra_to_intra_mode[interintra_mode],          //PredictionMode mode,
    5387             :                 0,
    5388             :                 0,                                                  //int32_t use_palette,
    5389             : #if PAL_SUP
    5390             :                 NULL, //inter-intra
    5391             : #endif
    5392             :                 FILTER_INTRA_MODES,                                 // FilterIntraMode filter_intra_mode,
    5393             :                 topNeighArray + 1,
    5394             :                 leftNeighArray + 1,
    5395             :                 &intra_pred_desc,                                   //uint8_t *dst,
    5396             :                                                                     //int32_t dst_stride,
    5397             :                 0,                                                  //int32_t col_off,
    5398             :                 0,                                                  //int32_t row_off,
    5399             :                 plane,                                              //int32_t plane,
    5400    11871000 :                 blk_geom->bsize,                                    //uint32_t puSize,
    5401             :                 dst_origin_x,
    5402             :                 dst_origin_y,
    5403             :                 pu_origin_x,
    5404             :                 pu_origin_y,
    5405             :                 0,                                                  //uint32_t cuOrgX used only for prediction Ptr
    5406             :                 0                                                   //uint32_t cuOrgY used only for prediction Ptr
    5407             :             );
    5408             :             //combine_interintra
    5409    11873600 :             combine_interintra(
    5410             :                 interintra_mode,
    5411             :                 use_wedge_interintra,
    5412             :                 interintra_wedge_index,
    5413             :                 INTERINTRA_WEDGE_SIGN,
    5414    11873600 :                 blk_geom->bsize,
    5415             :                 plane_bsize,
    5416             :                 dst_ptr,
    5417             :                 dst_stride,
    5418             :                 dst_ptr,       // Inter pred buff
    5419             :                 dst_stride,    // Inter pred stride
    5420             :                 (plane == 0) ? intra_pred : (plane == 1) ? intra_pred_cb : intra_pred_cr,  // Intra pred buff
    5421             :                 intra_stride); // Intra pred stride
    5422             : 
    5423             :         }
    5424             :     }
    5425             : #endif
    5426             : #if OBMC_FLAG
    5427   328724000 :     if (motion_mode == OBMC_CAUSAL)
    5428             :     {
    5429             : 
    5430             :         uint8_t * tmp_obmc_bufs[2];
    5431             : 
    5432             :         DECLARE_ALIGNED(16, uint8_t, obmc_buff_0[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
    5433             :         DECLARE_ALIGNED(16, uint8_t, obmc_buff_1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
    5434    13101400 :         tmp_obmc_bufs[0] = obmc_buff_0;
    5435    13101400 :         tmp_obmc_bufs[1] = obmc_buff_1;
    5436             : 
    5437             :         uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
    5438    13101400 :         int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
    5439    13101400 :         int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
    5440             : 
    5441             :         {
    5442    13101400 :             dst_buf1[0] = tmp_obmc_bufs[0];
    5443    13101400 :             dst_buf1[1] = tmp_obmc_bufs[0] + MAX_SB_SQUARE;
    5444    13101400 :             dst_buf1[2] = tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
    5445    13101400 :             dst_buf2[0] = tmp_obmc_bufs[1];
    5446    13101400 :             dst_buf2[1] = tmp_obmc_bufs[1] + MAX_SB_SQUARE;
    5447    13101400 :             dst_buf2[2] = tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
    5448             :         }
    5449             : 
    5450    13101400 :         int mi_row = pu_origin_y >> 2;
    5451    13101400 :         int mi_col = pu_origin_x >> 2;
    5452             : 
    5453    13101400 :         if (use_precomputed_obmc)
    5454             :         {
    5455    13098700 :             dst_buf1[0] = md_context->obmc_buff_0;
    5456    13098700 :             dst_buf1[1] = md_context->obmc_buff_0 + MAX_SB_SQUARE;
    5457    13098700 :             dst_buf1[2] = md_context->obmc_buff_0 + MAX_SB_SQUARE*2;
    5458    13098700 :             dst_buf2[0] = md_context->obmc_buff_1;
    5459    13098700 :             dst_buf2[1] = md_context->obmc_buff_1 + MAX_SB_SQUARE;
    5460    13098700 :             dst_buf2[2] = md_context->obmc_buff_1 + MAX_SB_SQUARE*2;
    5461             :         }
    5462             :         else
    5463             :         {
    5464        2717 :             build_prediction_by_above_preds(
    5465             :                 perform_chroma,
    5466        2717 :                 blk_geom->bsize, picture_control_set_ptr, cu_ptr->av1xd, mi_row, mi_col, dst_buf1,
    5467             :                 dst_stride1);
    5468             : 
    5469        1794 :             build_prediction_by_left_preds(
    5470             :                 perform_chroma,
    5471        1794 :                 blk_geom->bsize, picture_control_set_ptr, cu_ptr->av1xd, mi_row, mi_col, dst_buf2,
    5472             :                 dst_stride2);
    5473             :         }
    5474             : 
    5475    13100500 :         uint8_t * final_dst_ptr_y  = prediction_ptr->buffer_y + prediction_ptr->origin_x + dst_origin_x + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y;
    5476    13100500 :         uint16_t  final_dst_stride_y = prediction_ptr->stride_y;
    5477             : 
    5478    13100500 :         uint8_t * final_dst_ptr_u = prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    5479    13100500 :         uint16_t  final_dst_stride_u = prediction_ptr->stride_cb;
    5480             : 
    5481    13100500 :         uint8_t * final_dst_ptr_v = prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    5482    13100500 :         uint16_t  final_dst_stride_v = prediction_ptr->stride_cr;
    5483             : 
    5484    13100500 :         av1_build_obmc_inter_prediction(
    5485             :             final_dst_ptr_y,
    5486             :             final_dst_stride_y,
    5487             :             final_dst_ptr_u,
    5488             :             final_dst_stride_u,
    5489             :             final_dst_ptr_v,
    5490             :             final_dst_stride_v,
    5491             :             perform_chroma,
    5492    13100500 :             blk_geom->bsize,
    5493             :             picture_control_set_ptr,
    5494             :             cu_ptr->av1xd,
    5495             :             mi_row,
    5496             :             mi_col,
    5497             :             dst_buf1,
    5498             :             dst_stride1,
    5499             :             dst_buf2,
    5500             :             dst_stride2);
    5501             :     }
    5502             : #endif
    5503   328725000 :     return return_error;
    5504             : }
    5505             : 
    5506             : 
    5507             : 
    5508           0 : EbErrorType av1_inter_prediction_hbd(
    5509             :     PictureControlSet              *picture_control_set_ptr,
    5510             :     uint32_t                        interp_filters,
    5511             :     CodingUnit                     *cu_ptr,
    5512             :     uint8_t                         ref_frame_type,
    5513             :     MvUnit                         *mv_unit,
    5514             :     uint8_t                         use_intrabc,
    5515             : #if OBMC_FLAG
    5516             :     MotionMode                      motion_mode,
    5517             :     uint8_t                         use_precomputed_obmc,
    5518             :     struct ModeDecisionContext     *md_context,
    5519             : #endif
    5520             :     uint8_t                         compound_idx,
    5521             :     InterInterCompoundData         *interinter_comp,
    5522             : #if II_COMP_FLAG
    5523             :     TileInfo                       * tile,
    5524             :     NeighborArrayUnit              *luma_recon_neighbor_array,
    5525             :     NeighborArrayUnit              *cb_recon_neighbor_array ,
    5526             :     NeighborArrayUnit              *cr_recon_neighbor_array ,
    5527             :     uint8_t                         is_interintra_used ,
    5528             :     INTERINTRA_MODE                 interintra_mode,
    5529             :     uint8_t                         use_wedge_interintra,
    5530             :     int32_t                         interintra_wedge_index,
    5531             : #endif
    5532             :     uint16_t                        pu_origin_x,
    5533             :     uint16_t                        pu_origin_y,
    5534             :     uint8_t                         bwidth,
    5535             :     uint8_t                         bheight,
    5536             :     EbPictureBufferDesc             *ref_pic_list0,
    5537             :     EbPictureBufferDesc             *ref_pic_list1,
    5538             :     EbPictureBufferDesc             *prediction_ptr,
    5539             :     uint16_t                        dst_origin_x,
    5540             :     uint16_t                        dst_origin_y,
    5541             :     EbBool                          perform_chroma,
    5542             :     uint8_t                         bit_depth)
    5543             : {
    5544             :     (void)use_precomputed_obmc;
    5545             :     (void) md_context;
    5546             : 
    5547           0 :     EbErrorType  return_error = EB_ErrorNone;
    5548           0 :     uint8_t         is_compound = (mv_unit->pred_direction == BI_PRED) ? 1 : 0;
    5549             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstY[128 * 128]);//move this to context if stack does not hold.
    5550             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstCb[64 * 64]);
    5551             :     DECLARE_ALIGNED(32, uint16_t, tmp_dstCr[64 * 64]);
    5552             :     MV  mv, mv_q4;
    5553             : 
    5554             :     int32_t subpel_x, subpel_y;
    5555             :     uint16_t * src_ptr;
    5556             :     uint16_t * dst_ptr;
    5557             :     int32_t src_stride;
    5558             :     int32_t dst_stride;
    5559             :     ConvolveParams conv_params;
    5560             :     InterpFilterParams filter_params_x, filter_params_y;
    5561             : 
    5562           0 :     const BlockGeom * blk_geom = get_blk_geom_mds(cu_ptr->mds_idx);
    5563             : 
    5564             : #if OBMC_FLAG
    5565           0 :     if (motion_mode == OBMC_CAUSAL) {
    5566           0 :         assert(is_compound == 0);
    5567           0 :         assert(blk_geom->bwidth > 4 && blk_geom->bheight > 4);
    5568             :     }
    5569             : #endif
    5570             :     // Special treatment for chroma in 4xN/Nx4 blocks:
    5571             :     // if one of the neighbouring blocks of the parent square is intra, the chroma prediction follows the normal path, using the luma MV of the current NSQ block (which is the latest sub8x8 block).
    5572             :     // In this case only uni-prediction is allowed.
    5573             : 
    5574           0 :     int32_t sub8x8_inter = 0;
    5575             : 
    5576           0 :     if(perform_chroma && (blk_geom->has_uv && (blk_geom->bwidth == 4 || blk_geom->bheight == 4)))
    5577             : 
    5578             :     {
    5579             :         //CHKN setup input param
    5580           0 :         int32_t bw = blk_geom->bwidth_uv;
    5581           0 :         int32_t bh = blk_geom->bheight_uv;
    5582             :         UNUSED(bw);
    5583             :         UNUSED(bh);
    5584             : 
    5585           0 :         uint32_t mi_x = pu_origin_x;       //these are luma picture wise
    5586           0 :         uint32_t mi_y = pu_origin_y;
    5587             : 
    5588           0 :         MacroBlockD  *xd = cu_ptr->av1xd;
    5589           0 :         xd->mi_stride = picture_control_set_ptr->mi_stride;
    5590           0 :         const int32_t offset = (mi_y >> MI_SIZE_LOG2) * xd->mi_stride + (mi_x >> MI_SIZE_LOG2);
    5591           0 :         xd->mi = picture_control_set_ptr->mi_grid_base + offset;
    5592             : 
    5593             :         //CHKN fill current mi from current block
    5594             :         {
    5595           0 :             ModeInfo *miPtr = *xd->mi;
    5596             :             uint8_t  miX, miY;
    5597             :             MvReferenceFrame rf[2];
    5598           0 :             av1_set_ref_frame(rf, ref_frame_type);
    5599           0 :             for (miY = 0; miY < (blk_geom->bheight >> MI_SIZE_LOG2); miY++) {
    5600           0 :                 for (miX = 0; miX < (blk_geom->bwidth >> MI_SIZE_LOG2); miX++) {
    5601           0 :                     miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.use_intrabc = use_intrabc;
    5602           0 :                     miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.ref_frame[0] = rf[0];
    5603           0 :                     if (mv_unit->pred_direction == UNI_PRED_LIST_0) {
    5604           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.col = mv_unit->mv[REF_LIST_0].x;
    5605           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.row = mv_unit->mv[REF_LIST_0].y;
    5606             :                     }
    5607           0 :                     else if (mv_unit->pred_direction == UNI_PRED_LIST_1) {
    5608           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.col = mv_unit->mv[REF_LIST_1].x;
    5609           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.row = mv_unit->mv[REF_LIST_1].y;
    5610             :                     }
    5611             :                     else {
    5612           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.col = mv_unit->mv[REF_LIST_0].x;
    5613           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[0].as_mv.row = mv_unit->mv[REF_LIST_0].y;
    5614           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[1].as_mv.col = mv_unit->mv[REF_LIST_1].x;
    5615           0 :                         miPtr[miX + miY * xd->mi_stride].mbmi.block_mi.mv[1].as_mv.row = mv_unit->mv[REF_LIST_1].y;
    5616             :                     }
    5617             :                 }
    5618             :             }
    5619             :         }
    5620             : 
    5621           0 :         int32_t build_for_obmc = 0;
    5622             : 
    5623           0 :         const BlockSize bsize = blk_geom->bsize;//mi->sb_type;
    5624           0 :         assert(bsize < BlockSizeS_ALL);
    5625           0 :         const int32_t ss_x = 1;// pd->subsampling_x;
    5626           0 :         const int32_t ss_y = 1;//pd->subsampling_y;
    5627           0 :         sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
    5628           0 :             (block_size_high[bsize] < 8 && ss_y);
    5629             : 
    5630           0 :         if (use_intrabc) sub8x8_inter = 0;
    5631             :         // For sub8x8 chroma blocks, we may be covering more than one luma block's
    5632             :         // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
    5633             :         // the top-left corner of the prediction source - the correct top-left corner
    5634             :         // is at (pre_x, pre_y).
    5635           0 :         const int32_t row_start =
    5636           0 :             (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
    5637           0 :         const int32_t col_start =
    5638           0 :             (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
    5639             : 
    5640           0 :         const int32_t pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
    5641           0 :         const int32_t pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
    5642             :         UNUSED(pre_x);
    5643             :         UNUSED(pre_y);
    5644             : 
    5645           0 :         sub8x8_inter = sub8x8_inter && !build_for_obmc;
    5646           0 :         if (sub8x8_inter) {
    5647           0 :             for (int32_t row = row_start; row <= 0 && sub8x8_inter; ++row) {
    5648           0 :                 for (int32_t col = col_start; col <= 0; ++col) {
    5649           0 :                     ModeInfo *miPtr = *xd->mi;
    5650           0 :                     const MbModeInfo *this_mbmi = &miPtr[row * xd->mi_stride + col].mbmi;
    5651             : 
    5652           0 :                     if (!is_inter_block(&this_mbmi->block_mi)) sub8x8_inter = 0;
    5653             :                 }
    5654             :             }
    5655             :         }
    5656             : 
    5657           0 :         if (sub8x8_inter) {
    5658             :             // block size
    5659           0 :             const int32_t b4_w = block_size_wide[bsize] >> ss_x;
    5660           0 :             const int32_t b4_h = block_size_high[bsize] >> ss_y;
    5661           0 :             const BlockSize plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
    5662           0 :             assert(plane_bsize < BlockSizeS_ALL);
    5663           0 :             const int32_t b8_w = block_size_wide[plane_bsize] >> ss_x;
    5664           0 :             const int32_t b8_h = block_size_high[plane_bsize] >> ss_y;
    5665             : 
    5666           0 :             assert(!is_compound);
    5667             : 
    5668           0 :             int32_t row = row_start;
    5669             :             int32_t src_stride;
    5670           0 :             for (int32_t y = 0; y < b8_h; y += b4_h) {
    5671           0 :                 int32_t col = col_start;
    5672           0 :                 for (int32_t x = 0; x < b8_w; x += b4_w) {
    5673           0 :                     ModeInfo *miPtr = *xd->mi;
    5674           0 :                     const MbModeInfo *this_mbmi = &miPtr[row * xd->mi_stride + col].mbmi;
    5675             : 
    5676           0 :                     int32_t tmp_dst_stride = 8;
    5677             :                     UNUSED(tmp_dst_stride);
    5678           0 :                     assert(bw < 8 || bh < 8);
    5679             : 
    5680           0 :                     conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCb, BLOCK_SIZE_64, is_compound, bit_depth);
    5681           0 :                     conv_params.use_jnt_comp_avg = 0;
    5682           0 :                     uint8_t ref_idx = get_ref_frame_idx(this_mbmi->block_mi.ref_frame[0]);
    5683           0 :                     assert(ref_idx < REF_LIST_MAX_DEPTH);
    5684           0 :                     EbPictureBufferDesc  *ref_pic = this_mbmi->block_mi.ref_frame[0] ==
    5685           0 :                         LAST_FRAME || this_mbmi->block_mi.ref_frame[0] == LAST2_FRAME || this_mbmi->block_mi.ref_frame[0] == LAST3_FRAME || this_mbmi->block_mi.ref_frame[0] == GOLDEN_FRAME ?
    5686           0 :                         ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[REF_LIST_0][ref_idx]->object_ptr)->reference_picture16bit :
    5687           0 :                         ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[REF_LIST_1][ref_idx]->object_ptr)->reference_picture16bit;
    5688           0 :                     src_ptr = (uint16_t*)ref_pic->buffer_cb + (ref_pic->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic->stride_cb;
    5689           0 :                     dst_ptr = (uint16_t*)prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    5690           0 :                     src_stride = ref_pic->stride_cb;
    5691           0 :                     dst_stride = prediction_ptr->stride_cb;
    5692           0 :                     src_ptr = src_ptr + x + y * ref_pic->stride_cb;
    5693           0 :                     dst_ptr = dst_ptr + x + y * prediction_ptr->stride_cb;
    5694             : 
    5695           0 :                     const MV mv = this_mbmi->block_mi.mv[0].as_mv;
    5696           0 :                     mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    5697           0 :                     subpel_x = mv_q4.col & SUBPEL_MASK;
    5698           0 :                     subpel_y = mv_q4.row & SUBPEL_MASK;
    5699           0 :                     src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5700             : 
    5701           0 :                     av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    5702           0 :                         &filter_params_y, blk_geom->bwidth_uv, blk_geom->bheight_uv);
    5703             : 
    5704           0 :                     convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    5705             :                         src_ptr,
    5706             :                         src_stride,
    5707             :                         dst_ptr,
    5708             :                         dst_stride,
    5709             :                         b4_w,
    5710             :                         b4_h,
    5711             :                         &filter_params_x,
    5712             :                         &filter_params_y,
    5713             :                         subpel_x,
    5714             :                         subpel_y,
    5715             :                         &conv_params,
    5716             :                         bit_depth);
    5717             : 
    5718             :                     //Cr
    5719           0 :                     conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCr, BLOCK_SIZE_64, is_compound, bit_depth);
    5720           0 :                     conv_params.use_jnt_comp_avg = 0;
    5721             : 
    5722           0 :                     src_ptr = (uint16_t*)ref_pic->buffer_cr + (ref_pic->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic->stride_cr;
    5723           0 :                     dst_ptr = (uint16_t*)prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    5724           0 :                     src_stride = ref_pic->stride_cr;
    5725           0 :                     dst_stride = prediction_ptr->stride_cr;
    5726           0 :                     src_ptr = src_ptr + x + y * ref_pic->stride_cr;
    5727           0 :                     dst_ptr = dst_ptr + x + y * prediction_ptr->stride_cr;
    5728             : 
    5729           0 :                     mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    5730           0 :                     subpel_x = mv_q4.col & SUBPEL_MASK;
    5731           0 :                     subpel_y = mv_q4.row & SUBPEL_MASK;
    5732           0 :                     src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5733             : 
    5734           0 :                     av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    5735           0 :                         &filter_params_y, blk_geom->bwidth_uv, blk_geom->bheight_uv);
    5736             : 
    5737           0 :                     convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    5738             :                         src_ptr,
    5739             :                         src_stride,
    5740             :                         dst_ptr,
    5741             :                         dst_stride,
    5742             :                         b4_w,
    5743             :                         b4_h,
    5744             :                         &filter_params_x,
    5745             :                         &filter_params_y,
    5746             :                         subpel_x,
    5747             :                         subpel_y,
    5748             :                         &conv_params,
    5749             :                         bit_depth);
    5750             : 
    5751           0 :                     ++col;
    5752             :                 }
    5753           0 :                 ++row;
    5754             :             }
    5755             :         }
    5756             :     }
    5757             : #if INTER_INTER_HBD
    5758             :     MvReferenceFrame rf[2];
    5759           0 :     av1_set_ref_frame(rf, ref_frame_type);
    5760             : #endif
    5761           0 :     if (mv_unit->pred_direction == UNI_PRED_LIST_0 || mv_unit->pred_direction == BI_PRED) {
    5762             :         //List0-Y
    5763           0 :         mv.col = mv_unit->mv[REF_LIST_0].x;
    5764           0 :         mv.row = mv_unit->mv[REF_LIST_0].y;
    5765             : 
    5766           0 :         src_ptr = (uint16_t*)ref_pic_list0->buffer_y + ref_pic_list0->origin_x + pu_origin_x + (ref_pic_list0->origin_y + pu_origin_y) * ref_pic_list0->stride_y;
    5767           0 :         dst_ptr = (uint16_t*)prediction_ptr->buffer_y + prediction_ptr->origin_x + dst_origin_x + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y;
    5768           0 :         src_stride = ref_pic_list0->stride_y;
    5769           0 :         dst_stride = prediction_ptr->stride_y;
    5770             : 
    5771           0 :         mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, bwidth, bheight, 0, 0);//mv_q4 has 1 extra bit for fractionnal to accomodate chroma when accessing filter coeffs.
    5772           0 :         subpel_x = mv_q4.col & SUBPEL_MASK;
    5773           0 :         subpel_y = mv_q4.row & SUBPEL_MASK;
    5774           0 :         src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5775           0 :         conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstY, 128, is_compound, bit_depth);
    5776             : 
    5777           0 :         av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    5778             :             &filter_params_y, bwidth, bheight);
    5779             : 
    5780           0 :         convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    5781             :             src_ptr,
    5782             :             src_stride,
    5783             :             dst_ptr,
    5784             :             dst_stride,
    5785             :             bwidth,
    5786             :             bheight,
    5787             :             &filter_params_x,
    5788             :             &filter_params_y,
    5789             :             subpel_x,
    5790             :             subpel_y,
    5791             :             &conv_params,
    5792             :             bit_depth);
    5793             : 
    5794           0 :         if (perform_chroma && blk_geom->has_uv && sub8x8_inter == 0) {
    5795             :             //List0-Cb
    5796           0 :             src_ptr = (uint16_t*)ref_pic_list0->buffer_cb + (ref_pic_list0->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list0->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list0->stride_cb;
    5797           0 :             dst_ptr = (uint16_t*)prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    5798           0 :             src_stride = ref_pic_list0->stride_cb;
    5799           0 :             dst_stride = prediction_ptr->stride_cb;
    5800             : 
    5801           0 :             mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    5802           0 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    5803           0 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    5804           0 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5805           0 :             conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCb, 64, is_compound, bit_depth);
    5806             : 
    5807           0 :             av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    5808           0 :                 &filter_params_y, blk_geom->bwidth_uv, blk_geom->bheight_uv);
    5809             : 
    5810           0 :             if (use_intrabc && (subpel_x != 0 || subpel_y != 0))
    5811           0 :                 highbd_convolve_2d_for_intrabc(
    5812             :                     (const uint16_t *)src_ptr,
    5813             :                     src_stride,
    5814             :                     dst_ptr,
    5815             :                     dst_stride,
    5816           0 :                     blk_geom->bwidth_uv,
    5817           0 :                     blk_geom->bheight_uv,
    5818             :                     subpel_x,
    5819             :                     subpel_y,
    5820             :                     &conv_params,
    5821             :                     bit_depth);
    5822             :             else
    5823           0 :                 convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    5824             :                     src_ptr,
    5825             :                     src_stride,
    5826             :                     dst_ptr,
    5827             :                     dst_stride,
    5828           0 :                     blk_geom->bwidth_uv,
    5829           0 :                     blk_geom->bheight_uv,
    5830             :                     &filter_params_x,
    5831             :                     &filter_params_y,
    5832             :                     subpel_x,
    5833             :                     subpel_y,
    5834             :                     &conv_params,
    5835             :                     bit_depth);
    5836             : 
    5837             :             //List0-Cr
    5838           0 :             src_ptr = (uint16_t*)ref_pic_list0->buffer_cr + (ref_pic_list0->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list0->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list0->stride_cr;
    5839           0 :             dst_ptr = (uint16_t*)prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    5840           0 :             src_stride = ref_pic_list0->stride_cr;
    5841           0 :             dst_stride = prediction_ptr->stride_cr;
    5842             : 
    5843           0 :             mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    5844           0 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    5845           0 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    5846           0 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5847           0 :             conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstCr, 64, is_compound, bit_depth);
    5848           0 :             if (use_intrabc && (subpel_x != 0 || subpel_y != 0))
    5849           0 :                 highbd_convolve_2d_for_intrabc(
    5850             :                     (const uint16_t *)src_ptr,
    5851             :                     src_stride,
    5852             :                     dst_ptr,
    5853             :                     dst_stride,
    5854           0 :                     blk_geom->bwidth_uv,
    5855           0 :                     blk_geom->bheight_uv,
    5856             :                     subpel_x,
    5857             :                     subpel_y,
    5858             :                     &conv_params,
    5859             :                     bit_depth);
    5860             :             else
    5861           0 :                 convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    5862             :                     src_ptr,
    5863             :                     src_stride,
    5864             :                     dst_ptr,
    5865             :                     dst_stride,
    5866           0 :                     blk_geom->bwidth_uv,
    5867           0 :                     blk_geom->bheight_uv,
    5868             :                     &filter_params_x,
    5869             :                     &filter_params_y,
    5870             :                     subpel_x,
    5871             :                     subpel_y,
    5872             :                     &conv_params,
    5873             :                     bit_depth);
    5874             :         }
    5875             :     }
    5876             : 
    5877           0 :     if (mv_unit->pred_direction == UNI_PRED_LIST_1 || mv_unit->pred_direction == BI_PRED) {
    5878             :         //List1-Y
    5879           0 :         mv.col = mv_unit->mv[REF_LIST_1].x;
    5880           0 :         mv.row = mv_unit->mv[REF_LIST_1].y;
    5881             : 
    5882           0 :         src_ptr = (uint16_t*)ref_pic_list1->buffer_y + ref_pic_list1->origin_x + pu_origin_x + (ref_pic_list1->origin_y + pu_origin_y) * ref_pic_list1->stride_y;
    5883           0 :         dst_ptr = (uint16_t*)prediction_ptr->buffer_y + prediction_ptr->origin_x + dst_origin_x + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y;
    5884           0 :         src_stride = ref_pic_list1->stride_y;
    5885           0 :         dst_stride = prediction_ptr->stride_y;
    5886             : 
    5887           0 :         mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, bwidth, bheight, 0, 0);//mv_q4 has 1 extra bit for fractionnal to accomodate chroma when accessing filter coeffs.
    5888           0 :         subpel_x = mv_q4.col & SUBPEL_MASK;
    5889           0 :         subpel_y = mv_q4.row & SUBPEL_MASK;
    5890             : 
    5891           0 :         src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5892           0 :         conv_params = get_conv_params_no_round(0, (mv_unit->pred_direction == BI_PRED) ? 1 : 0, 0, tmp_dstY, 128, is_compound, bit_depth);
    5893             : #if INTER_INTER_HBD
    5894           0 :         av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    5895             :             &filter_params_y, bwidth, bheight);
    5896             : 
    5897             :         //the luma data is applied to chroma below
    5898           0 :         av1_dist_wtd_comp_weight_assign(
    5899           0 :             &picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header,
    5900           0 :             picture_control_set_ptr->parent_pcs_ptr->cur_order_hint,// cur_frame_index,
    5901           0 :             picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[0] - 1],// bck_frame_index,
    5902           0 :             picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[1] - 1],// fwd_frame_index,
    5903             :             compound_idx,
    5904             :             0,// order_idx,
    5905             :             &conv_params.fwd_offset, &conv_params.bck_offset,
    5906             :             &conv_params.use_dist_wtd_comp_avg, is_compound);
    5907             : 
    5908           0 :         conv_params.use_jnt_comp_avg =  conv_params.use_dist_wtd_comp_avg;
    5909             : #endif
    5910             : 
    5911             : 
    5912             : #if INTER_INTER_HBD
    5913           0 :         if (is_compound && is_masked_compound_type(interinter_comp->type)) {
    5914           0 :             conv_params.do_average = 0;
    5915           0 :             av1_make_masked_inter_predictor_hbd(
    5916             :                 src_ptr,
    5917             :                 src_stride,
    5918             :                 dst_ptr,
    5919             :                 dst_stride,
    5920             :                 blk_geom,
    5921             :                 bwidth,
    5922             :                 bheight,
    5923             :                 &filter_params_x,
    5924             :                 &filter_params_y,
    5925             :                 subpel_x,
    5926             :                 subpel_y,
    5927             :                 &conv_params,
    5928             :                 interinter_comp,
    5929             :                 bit_depth,
    5930             :                 0//plane=Luma  seg_mask is computed based on luma and used for chroma
    5931             :                 );
    5932             :         }
    5933             :         else
    5934             : #endif
    5935           0 :             convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    5936             :                 src_ptr,
    5937             :                 src_stride,
    5938             :                 dst_ptr,
    5939             :                 dst_stride,
    5940             :                 bwidth,
    5941             :                 bheight,
    5942             :                 &filter_params_x,
    5943             :                 &filter_params_y,
    5944             :                 subpel_x,
    5945             :                 subpel_y,
    5946             :                 &conv_params,
    5947             :                 bit_depth);
    5948             : 
    5949           0 :         if (perform_chroma && blk_geom->has_uv && sub8x8_inter == 0) {
    5950             :             //List1-Cb
    5951           0 :             src_ptr = (uint16_t*)ref_pic_list1->buffer_cb + (ref_pic_list1->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list1->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list1->stride_cb;
    5952           0 :             dst_ptr = (uint16_t*)prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    5953           0 :             src_stride = ref_pic_list1->stride_cb;
    5954           0 :             dst_stride = prediction_ptr->stride_cb;
    5955             : 
    5956           0 :             mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    5957           0 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    5958           0 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    5959           0 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    5960           0 :             conv_params = get_conv_params_no_round(0, (mv_unit->pred_direction == BI_PRED) ? 1 : 0, 0, tmp_dstCb, 64, is_compound, bit_depth);
    5961             : #if INTER_INTER_HBD
    5962           0 :             av1_dist_wtd_comp_weight_assign(
    5963           0 :                 &picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header,
    5964           0 :                 picture_control_set_ptr->parent_pcs_ptr->cur_order_hint,// cur_frame_index,
    5965           0 :                 picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[0] - 1],// bck_frame_index,
    5966           0 :                 picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[1] - 1],// fwd_frame_index,
    5967             :                 compound_idx,
    5968             :                 0,// order_idx,
    5969             :                 &conv_params.fwd_offset, &conv_params.bck_offset,
    5970             :                 &conv_params.use_dist_wtd_comp_avg, is_compound);
    5971             : 
    5972           0 :             conv_params.use_jnt_comp_avg = conv_params.use_dist_wtd_comp_avg;
    5973             : #endif
    5974           0 :             av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    5975           0 :                 &filter_params_y, blk_geom->bwidth_uv, blk_geom->bheight_uv);
    5976             : #if INTER_INTER_HBD
    5977           0 :             if (is_compound && is_masked_compound_type(interinter_comp->type)) {
    5978           0 :                 conv_params.do_average = 0;
    5979           0 :                 av1_make_masked_inter_predictor_hbd(
    5980             :                     src_ptr,
    5981             :                     src_stride,
    5982             :                     dst_ptr,
    5983             :                     dst_stride,
    5984             :                     blk_geom,
    5985           0 :                     blk_geom->bwidth_uv,
    5986           0 :                     blk_geom->bheight_uv,
    5987             :                     &filter_params_x,
    5988             :                     &filter_params_y,
    5989             :                     subpel_x,
    5990             :                     subpel_y,
    5991             :                     &conv_params,
    5992             :                     interinter_comp,
    5993             :                     bit_depth,
    5994             :                     1//plane=cb  seg_mask is computed based on luma and used for chroma
    5995             :                 );
    5996             :             }
    5997             :             else
    5998             : #endif
    5999           0 :                 convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    6000             :                     src_ptr,
    6001             :                     src_stride,
    6002             :                     dst_ptr,
    6003             :                     dst_stride,
    6004           0 :                     blk_geom->bwidth_uv,
    6005           0 :                     blk_geom->bheight_uv,
    6006             :                     &filter_params_x,
    6007             :                     &filter_params_y,
    6008             :                     subpel_x,
    6009             :                     subpel_y,
    6010             :                     &conv_params,
    6011             :                     bit_depth);
    6012             : 
    6013             :             //List1-Cr
    6014           0 :             src_ptr = (uint16_t*)ref_pic_list1->buffer_cr + (ref_pic_list1->origin_x + ((pu_origin_x >> 3) << 3)) / 2 + (ref_pic_list1->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list1->stride_cr;
    6015           0 :             dst_ptr = (uint16_t*)prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    6016           0 :             src_stride = ref_pic_list1->stride_cr;
    6017           0 :             dst_stride = prediction_ptr->stride_cr;
    6018             : 
    6019           0 :             mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    6020           0 :             subpel_x = mv_q4.col & SUBPEL_MASK;
    6021           0 :             subpel_y = mv_q4.row & SUBPEL_MASK;
    6022           0 :             src_ptr = src_ptr + (mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS);
    6023           0 :             conv_params = get_conv_params_no_round(0, (mv_unit->pred_direction == BI_PRED) ? 1 : 0, 0, tmp_dstCr, 64, is_compound, bit_depth);
    6024             : 
    6025             :  #if INTER_INTER_HBD
    6026           0 :             av1_dist_wtd_comp_weight_assign(
    6027           0 :                 &picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header,
    6028           0 :                 picture_control_set_ptr->parent_pcs_ptr->cur_order_hint,// cur_frame_index,
    6029           0 :                 picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[0] - 1],// bck_frame_index,
    6030           0 :                 picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[1] - 1],// fwd_frame_index,
    6031             :                 compound_idx,
    6032             :                 0,// order_idx,
    6033             :                 &conv_params.fwd_offset, &conv_params.bck_offset,
    6034             :                 &conv_params.use_dist_wtd_comp_avg, is_compound);
    6035             : 
    6036           0 :             conv_params.use_jnt_comp_avg = conv_params.use_dist_wtd_comp_avg;
    6037             : 
    6038           0 :             if (is_compound && is_masked_compound_type(interinter_comp->type)) {
    6039           0 :                 conv_params.do_average = 0;
    6040           0 :                 av1_make_masked_inter_predictor_hbd(
    6041             :                     src_ptr,
    6042             :                     src_stride,
    6043             :                     dst_ptr,
    6044             :                     dst_stride,
    6045             :                     blk_geom,
    6046           0 :                     blk_geom->bwidth_uv,
    6047           0 :                     blk_geom->bheight_uv,
    6048             :                     &filter_params_x,
    6049             :                     &filter_params_y,
    6050             :                     subpel_x,
    6051             :                     subpel_y,
    6052             :                     &conv_params,
    6053             :                     interinter_comp,
    6054             :                     bit_depth,
    6055             :                     1//plane=Cr  seg_mask is computed based on luma and used for chroma
    6056             :                 );
    6057             :             }
    6058             :             else
    6059             : #endif
    6060           0 :                 convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    6061             :                     src_ptr,
    6062             :                     src_stride,
    6063             :                     dst_ptr,
    6064             :                     dst_stride,
    6065           0 :                     blk_geom->bwidth_uv,
    6066           0 :                     blk_geom->bheight_uv,
    6067             :                     &filter_params_x,
    6068             :                     &filter_params_y,
    6069             :                     subpel_x,
    6070             :                     subpel_y,
    6071             :                     &conv_params,
    6072             :                     bit_depth);
    6073             :         }
    6074             :     }
    6075             : 
    6076             : #if INTER_INTRA_HBD
    6077           0 :     if ( is_interintra_used ) {
    6078           0 :         int32_t start_plane = 0;
    6079           0 :         int32_t end_plane = perform_chroma && blk_geom->has_uv ? MAX_MB_PLANE: 1;
    6080             :         // temp buffer for intra pred
    6081             :         DECLARE_ALIGNED(16, uint8_t, intra_pred[MAX_SB_SQUARE]);
    6082             :         DECLARE_ALIGNED(16, uint8_t, intra_pred_cb[MAX_SB_SQUARE]);
    6083             :         DECLARE_ALIGNED(16, uint8_t, intra_pred_cr[MAX_SB_SQUARE]);
    6084             : 
    6085             :         int32_t  intra_stride;
    6086             : 
    6087           0 :         for (int32_t plane = start_plane; plane < end_plane; ++plane) {
    6088             : 
    6089             :             EbPictureBufferDesc  intra_pred_desc;
    6090           0 :             intra_pred_desc.origin_x     = intra_pred_desc.origin_y  = 0;
    6091           0 :             intra_pred_desc.stride_y     = bwidth;
    6092           0 :             intra_pred_desc.stride_cb    = bwidth/2;
    6093           0 :             intra_pred_desc.stride_cr    = bwidth/2;
    6094           0 :             intra_pred_desc.buffer_y     = intra_pred;
    6095           0 :             intra_pred_desc.buffer_cb    = intra_pred_cb;
    6096           0 :             intra_pred_desc.buffer_cr    = intra_pred_cr;
    6097             : 
    6098           0 :             const int ssx = plane ? 1 : 0;
    6099           0 :             const int ssy = plane ? 1 : 0;
    6100           0 :             const BlockSize plane_bsize = get_plane_block_size(blk_geom->bsize, ssx, ssy);
    6101             :             //av1_build_interintra_predictors_sbp
    6102             :             uint16_t    topNeighArray[64 * 2 + 1];
    6103             :             uint16_t    leftNeighArray[64 * 2 + 1];
    6104             : 
    6105           0 :             uint32_t cu_originx_uv = (pu_origin_x >> 3 << 3) >> 1;
    6106           0 :             uint32_t cu_originy_uv = (pu_origin_y >> 3 << 3) >> 1;
    6107             : 
    6108           0 :             if (plane == 0) {
    6109           0 :                 dst_ptr = (uint16_t*)prediction_ptr->buffer_y + prediction_ptr->origin_x + dst_origin_x + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y;
    6110           0 :                 dst_stride = prediction_ptr->stride_y;
    6111           0 :                 intra_stride = intra_pred_desc.stride_y;
    6112             : 
    6113           0 :                 if (pu_origin_y != 0)
    6114           0 :                     memcpy(topNeighArray + 1, (uint16_t*)luma_recon_neighbor_array->top_array + pu_origin_x, blk_geom->bwidth * 2 * sizeof(uint16_t));
    6115             : 
    6116           0 :                 if (pu_origin_x != 0)
    6117           0 :                     memcpy(leftNeighArray + 1, (uint16_t*)luma_recon_neighbor_array->left_array + pu_origin_y, blk_geom->bheight * 2 * sizeof(uint16_t));
    6118             : 
    6119           0 :                 if (pu_origin_y != 0 && pu_origin_x != 0)
    6120           0 :                     topNeighArray[0] = leftNeighArray[0] = ((uint16_t*)luma_recon_neighbor_array->top_left_array)[MAX_PICTURE_HEIGHT_SIZE + pu_origin_x - pu_origin_y];
    6121             :             }
    6122             : 
    6123           0 :             else if (plane == 1) {
    6124           0 :                 dst_ptr = (uint16_t*)prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    6125           0 :                  dst_stride = prediction_ptr->stride_cb;
    6126           0 :                 intra_stride = intra_pred_desc.stride_cb;
    6127             : 
    6128           0 :                 if (cu_originy_uv != 0)
    6129           0 :                     memcpy(topNeighArray + 1, (uint16_t*)cb_recon_neighbor_array->top_array + cu_originx_uv, blk_geom->bwidth_uv * 2 * sizeof(uint16_t));
    6130             : 
    6131           0 :                 if (cu_originx_uv != 0)
    6132           0 :                     memcpy(leftNeighArray + 1, (uint16_t*)cb_recon_neighbor_array->left_array + cu_originy_uv, blk_geom->bheight_uv * 2 * sizeof(uint16_t));
    6133             : 
    6134           0 :                 if (cu_originy_uv != 0 && cu_originx_uv != 0)
    6135           0 :                     topNeighArray[0] = leftNeighArray[0] = ((uint16_t*)cb_recon_neighbor_array->top_left_array)[MAX_PICTURE_HEIGHT_SIZE / 2 + cu_originx_uv - cu_originy_uv / 2];
    6136             :             }
    6137             :             else {
    6138           0 :                 dst_ptr = (uint16_t*)prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    6139           0 :                  dst_stride = prediction_ptr->stride_cr;
    6140           0 :                  intra_stride = intra_pred_desc.stride_cr;
    6141             : 
    6142           0 :                 if (cu_originy_uv != 0)
    6143           0 :                     memcpy(topNeighArray + 1, (uint16_t*)cr_recon_neighbor_array->top_array + cu_originx_uv, blk_geom->bwidth_uv * 2 * sizeof(uint16_t));
    6144             : 
    6145           0 :                 if (cu_originx_uv != 0)
    6146           0 :                     memcpy(leftNeighArray + 1, (uint16_t*)cr_recon_neighbor_array->left_array + cu_originy_uv, blk_geom->bheight_uv * 2 * sizeof(uint16_t));
    6147             : 
    6148           0 :                 if (cu_originy_uv != 0 && cu_originx_uv != 0)
    6149           0 :                     topNeighArray[0] = leftNeighArray[0] = ((uint16_t*)cr_recon_neighbor_array->top_left_array)[MAX_PICTURE_HEIGHT_SIZE / 2 + cu_originx_uv - cu_originy_uv / 2];
    6150             :             }
    6151           0 :             TxSize  tx_size = blk_geom->txsize[0][0];               // Nader - Intra 128x128 not supported
    6152           0 :             TxSize  tx_size_Chroma = blk_geom->txsize_uv[0][0];     //Nader - Intra 128x128 not supported
    6153             : 
    6154           0 :             eb_av1_predict_intra_block_16bit(
    6155             :                 tile,
    6156             :                 !ED_STAGE,
    6157             :                 blk_geom,
    6158           0 :                 picture_control_set_ptr->parent_pcs_ptr->av1_cm,    //const Av1Common *cm,
    6159           0 :                 plane ? blk_geom->bwidth_uv : blk_geom->bwidth,     //int32_t wpx,
    6160           0 :                 plane ? blk_geom->bheight_uv : blk_geom->bheight,   //int32_t hpx,
    6161             :                 plane ? tx_size_Chroma : tx_size,                   //TxSize tx_size,
    6162           0 :                 interintra_to_intra_mode[interintra_mode],          //PredictionMode mode,
    6163             :                 0,
    6164             :                 0,                                                  //int32_t use_palette,
    6165             : #if PAL_SUP
    6166             :                 NULL,
    6167             : #endif
    6168             :                 FILTER_INTRA_MODES,                                 // FilterIntraMode filter_intra_mode,
    6169             :                 topNeighArray + 1,
    6170             :                 leftNeighArray + 1,
    6171             :                 &intra_pred_desc,                                   //uint8_t *dst,
    6172             :                                                                     //int32_t dst_stride,
    6173             :                 0,                                                  //int32_t col_off,
    6174             :                 0,                                                  //int32_t row_off,
    6175             :                 plane,                                              //int32_t plane,
    6176           0 :                 blk_geom->bsize,                                    //uint32_t puSize,
    6177             :                 dst_origin_x,
    6178             :                 dst_origin_y,
    6179             :                 pu_origin_x,
    6180             :                 pu_origin_y,
    6181             :                 0,                                                  //uint32_t cuOrgX used only for prediction Ptr
    6182             :                 0                                                   //uint32_t cuOrgY used only for prediction Ptr
    6183             :             );
    6184             : 
    6185             :             //combine_interintra_highbd
    6186           0 :             combine_interintra_highbd(
    6187             :                 interintra_mode,
    6188             :                 use_wedge_interintra,
    6189             :                 interintra_wedge_index,
    6190             :                 INTERINTRA_WEDGE_SIGN,
    6191           0 :                 blk_geom->bsize,
    6192             :                 plane_bsize,
    6193             :                 (uint8_t*)dst_ptr,
    6194             :                 dst_stride,
    6195             :                 (uint8_t*)dst_ptr,       // Inter pred buff
    6196             :                 dst_stride,    // Inter pred stride
    6197             :                 (plane == 0) ? intra_pred : (plane == 1) ? intra_pred_cb : intra_pred_cr,  // Intra pred buff
    6198             :                 intra_stride, // Intra pred stride
    6199             :                 bit_depth);
    6200             :         }
    6201             :     }
    6202             : #endif
    6203             :     #if OBMC_FLAG
    6204           0 :     if (motion_mode == OBMC_CAUSAL)
    6205             :     {
    6206             : 
    6207             :         uint16_t * tmp_obmc_bufs[2];
    6208             : 
    6209             :         DECLARE_ALIGNED(16, uint16_t, obmc_buff_0[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
    6210             :         DECLARE_ALIGNED(16, uint16_t, obmc_buff_1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
    6211           0 :         tmp_obmc_bufs[0] = (uint16_t*)obmc_buff_0;
    6212           0 :         tmp_obmc_bufs[1] = (uint16_t*)obmc_buff_1;
    6213             : 
    6214             :         uint16_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
    6215           0 :         int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
    6216           0 :         int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
    6217             : 
    6218             :         {
    6219           0 :             dst_buf1[0] = (uint16_t*)tmp_obmc_bufs[0];
    6220           0 :             dst_buf1[1] = (uint16_t*)tmp_obmc_bufs[0] + MAX_SB_SQUARE;
    6221           0 :             dst_buf1[2] = (uint16_t*)tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
    6222           0 :             dst_buf2[0] = (uint16_t*)tmp_obmc_bufs[1];
    6223           0 :             dst_buf2[1] = (uint16_t*)tmp_obmc_bufs[1] + MAX_SB_SQUARE;
    6224           0 :             dst_buf2[2] = (uint16_t*)tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
    6225             :         }
    6226             : 
    6227           0 :         int mi_row = pu_origin_y >> 2;
    6228           0 :         int mi_col = pu_origin_x >> 2;
    6229             : 
    6230           0 :         build_prediction_by_above_preds_hbd(
    6231             :             perform_chroma,
    6232           0 :             blk_geom->bsize, picture_control_set_ptr, cu_ptr->av1xd, mi_row, mi_col, dst_buf1,
    6233             :             dst_stride1);
    6234             : 
    6235           0 :         build_prediction_by_left_preds_hbd(
    6236             :             perform_chroma,
    6237           0 :             blk_geom->bsize, picture_control_set_ptr, cu_ptr->av1xd, mi_row, mi_col, dst_buf2,
    6238             :             dst_stride2);
    6239             : 
    6240           0 :         uint16_t * final_dst_ptr_y  = (uint16_t*) prediction_ptr->buffer_y + prediction_ptr->origin_x + dst_origin_x + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y;
    6241           0 :         uint16_t  final_dst_stride_y = prediction_ptr->stride_y;
    6242             : 
    6243           0 :         uint16_t * final_dst_ptr_u = (uint16_t*)prediction_ptr->buffer_cb + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb;
    6244           0 :         uint16_t  final_dst_stride_u = prediction_ptr->stride_cb;
    6245             : 
    6246           0 :         uint16_t * final_dst_ptr_v =  (uint16_t*)prediction_ptr->buffer_cr + (prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2 + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr;
    6247           0 :         uint16_t  final_dst_stride_v = prediction_ptr->stride_cr;
    6248             : 
    6249           0 :         av1_build_obmc_inter_prediction_hbd(
    6250             :             final_dst_ptr_y,
    6251             :             final_dst_stride_y,
    6252             :             final_dst_ptr_u,
    6253             :             final_dst_stride_u,
    6254             :             final_dst_ptr_v,
    6255             :             final_dst_stride_v,
    6256             :             perform_chroma,
    6257           0 :             blk_geom->bsize,
    6258             :             picture_control_set_ptr,
    6259             :             cu_ptr->av1xd,
    6260             :             mi_row,
    6261             :             mi_col,
    6262             :             dst_buf1,
    6263             :             dst_stride1,
    6264             :             dst_buf2,
    6265             :             dst_stride2);
    6266             : 
    6267             :     }
    6268             : #endif
    6269           0 :     return return_error;
    6270             : }
    6271             : 
    6272             : 
    6273      621713 : static void chroma_plane_warped_motion_prediction_sub8x8(
    6274             :     PictureControlSet *picture_control_set_ptr, // read only when is_compound: parent_pcs_ptr supplies seq_header, cur_order_hint and ref_order_hint[]
    6275             :     uint8_t compound_idx, // forwarded to av1_dist_wtd_comp_weight_assign()
    6276             :     CodingUnit *cu_ptr, // cu_ptr->av1xd is handed to the MV clamp
    6277             :     const BlockGeom *blk_geom, // bwidth_uv / bheight_uv drive the MV clamp and the filter selection
    6278             :     uint8_t bwidth, // block width passed to the convolve kernel
    6279             :     uint8_t bheight, // block height passed to the convolve kernel
    6280             :     uint8_t is_compound, // non-zero: also predict from list 1 and average into dst_ptr
    6281             :     uint8_t bit_depth, // EB_8BIT selects convolve[], anything else selects convolveHbd[]
    6282             :     int32_t src_stride, // one stride shared by both reference pointers
    6283             :     int32_t dst_stride, // stride of dst_ptr
    6284             :     uint8_t *src_ptr_l0, // list-0 reference samples; advanced by the integer MV below
    6285             :     uint8_t *src_ptr_l1, // list-1 reference samples; used only when is_compound
    6286             :     uint8_t *dst_ptr, // prediction output (reinterpreted as uint16_t* on the HBD path)
    6287             :     MvReferenceFrame rf[2], // rf[0] - 1 / rf[1] - 1 index ref_order_hint[] (compound path only)
    6288             :     MvUnit *mv_unit) { // source of the list-0 / list-1 motion vectors
                           // Chroma helper (per its name, for sub-8x8 blocks on the warped-motion path).
                           // It predicts with the plain convolve kernels and the block motion vectors;
                           // no warp routine is called here. List 0 is always predicted into dst_ptr.
                           // When is_compound is set, list 1 is predicted into the same dst_ptr with
                           // conv_params.do_average = 1.
                           // NOTE(review): the MV clamp and the filter selection use
                           // blk_geom->bwidth_uv / bheight_uv, while the kernels receive bwidth / bheight;
                           // presumably callers pass the same chroma dimensions - confirm.
    6289      621713 :     EbBool is16bit = (EbBool)(bit_depth > EB_8BIT);
    6290             :     DECLARE_ALIGNED(32, uint16_t, tmp_dst[64 * 64]); // scratch buffer handed to conv_params together with the value 64 (presumably its stride - confirm)
    6291      621713 :     const uint32_t interp_filters = 0; // fixed filter selector; NOTE(review): presumably the regular filter - confirm
    6292             :     InterpFilterParams filter_params_x, filter_params_y;
    6293             : 
    6294             :     MV  mv_l0;
    6295      621713 :     mv_l0.col = mv_unit->mv[REF_LIST_0].x;
    6296      621713 :     mv_l0.row = mv_unit->mv[REF_LIST_0].y;
    6297             : 
                           // Clamp the list-0 MV, then split it into a sub-pel phase (SUBPEL_MASK)
                           // and an integer displacement (>> SUBPEL_BITS).
    6298      621713 :     MV mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv_l0, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    6299      621718 :     int32_t subpel_x = mv_q4.col & SUBPEL_MASK;
    6300      621718 :     int32_t subpel_y = mv_q4.row & SUBPEL_MASK;
                           // src_ptr_l0 is a byte pointer, so the sample offset is doubled for 16-bit data.
    6301      621718 :     src_ptr_l0 = src_ptr_l0 + (is16bit ? 2 : 1) * ((mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS));
    6302      621718 :     ConvolveParams conv_params = get_conv_params_no_round(0, 0, 0, tmp_dst, 64, is_compound, bit_depth);
    6303             : 
    6304      621717 :     av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    6305      621717 :         &filter_params_y, blk_geom->bwidth_uv, blk_geom->bheight_uv);
    6306             : 
                           // Kernel table is indexed by [has x phase][has y phase][is_compound].
    6307      621712 :     if (bit_depth == EB_8BIT)
    6308      621712 :         convolve[subpel_x != 0][subpel_y != 0][is_compound](
    6309             :             src_ptr_l0,
    6310             :             src_stride,
    6311             :             dst_ptr,
    6312             :             dst_stride,
    6313             :             bwidth,
    6314             :             bheight,
    6315             :             &filter_params_x,
    6316             :             &filter_params_y,
    6317             :             subpel_x,
    6318             :             subpel_y,
    6319             :             &conv_params);
    6320             :     else
    6321           0 :         convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    6322             :             (uint16_t *)src_ptr_l0,
    6323             :             src_stride,
    6324             :             (uint16_t *)dst_ptr,
    6325             :             dst_stride,
    6326             :             bwidth,
    6327             :             bheight,
    6328             :             &filter_params_x,
    6329             :             &filter_params_y,
    6330             :             subpel_x,
    6331             :             subpel_y,
    6332             :             &conv_params,
    6333             :             bit_depth);
    6334             : 
    6335             :     //List1 pass (nothing below is specific to Cb; the plane is whichever one the pointers address)
    6336      621705 :     if (is_compound) {
    6337             :         MV  mv_l1;
    6338      419486 :         mv_l1.col = mv_unit->mv[REF_LIST_1].x;
    6339      419486 :         mv_l1.row = mv_unit->mv[REF_LIST_1].y;
    6340             : 
                               // Same clamp / phase / integer-offset split as for list 0, reusing mv_q4 and subpel_x/y.
    6341      419486 :         mv_q4 = clamp_mv_to_umv_border_sb(cu_ptr->av1xd, &mv_l1, blk_geom->bwidth_uv, blk_geom->bheight_uv, 1, 1);
    6342      419490 :         subpel_x = mv_q4.col & SUBPEL_MASK;
    6343      419490 :         subpel_y = mv_q4.row & SUBPEL_MASK;
    6344      419490 :         src_ptr_l1 = src_ptr_l1 + (is16bit ? 2 : 1) * ((mv_q4.row >> SUBPEL_BITS) * src_stride + (mv_q4.col >> SUBPEL_BITS));
    6345             : 
                               // Fills fwd_offset, bck_offset and use_dist_wtd_comp_avg in the same
                               // conv_params that was used for the list-0 pass.
    6346      419490 :         av1_dist_wtd_comp_weight_assign(
    6347      419490 :             &picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header,
    6348      419490 :             picture_control_set_ptr->parent_pcs_ptr->cur_order_hint,// cur_frame_index,
    6349      419490 :             picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[0] - 1],// bck_frame_index,
    6350      419490 :             picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[1] - 1],// fwd_frame_index,
    6351             :             compound_idx,
    6352             :             0,// order_idx,
    6353             :             &conv_params.fwd_offset, &conv_params.bck_offset,
    6354             :             &conv_params.use_dist_wtd_comp_avg, is_compound);
    6355      419490 :         conv_params.use_jnt_comp_avg = conv_params.use_dist_wtd_comp_avg; // keep both flags in step
                               // NOTE(review): same arguments as the call made before the list-0 pass;
                               // looks redundant unless the callee has side effects - confirm.
    6356      419490 :         av1_get_convolve_filter_params(interp_filters, &filter_params_x,
    6357      419490 :             &filter_params_y, blk_geom->bwidth_uv, blk_geom->bheight_uv);
    6358             : 
    6359      419486 :         conv_params.do_average = 1; // second pass combines with the list-0 result
    6360      419486 :         if (bit_depth == EB_8BIT)
    6361      419486 :             convolve[subpel_x != 0][subpel_y != 0][is_compound](
    6362             :                 src_ptr_l1,
    6363             :                 src_stride,
    6364             :                 dst_ptr,
    6365             :                 dst_stride,
    6366             :                 bwidth,
    6367             :                 bheight,
    6368             :                 &filter_params_x,//puSize > 8 ? &av1RegularFilter : &av1RegularFilterW4,
    6369             :                 &filter_params_y,//puSize > 8 ? &av1RegularFilter : &av1RegularFilterW4,
    6370             :                 subpel_x,
    6371             :                 subpel_y,
    6372             :                 &conv_params);
    6373             :         else
    6374           0 :             convolveHbd[subpel_x != 0][subpel_y != 0][is_compound](
    6375             :                 (uint16_t *)src_ptr_l1,
    6376             :                 src_stride,
    6377             :                 (uint16_t *)dst_ptr,
    6378             :                 dst_stride,
    6379             :                 bwidth,
    6380             :                 bheight,
    6381             :                 &filter_params_x,//puSize > 8 ? &av1RegularFilter : &av1RegularFilterW4,
    6382             :                 &filter_params_y,//puSize > 8 ? &av1RegularFilter : &av1RegularFilterW4,
    6383             :                 subpel_x,
    6384             :                 subpel_y,
    6385             :                 &conv_params,
    6386             :                 bit_depth);
    6387             :     }
    6388      621708 : }
    6389             : 
    6390             : 
    6391     5437110 : static void plane_warped_motion_prediction(
    6392             :     PictureControlSet *picture_control_set_ptr, // read only in the compound path (order hints, seq_header)
    6393             :     uint8_t compound_idx, // forwarded to av1_dist_wtd_comp_weight_assign()
    6394             :     InterInterCompoundData *interinter_comp, // dereferenced only in the compound path (->type picks masked vs. averaged)
    6395             :     uint16_t pu_origin_x, // forwarded to the warp / masked-warp calls
    6396             :     uint16_t pu_origin_y, // forwarded to the warp / masked-warp calls
    6397             :     const BlockGeom *blk_geom, // forwarded to the masked predictor only
    6398             :     uint8_t bwidth, // block width forwarded to the warp calls
    6399             :     uint8_t bheight, // block height forwarded to the warp calls
    6400             :     EbWarpedMotionParams *wm_params_l0, // warp model applied to src_ptr_l0
    6401             :     EbWarpedMotionParams *wm_params_l1, // warp model applied to src_ptr_l1 (compound path only)
    6402             :     uint8_t is_compound, // zero: single warp; non-zero: two references combined
    6403             :     uint8_t bit_depth, // values above EB_8BIT set the 16-bit flag given to eb_av1_warp_plane()
    6404             :     int32_t src_stride, // one stride shared by both reference pointers
    6405             :     int32_t dst_stride, // stride of dst_ptr
    6406             :     uint16_t buf_width, // reference buffer width forwarded to the warp calls
    6407             :     uint16_t buf_height, // reference buffer height forwarded to the warp calls
    6408             :     uint8_t ss_x, // forwarded to eb_av1_warp_plane(); presumably horizontal subsampling - confirm
    6409             :     uint8_t ss_y, // forwarded to eb_av1_warp_plane(); presumably vertical subsampling - confirm
    6410             :     uint8_t *src_ptr_l0, // list-0 reference samples
    6411             :     uint8_t *src_ptr_l1, // list-1 reference samples (compound path only)
    6412             :     uint8_t *dst_ptr, // prediction output, written by every path
    6413             :     uint8_t plane, // forwarded to the masked predictor only
    6414             :     MvReferenceFrame rf[2]) // rf[0] - 1 / rf[1] - 1 index ref_order_hint[] (compound path only)
    6415             : {
                           // Warped prediction of one plane.
                           //  - Single reference: one eb_av1_warp_plane() call writes dst_ptr.
                           //  - Compound: list 0 is warped with do_average = 0, then list 1 is either
                           //    handled by av1_make_masked_warp_inter_predictor() (masked compound
                           //    types) or warped with do_average = 1 into the same dst_ptr.
    6416     5437110 :     EbBool is16bit = (EbBool)(bit_depth > EB_8BIT);
    6417             : 
    6418     5437110 :     if (!is_compound) {
                               // No intermediate buffer is supplied here (NULL).
    6419     3306670 :         ConvolveParams conv_params = get_conv_params_no_round(0, 0, 0, NULL, 128, is_compound, bit_depth);
    6420             : 
    6421     3306760 :         eb_av1_warp_plane(
    6422             :             wm_params_l0,
    6423             :             (int) is16bit,
    6424             :             bit_depth,
    6425             :             src_ptr_l0,
    6426             :             (int) buf_width,
    6427             :             (int) buf_height,
    6428             :             src_stride,
    6429             :             dst_ptr,
    6430             :             pu_origin_x,
    6431             :             pu_origin_y,
    6432             :             bwidth,
    6433             :             bheight,
    6434             :             dst_stride,
    6435             :             ss_x,
    6436             :             ss_y,
    6437             :             &conv_params);
    6438             :     } else {
    6439             :         DECLARE_ALIGNED(32, uint16_t, tmp_dstY[128 * 128]);// 32 KiB of stack (128*128 uint16_t); move this to context if stack does not hold.
    6440             : 
    6441     2130440 :         ConvolveParams conv_params = get_conv_params_no_round(0, 0, 0, tmp_dstY, 128, is_compound, bit_depth);
                               // Fills fwd_offset, bck_offset and use_dist_wtd_comp_avg; the same conv_params
                               // object is then reused for both references.
    6442     2130920 :         av1_dist_wtd_comp_weight_assign(
    6443     2130920 :             &picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header,
    6444     2130920 :             picture_control_set_ptr->parent_pcs_ptr->cur_order_hint,// cur_frame_index,
    6445     2130920 :             picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[0] - 1],// bck_frame_index,
    6446     2130920 :             picture_control_set_ptr->parent_pcs_ptr->ref_order_hint[rf[1] - 1],// fwd_frame_index,
    6447             :             compound_idx,
    6448             :             0,// order_idx,
    6449             :             &conv_params.fwd_offset, &conv_params.bck_offset,
    6450             :             &conv_params.use_dist_wtd_comp_avg, is_compound);
    6451     2130880 :         conv_params.use_jnt_comp_avg = conv_params.use_dist_wtd_comp_avg; // keep both flags in step
    6452             : 
    6453     2130880 :         conv_params.do_average = 0; // first reference: no averaging yet
    6454     2130880 :         eb_av1_warp_plane(
    6455             :             wm_params_l0,
    6456             :             (int) is16bit,
    6457             :             bit_depth,
    6458             :             src_ptr_l0,
    6459             :             (int) buf_width,
    6460             :             (int) buf_height,
    6461             :             src_stride,
    6462             :             dst_ptr,
    6463             :             pu_origin_x,
    6464             :             pu_origin_y,
    6465             :             bwidth,
    6466             :             bheight,
    6467             :             dst_stride,
    6468             :             ss_x,
    6469             :             ss_y,
    6470             :             &conv_params);
    6471             : 
    6472     2130990 :         if (is_masked_compound_type(interinter_comp->type)) {
                                   // Masked compound: conv_params is passed on with do_average still 0.
    6473           0 :             av1_make_masked_warp_inter_predictor(
    6474             :                 src_ptr_l1,
    6475             :                 src_stride,
    6476             :                 buf_width,
    6477             :                 buf_height,
    6478             :                 dst_ptr,
    6479             :                 dst_stride,
    6480             :                 blk_geom,
    6481             :                 bwidth,
    6482             :                 bheight,
    6483             :                 &conv_params,
    6484             :                 interinter_comp,
    6485             :                 bit_depth,
    6486             :                 plane,
    6487             :                 pu_origin_x,
    6488             :                 pu_origin_y,
    6489             :                 wm_params_l1
    6490             :             );
    6491             :         } else {
    6492     2130960 :             conv_params.do_average = 1; // second reference is averaged with the list-0 result
    6493     2130960 :             eb_av1_warp_plane(
    6494             :                 wm_params_l1,
    6495             :                 (int) is16bit,
    6496             :                 bit_depth,
    6497             :                 src_ptr_l1,
    6498             :                 (int) buf_width,
    6499             :                 (int) buf_height,
    6500             :                 src_stride,
    6501             :                 dst_ptr,
    6502             :                 pu_origin_x,
    6503             :                 pu_origin_y,
    6504             :                 bwidth,
    6505             :                 bheight,
    6506             :                 dst_stride,
    6507             :                 ss_x,
    6508             :                 ss_y,
    6509             :                 &conv_params);
    6510             :         }
    6511             :     }
    6512     5437840 : }
    6513             : 
    6514             : 
    6515     5057210 : EbErrorType warped_motion_prediction(
    6516             :     PictureControlSet                    *picture_control_set_ptr,
    6517             :     MvUnit                               *mv_unit,
    6518             :     uint8_t                               ref_frame_type,
    6519             :     uint8_t                               compound_idx,
    6520             :     InterInterCompoundData               *interinter_comp,
    6521             :     uint16_t                              pu_origin_x,
    6522             :     uint16_t                              pu_origin_y,
    6523             :     CodingUnit                           *cu_ptr,
    6524             :     const BlockGeom                      *blk_geom,
    6525             :     EbPictureBufferDesc                  *ref_pic_list0,
    6526             :     EbPictureBufferDesc                  *ref_pic_list1,
    6527             :     EbPictureBufferDesc                  *prediction_ptr,
    6528             :     uint16_t                              dst_origin_x,
    6529             :     uint16_t                              dst_origin_y,
    6530             :     EbWarpedMotionParams                 *wm_params_l0,
    6531             :     EbWarpedMotionParams                 *wm_params_l1,
    6532             :     uint8_t                               bit_depth,
    6533             :     EbBool                                perform_chroma)
    6534             : {
    6535     5057210 :     EbErrorType  return_error = EB_ErrorNone;
    6536     5057210 :     uint8_t is_compound = (mv_unit->pred_direction == BI_PRED) ? 1 : 0;
    6537     5057210 :     EbBool is16bit = (EbBool)(bit_depth > EB_8BIT);
    6538             : 
    6539             :     int32_t src_stride;
    6540             :     int32_t dst_stride;
    6541             :     uint16_t buf_width;
    6542             :     uint16_t buf_height;
    6543     5057210 :     uint8_t ss_x = 1; // subsamplings
    6544     5057210 :     uint8_t ss_y = 1;
    6545             : 
    6546             :     MvReferenceFrame rf[2];
    6547     5057210 :     av1_set_ref_frame(rf, ref_frame_type);
    6548             : 
    6549             :     uint8_t *src_ptr_l0, *src_ptr_l1;
    6550             :     uint8_t *dst_ptr;
    6551     5056920 :     assert(ref_pic_list0 != NULL);
    6552             : 
    6553             :     // Y
    6554     5056990 :     src_ptr_l0 = ref_pic_list0->buffer_y + (is16bit ? 2 : 1)
    6555     5056990 :                  * (ref_pic_list0->origin_x + ref_pic_list0->origin_y * ref_pic_list0->stride_y);
    6556     1867670 :     src_ptr_l1 = is_compound ? ref_pic_list1->buffer_y + (is16bit ? 2 : 1)
    6557     1867670 :                                * (ref_pic_list1->origin_x + ref_pic_list1->origin_y * ref_pic_list1->stride_y)
    6558     6924660 :                              : NULL;
    6559     5056990 :     src_stride = ref_pic_list0->stride_y;
    6560     5056990 :     buf_width = ref_pic_list0->width;
    6561     5056990 :     buf_height = ref_pic_list0->height;
    6562             : 
    6563     5056990 :     dst_ptr = prediction_ptr->buffer_y + (is16bit ? 2 : 1)
    6564     5056990 :               * (prediction_ptr->origin_x + dst_origin_x
    6565     5056990 :                  + (prediction_ptr->origin_y + dst_origin_y) * prediction_ptr->stride_y);
    6566     5056990 :     dst_stride = prediction_ptr->stride_y;
    6567             : 
    6568             :     // Warp plane
    6569     5056990 :     plane_warped_motion_prediction(
    6570             :         picture_control_set_ptr,
    6571             :         compound_idx,
    6572             :         interinter_comp,
    6573             :         pu_origin_x,
    6574             :         pu_origin_y,
    6575             :         blk_geom,
    6576     5056990 :         blk_geom->bwidth,
    6577     5056990 :         blk_geom->bheight,
    6578             :         wm_params_l0,
    6579             :         wm_params_l1,
    6580             :         is_compound,
    6581             :         bit_depth,
    6582             :         src_stride,
    6583             :         dst_stride,
    6584             :         buf_width,
    6585             :         buf_height,
    6586             :         0,
    6587             :         0,
    6588             :         src_ptr_l0,
    6589             :         src_ptr_l1,
    6590             :         dst_ptr,
    6591             :         0, // plane
    6592             :         rf);
    6593             : 
    6594     5057460 :     if (!blk_geom->has_uv)
    6595           0 :         return return_error;
    6596             : 
    6597     5057460 :     if (perform_chroma) {
    6598      500978 :         if (blk_geom->bwidth >= 16 && blk_geom->bheight >= 16) {
    6599             :             // Cb
    6600      190118 :             src_ptr_l0 = ref_pic_list0->buffer_cb + (is16bit ? 2 : 1)
    6601      190118 :                          * (ref_pic_list0->origin_x / 2
    6602      190118 :                             + (ref_pic_list0->origin_y / 2) * ref_pic_list0->stride_cb);
    6603      131638 :             src_ptr_l1 = is_compound ? ref_pic_list1->buffer_cb + (is16bit ? 2 : 1)
    6604      131638 :                                        * (ref_pic_list1->origin_x / 2
    6605      131638 :                                           + (ref_pic_list1->origin_y / 2 ) * ref_pic_list1->stride_cb)
    6606      321756 :                                      : NULL;
    6607      190118 :             src_stride = ref_pic_list0->stride_cb;
    6608             : 
    6609      190118 :             dst_ptr = prediction_ptr->buffer_cb + (is16bit ? 2 : 1)
    6610      190118 :                       * ((prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2
    6611      190118 :                          + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb);
    6612      190118 :             dst_stride = prediction_ptr->stride_cb;
    6613             : 
    6614      190118 :             plane_warped_motion_prediction(
    6615             :                 picture_control_set_ptr,
    6616             :                 compound_idx,
    6617             :                 interinter_comp,
    6618      190118 :                 pu_origin_x >> ss_x,
    6619      190118 :                 pu_origin_y >> ss_y,
    6620             :                 blk_geom,
    6621      190118 :                 blk_geom->bwidth_uv,
    6622      190118 :                 blk_geom->bheight_uv,
    6623             :                 wm_params_l0,
    6624             :                 wm_params_l1,
    6625             :                 is_compound,
    6626             :                 bit_depth,
    6627             :                 src_stride,
    6628             :                 dst_stride,
    6629      190118 :                 buf_width >> ss_x,
    6630      190118 :                 buf_height >> ss_y,
    6631             :                 ss_x,
    6632             :                 ss_y,
    6633             :                 src_ptr_l0,
    6634             :                 src_ptr_l1,
    6635             :                 dst_ptr,
    6636             :                 1, // plane
    6637             :                 rf);
    6638             : 
    6639             :             // Cr
    6640      190117 :             src_ptr_l0 = ref_pic_list0->buffer_cr + (is16bit ? 2 : 1)
    6641      190117 :                          * (ref_pic_list0->origin_x / 2
    6642      190117 :                             + (ref_pic_list0->origin_y / 2 ) * ref_pic_list0->stride_cr);
    6643      131637 :             src_ptr_l1 = is_compound ? ref_pic_list1->buffer_cr + (is16bit ? 2 : 1)
    6644      131637 :                                        * (ref_pic_list1->origin_x / 2
    6645      131637 :                                           + (ref_pic_list1->origin_y / 2 ) * ref_pic_list1->stride_cr)
    6646      321754 :                                      : NULL;
    6647      190117 :             src_stride = ref_pic_list0->stride_cr;
    6648             : 
    6649      190117 :             dst_ptr = prediction_ptr->buffer_cr + (is16bit ? 2 : 1)
    6650      190117 :                       * ((prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2
    6651      190117 :                          + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cr);
    6652      190117 :             dst_stride = prediction_ptr->stride_cr;
    6653             : 
    6654      190117 :             plane_warped_motion_prediction(
    6655             :                 picture_control_set_ptr,
    6656             :                 compound_idx,
    6657             :                 interinter_comp,
    6658      190117 :                 pu_origin_x >> ss_x,
    6659      190117 :                 pu_origin_y >> ss_y,
    6660             :                 blk_geom,
    6661      190117 :                 blk_geom->bwidth_uv,
    6662      190117 :                 blk_geom->bheight_uv,
    6663             :                 wm_params_l0,
    6664             :                 wm_params_l1,
    6665             :                 is_compound,
    6666             :                 bit_depth,
    6667             :                 src_stride,
    6668             :                 dst_stride,
    6669      190117 :                 buf_width >> ss_x,
    6670      190117 :                 buf_height >> ss_y,
    6671             :                 ss_x,
    6672             :                 ss_y,
    6673             :                 src_ptr_l0,
    6674             :                 src_ptr_l1,
    6675             :                 dst_ptr,
    6676             :                 2, // plane
    6677             :                 rf);
    6678             : 
    6679             :         } else { // Translation prediction when chroma block is smaller than 8x8
    6680             : 
    6681             :             // Cb
    6682      310860 :             src_ptr_l0 = ref_pic_list0->buffer_cb + (is16bit ? 2 : 1)
    6683      310860 :                          * ((ref_pic_list0->origin_x + ((pu_origin_x >> 3) << 3)) / 2
    6684      310860 :                             + (ref_pic_list0->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list0->stride_cb);
    6685      209746 :             src_ptr_l1 = is_compound ? ref_pic_list1->buffer_cb + (is16bit ? 2 : 1)
    6686      209746 :                                        * ((ref_pic_list1->origin_x + ((pu_origin_x >> 3) << 3)) / 2
    6687      209746 :                                           + (ref_pic_list1->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list1->stride_cb)
    6688      520606 :                                      : NULL;
    6689      310860 :             dst_ptr = prediction_ptr->buffer_cb + (is16bit ? 2 : 1)
    6690      310860 :                       * ((prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2
    6691      310860 :                          + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb);
    6692      310860 :             src_stride = ref_pic_list0->stride_cb;
    6693      310860 :             dst_stride = prediction_ptr->stride_cb;
    6694             : 
    6695      310860 :             chroma_plane_warped_motion_prediction_sub8x8(
    6696             :                 picture_control_set_ptr,
    6697             :                 compound_idx,
    6698             :                 cu_ptr,
    6699             :                 blk_geom,
    6700      310860 :                 blk_geom->bwidth_uv,
    6701      310860 :                 blk_geom->bheight_uv,
    6702             :                 is_compound,
    6703             :                 bit_depth,
    6704             :                 src_stride,
    6705             :                 dst_stride,
    6706             :                 src_ptr_l0,
    6707             :                 src_ptr_l1,
    6708             :                 dst_ptr,
    6709             :                 rf,
    6710             :                 mv_unit);
    6711             : 
    6712             :             // Cr
    6713      310860 :             src_ptr_l0 = ref_pic_list0->buffer_cr + (is16bit ? 2 : 1)
    6714      310860 :                          * ((ref_pic_list0->origin_x + ((pu_origin_x >> 3) << 3)) / 2
    6715      310860 :                             + (ref_pic_list0->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list0->stride_cr);
    6716      209746 :             src_ptr_l1 = is_compound ? ref_pic_list1->buffer_cr + (is16bit ? 2 : 1)
    6717      209746 :                                        * ((ref_pic_list1->origin_x + ((pu_origin_x >> 3) << 3)) / 2
    6718      209746 :                                           + (ref_pic_list1->origin_y + ((pu_origin_y >> 3) << 3)) / 2 * ref_pic_list1->stride_cr)
    6719      520606 :                                      : NULL;
    6720      310860 :             dst_ptr = prediction_ptr->buffer_cr + (is16bit ? 2 : 1)
    6721      310860 :                       * ((prediction_ptr->origin_x + ((dst_origin_x >> 3) << 3)) / 2
    6722      310860 :                          + (prediction_ptr->origin_y + ((dst_origin_y >> 3) << 3)) / 2 * prediction_ptr->stride_cb);
    6723      310860 :             src_stride = ref_pic_list0->stride_cr;
    6724      310860 :             dst_stride = prediction_ptr->stride_cr;
    6725             : 
    6726      310860 :             chroma_plane_warped_motion_prediction_sub8x8(
    6727             :                 picture_control_set_ptr,
    6728             :                 compound_idx,
    6729             :                 cu_ptr,
    6730             :                 blk_geom,
    6731      310860 :                 blk_geom->bwidth_uv,
    6732      310860 :                 blk_geom->bheight_uv,
    6733             :                 is_compound,
    6734             :                 bit_depth,
    6735             :                 src_stride,
    6736             :                 dst_stride,
    6737             :                 src_ptr_l0,
    6738             :                 src_ptr_l1,
    6739             :                 dst_ptr,
    6740             :                 rf,
    6741             :                 mv_unit);
    6742             :         }
    6743             :     }
    6744             : 
    6745     5057460 :     return return_error;
    6746             : }
    6747             : 
    6748             : 
    6749             : #define SWITCHABLE_INTERP_RATE_FACTOR 1
    6750             : extern int32_t eb_av1_get_pred_context_switchable_interp(
    6751             :     NeighborArrayUnit     *ref_frame_type_neighbor_array,
    6752             :     MvReferenceFrame rf0,
    6753             :     MvReferenceFrame rf1,
    6754             :     NeighborArrayUnit32     *interpolation_type_neighbor_array,
    6755             :     uint32_t cu_origin_x,
    6756             :     uint32_t cu_origin_y,
    6757             :     int32_t dir
    6758             : );
    6759             : 
    6760    66250300 : int32_t eb_av1_get_switchable_rate(
    6761             :     ModeDecisionCandidateBuffer *candidate_buffer_ptr,
    6762             :     const Av1Common *const cm,
    6763             :     ModeDecisionContext *md_context_ptr)
    6764             : {
    6765    66250300 :     if (cm->interp_filter == SWITCHABLE) {
    6766    66254000 :         int32_t inter_filter_cost = 0;
    6767             :         int32_t dir;
    6768             : 
    6769   198361000 :         for (dir = 0; dir < 2; ++dir) {
    6770             :             MvReferenceFrame rf[2];
    6771   132096000 :             av1_set_ref_frame(rf, candidate_buffer_ptr->candidate_ptr->ref_frame_type);
    6772   132081000 :             const int32_t ctx = eb_av1_get_pred_context_switchable_interp(
    6773             :                 md_context_ptr->ref_frame_type_neighbor_array,
    6774   132081000 :                 rf[0],
    6775   132081000 :                 rf[1],
    6776             :                 md_context_ptr->interpolation_type_neighbor_array,
    6777   132081000 :                 md_context_ptr->cu_origin_x,
    6778   132081000 :                 md_context_ptr->cu_origin_y,
    6779             :                 dir
    6780             :             );
    6781             : 
    6782   132112000 :             const InterpFilter filter = av1_extract_interp_filter(/*mbmi*/candidate_buffer_ptr->candidate_ptr->interp_filters, dir);
    6783   132107000 :             assert(ctx < SWITCHABLE_FILTER_CONTEXTS);
    6784   132107000 :             assert(filter < SWITCHABLE_FILTERS);
    6785   132107000 :             inter_filter_cost +=  /*x->switchable_interp_costs*/md_context_ptr->md_rate_estimation_ptr->switchable_interp_fac_bitss[ctx][filter];
    6786             :         }
    6787    66265100 :         return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
    6788             :     }
    6789             :     else
    6790           0 :         return 0;
    6791             : }
    6792             : 
    6793             : //void model_rd_norm(int32_t xsq_q10, int32_t *r_q10, int32_t *d_q10) {
    6794             :  // NOTE: The tables below must be of the same size.
    6795             : 
    6796             :  // The functions described below are sampled at the four most significant
    6797             :  // bits of x^2 + 8 / 256.
    6798             : 
    6799           0 : void highbd_variance64_c(const uint8_t *a8, int32_t a_stride,
    6800             :     const uint8_t *b8, int32_t b_stride, int32_t w, int32_t h,
    6801             :     uint64_t *sse) {
    6802           0 :     const uint8_t *a = a8;//CONVERT_TO_SHORTPTR(a8);
    6803           0 :     const uint8_t *b = b8;//CONVERT_TO_SHORTPTR(b8);
    6804           0 :     uint64_t tsse = 0;
    6805           0 :     for (int32_t i = 0; i < h; ++i) {
    6806           0 :         for (int32_t j = 0; j < w; ++j) {
    6807           0 :             const int32_t diff = a[j] - b[j];
    6808           0 :             tsse += (uint32_t)(diff * diff);
    6809             :         }
    6810           0 :         a += a_stride;
    6811           0 :         b += b_stride;
    6812             :     }
    6813           0 :     *sse = tsse;
    6814           0 : }
    6815             : 
    6816             : #define RDDIV_BITS 7
    6817             : #define RDCOST(RM, R, D)                                            \
    6818             :   (ROUND_POWER_OF_TWO(((uint64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
    6819             :    ((D) * (1 << RDDIV_BITS)))
    6820             : 
    6821    66339100 : static void model_rd_norm(int32_t xsq_q10, int32_t *r_q10, int32_t *d_q10) {
    6822             :     // NOTE: The tables below must be of the same size.
    6823             : 
    6824             :     // The functions described below are sampled at the four most significant
    6825             :     // bits of x^2 + 8 / 256.
    6826             : 
    6827             :     // Normalized rate:
    6828             :     // This table models the rate for a Laplacian source with given variance
    6829             :     // when quantized with a uniform quantizer with given stepsize. The
    6830             :     // closed form expression is:
    6831             :     // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
    6832             :     // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
    6833             :     // and H(x) is the binary entropy function.
    6834             :     static const int32_t rate_tab_q10[] = {
    6835             :       65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
    6836             :       4044,  3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
    6837             :       3133,  3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
    6838             :       2290,  2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
    6839             :       1608,  1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
    6840             :       911,   864,  821,  781,  745,  680,  623,  574,  530,  490,  455,  424,
    6841             :       395,   345,  304,  269,  239,  213,  190,  171,  154,  126,  104,  87,
    6842             :       73,    61,   52,   44,   38,   28,   21,   16,   12,   10,   8,    6,
    6843             :       5,     3,    2,    1,    1,    1,    0,    0,
    6844             :     };
    6845             :     // Normalized distortion:
    6846             :     // This table models the normalized distortion for a Laplacian source
    6847             :     // with given variance when quantized with a uniform quantizer
    6848             :     // with given stepsize. The closed form expression is:
    6849             :     // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
    6850             :     // where x = qpstep / sqrt(variance).
    6851             :     // Note the actual distortion is Dn * variance.
    6852             :     static const int32_t dist_tab_q10[] = {
    6853             :       0,    0,    1,    1,    1,    2,    2,    2,    3,    3,    4,    5,
    6854             :       5,    6,    7,    7,    8,    9,    11,   12,   13,   15,   16,   17,
    6855             :       18,   21,   24,   26,   29,   31,   34,   36,   39,   44,   49,   54,
    6856             :       59,   64,   69,   73,   78,   88,   97,   106,  115,  124,  133,  142,
    6857             :       151,  167,  184,  200,  215,  231,  245,  260,  274,  301,  327,  351,
    6858             :       375,  397,  418,  439,  458,  495,  528,  559,  587,  613,  637,  659,
    6859             :       680,  717,  749,  777,  801,  823,  842,  859,  874,  899,  919,  936,
    6860             :       949,  960,  969,  977,  983,  994,  1001, 1006, 1010, 1013, 1015, 1017,
    6861             :       1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
    6862             :     };
    6863             :     static const int32_t xsq_iq_q10[] = {
    6864             :       0,      4,      8,      12,     16,     20,     24,     28,     32,
    6865             :       40,     48,     56,     64,     72,     80,     88,     96,     112,
    6866             :       128,    144,    160,    176,    192,    208,    224,    256,    288,
    6867             :       320,    352,    384,    416,    448,    480,    544,    608,    672,
    6868             :       736,    800,    864,    928,    992,    1120,   1248,   1376,   1504,
    6869             :       1632,   1760,   1888,   2016,   2272,   2528,   2784,   3040,   3296,
    6870             :       3552,   3808,   4064,   4576,   5088,   5600,   6112,   6624,   7136,
    6871             :       7648,   8160,   9184,   10208,  11232,  12256,  13280,  14304,  15328,
    6872             :       16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,  32736,
    6873             :       36832,  40928,  45024,  49120,  53216,  57312,  61408,  65504,  73696,
    6874             :       81888,  90080,  98272,  106464, 114656, 122848, 131040, 147424, 163808,
    6875             :       180192, 196576, 212960, 229344, 245728,
    6876             :     };
    6877    66339100 :     const int32_t tmp = (xsq_q10 >> 2) + 8;
    6878    66339100 :     const int32_t k = get_msb(tmp) - 3;
    6879    66326600 :     const int32_t xq = (k << 3) + ((tmp >> k) & 0x7);
    6880    66326600 :     const int32_t one_q10 = 1 << 10;
    6881    66326600 :     const int32_t a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
    6882    66326600 :     const int32_t b_q10 = one_q10 - a_q10;
    6883    66326600 :     *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
    6884    66326600 :     *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
    6885    66326600 : }
    6886             : 
    6887    66283600 : void eb_av1_model_rd_from_var_lapndz(int64_t var, uint32_t n_log2,
    6888             :     uint32_t qstep, int32_t *rate,
    6889             :     int64_t *dist) {
    6890             :     // This function models the rate and distortion for a Laplacian
    6891             :     // source with given variance when quantized with a uniform quantizer
    6892             :     // with given stepsize. The closed form expressions are in:
    6893             :     // Hang and Chen, "Source Model for transform video coder and its
    6894             :     // application - Part I: Fundamental Theory", IEEE Trans. Circ.
    6895             :     // Sys. for Video Tech., April 1997.
    6896    66283600 :     if (var == 0) {
    6897         596 :         *rate = 0;
    6898         596 :         *dist = 0;
    6899             :     }
    6900             :     else {
    6901             :         int32_t d_q10, r_q10;
    6902             :         static const uint32_t MAX_XSQ_Q10 = 245727;
    6903    66283000 :         const uint64_t xsq_q10_64 =
    6904    66283000 :             (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
    6905    66283000 :         const int32_t xsq_q10 = (int32_t)MIN(xsq_q10_64, MAX_XSQ_Q10);
    6906    66283000 :         model_rd_norm(xsq_q10, &r_q10, &d_q10);
    6907    66342200 :         *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT);
    6908    66342200 :         *dist = (var * (int64_t)d_q10 + 512) >> 10;
    6909             :     }
    6910    66342700 : }
    6911             : 
    6912    66285500 : void model_rd_from_sse(
    6913             :     BlockSize bsize,
    6914             :     int16_t quantizer,
    6915             :     uint8_t bit_depth,
    6916             :     uint64_t sse,
    6917             :     uint32_t *rate,
    6918             :     uint64_t *dist)
    6919             : {
    6920             :     /* OMK (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :3;*/
    6921    66285500 :     int32_t dequant_shift = bit_depth - 5;
    6922             : 
    6923             :     // Fast approximate the modelling function.
    6924             :     if (0/*cpi->sf.simple_model_rd_from_var*/)
    6925             :     {
    6926             :         int64_t square_error = (uint64_t)sse;
    6927             :         quantizer = quantizer >> dequant_shift;
    6928             : 
    6929             :         if (quantizer < 120)
    6930             :             *rate = (int32_t)((square_error * (280 - quantizer)) >>
    6931             :             (16 - AV1_PROB_COST_SHIFT));
    6932             :         else
    6933             :             *rate = 0;
    6934             :         *dist = (uint64_t)(square_error * quantizer) >> 8;
    6935             :     } else {
    6936    66285500 :         eb_av1_model_rd_from_var_lapndz((uint64_t)sse, num_pels_log2_lookup[bsize],
    6937    66285500 :             quantizer >> dequant_shift, (int32_t*)rate,
    6938             :             (int64_t*)dist);
    6939             :     }
    6940             : 
    6941    66342400 :     *dist <<= 4;
    6942    66342400 : }
    6943             : 
    6944    66236800 : extern void model_rd_for_sb(
    6945             :     PictureControlSet *picture_control_set_ptr,
    6946             :     EbPictureBufferDesc *prediction_ptr,
    6947             :     ModeDecisionContext *md_context_ptr,
    6948             :     int32_t plane_from,
    6949             :     int32_t plane_to,
    6950             :     int32_t *out_rate_sum,
    6951             :     int64_t *out_dist_sum,
    6952             :     uint8_t bit_depth)
    6953             : {
    6954             :     // Note our transform coeffs are 8 times an orthogonal transform.
    6955             :     // Hence quantizer step is also 8 times. To get effective quantizer
    6956             :     // we need to divide by 8 before sending to modeling function.
    6957             :     int32_t plane;
    6958             : 
    6959    66236800 :     uint64_t rate_sum = 0;
    6960    66236800 :     uint64_t dist_sum = 0;
    6961    66236800 :     uint64_t total_sse = 0;
    6962             : 
    6963    66236800 :     EbPictureBufferDesc *input_picture_ptr = bit_depth > 8 ? picture_control_set_ptr->input_frame16bit : picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
    6964    66236800 :     const uint32_t input_offset = (md_context_ptr->cu_origin_y + input_picture_ptr->origin_y) * input_picture_ptr->stride_y + (md_context_ptr->cu_origin_x + input_picture_ptr->origin_x);
    6965    66236800 :     const uint32_t input_chroma_offset = ((md_context_ptr->cu_origin_y + input_picture_ptr->origin_y) * input_picture_ptr->stride_cb + (md_context_ptr->cu_origin_x + input_picture_ptr->origin_x)) / 2;
    6966    66236800 :     const uint32_t prediction_offset = prediction_ptr->origin_x + md_context_ptr->blk_geom->origin_x + (prediction_ptr->origin_y + md_context_ptr->blk_geom->origin_y) * prediction_ptr->stride_y;
    6967    66236800 :     const uint32_t prediction_chroma_offset = (prediction_ptr->origin_x + md_context_ptr->blk_geom->origin_x + (prediction_ptr->origin_y + md_context_ptr->blk_geom->origin_y) * prediction_ptr->stride_cb) / 2;
    6968             : 
    6969    66236800 :     EbSpatialFullDistType spatial_full_dist_type_fun = bit_depth > 8 ?
    6970    66236800 :         full_distortion_kernel16_bits : spatial_full_distortion_kernel;
    6971             : 
    6972   132580000 :     for (plane = plane_from; plane <= plane_to; ++plane) {
    6973             :         uint64_t sse;
    6974             :         uint32_t rate;
    6975             :         uint64_t dist;
    6976             : 
    6977    66301200 :         if (plane == 0) {
    6978    66301200 :             sse = spatial_full_dist_type_fun(
    6979             :                 input_picture_ptr->buffer_y,
    6980             :                 input_offset,
    6981    66301200 :                 input_picture_ptr->stride_y,
    6982             :                 prediction_ptr->buffer_y,
    6983             :                 prediction_offset,
    6984    66301200 :                 prediction_ptr->stride_y,
    6985    66301200 :                 md_context_ptr->blk_geom->bwidth,
    6986    66301200 :                 md_context_ptr->blk_geom->bheight);
    6987             :         }
    6988           0 :         else if (plane == 1) {
    6989           0 :             sse = spatial_full_dist_type_fun(
    6990             :                 input_picture_ptr->buffer_cb,
    6991             :                 input_chroma_offset,
    6992           0 :                 input_picture_ptr->stride_cb,
    6993             :                 prediction_ptr->buffer_cb,
    6994             :                 prediction_chroma_offset,
    6995           0 :                 prediction_ptr->stride_cb,
    6996           0 :                 md_context_ptr->blk_geom->bwidth_uv,
    6997           0 :                 md_context_ptr->blk_geom->bheight_uv);
    6998             :         } else {
    6999           0 :             sse = spatial_full_dist_type_fun(
    7000             :                 input_picture_ptr->buffer_cr,
    7001             :                 input_chroma_offset,
    7002           0 :                 input_picture_ptr->stride_cr,
    7003             :                 prediction_ptr->buffer_cr,
    7004             :                 prediction_chroma_offset,
    7005           0 :                 prediction_ptr->stride_cr,
    7006           0 :                 md_context_ptr->blk_geom->bwidth_uv,
    7007           0 :                 md_context_ptr->blk_geom->bheight_uv);
    7008             :         }
    7009             : 
    7010    66295600 :         sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
    7011    66295600 :         total_sse += sse;
    7012             : 
    7013    66295600 :         int32_t current_q_index = picture_control_set_ptr->
    7014    66295600 :             parent_pcs_ptr->frm_hdr.quantization_params.base_q_idx;
    7015    66295600 :         Dequants *const dequants = &picture_control_set_ptr->parent_pcs_ptr->deq;
    7016             : 
    7017    66295600 :         int16_t quantizer = dequants->y_dequant_Q3[current_q_index][1];
    7018   132591000 :         model_rd_from_sse(
    7019    66295600 :             plane == 0 ? md_context_ptr->blk_geom->bsize : md_context_ptr->blk_geom->bsize_uv,
    7020             :             quantizer,
    7021             :             bit_depth,
    7022             :             sse,
    7023             :             &rate,
    7024             :             &dist);
    7025             : 
    7026    66343100 :         rate_sum += rate;
    7027    66343100 :         dist_sum += dist;
    7028             :     }
    7029             : 
    7030             :     //*skip_txfm_sb = total_sse == 0;
    7031             :     //*skip_sse_sb = total_sse << 4;
    7032    66278700 :     *out_rate_sum = (int32_t)rate_sum;
    7033    66278700 :     *out_dist_sum = dist_sum;
    7034    66278700 : }
    7035             : 
    7036             : /* Returns 1 when the candidate's pred_mode is GLOBALMV or GLOBAL_GLOBALMV, both mi_size_wide[sb_type] and mi_size_high[sb_type] are at least 2, and none of the references it uses has a global-motion model of type TRANSLATION; returns 0 otherwise. */
    7037    22012900 : int32_t is_nontrans_global_motion(
    7038             :     BlockSize sb_type,
    7039             :     ModeDecisionCandidateBuffer *candidate_buffer_ptr,
    7040             :     PictureControlSet *picture_control_set_ptr)
    7041             : {
    7042             :     int32_t ref; // reference slot being checked: 0, plus 1 when is_compound is set
    7043             : 
    7044             :     // Reject any candidate whose prediction mode is neither GLOBALMV nor GLOBAL_GLOBALMV
    7045    22012900 :     if (candidate_buffer_ptr->candidate_ptr->pred_mode != GLOBALMV && candidate_buffer_ptr->candidate_ptr->pred_mode != GLOBAL_GLOBALMV)
    7046    22013700 :         return 0;
    7047             :     // Reject blocks whose smaller mi_size_wide/mi_size_high entry is below 2
    7048           0 :     if (MIN(mi_size_wide[sb_type], mi_size_high[sb_type]) < 2)
    7049           0 :         return 0;
    7050             :     MvReferenceFrame rf[2];
    7051           0 :     av1_set_ref_frame(rf, candidate_buffer_ptr->candidate_ptr->ref_frame_type);
    7052             :     // Every reference used by the candidate (rf[0], and rf[1] when compound) must have a non-TRANSLATION model
    7053           0 :     for (ref = 0; ref < 1 + candidate_buffer_ptr->candidate_ptr->is_compound/*has_second_ref(mbmi)*/; ++ref) {
    7054           0 :         if (picture_control_set_ptr->parent_pcs_ptr->global_motion[ref ? rf[1] : rf[0]].wmtype == TRANSLATION)
    7055             :             //if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION)
    7056           0 :             return 0;
    7057             :     }
    7058           0 :     return 1;
    7059             : }
    7060    22361200 : static INLINE int32_t av1_is_interp_needed( /* Returns 1 when an interpolation filter search should run for this candidate, 0 when one of the three checks below rules it out. */
    7061             :     ModeDecisionCandidateBuffer *candidate_buffer_ptr,
    7062             :     PictureControlSet *picture_control_set_ptr,
    7063             :     BlockSize bsize)
    7064             : {
    7065    22361200 :     if (candidate_buffer_ptr->candidate_ptr->merge_flag) // candidates with merge_flag set are not searched
    7066      348252 :         return 0;
    7067             :     // WARPED_CAUSAL candidates are not searched
    7068    22013000 :     if (candidate_buffer_ptr->candidate_ptr->motion_mode == WARPED_CAUSAL)
    7069           0 :         return 0;
    7070             :     // Global-motion candidates with only non-translational models are not searched
    7071    22013000 :     if (is_nontrans_global_motion( bsize,
    7072             :         candidate_buffer_ptr, picture_control_set_ptr))
    7073           0 :         return 0;
    7074             : 
    7075    22014100 :     return 1;
    7076             : }
    7077             : /* Table of filter-index pairs; interpolation_filter_search() passes filter_sets[i][0] and filter_sets[i][1] to av1_make_interp_filters(). The nine initializers enumerate every pair over {0, 1, 2}, so SWITCHABLE_FILTERS is presumably 3 - confirm against its definition. */
    7078             : #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
    7079             : static const int32_t filter_sets[DUAL_FILTER_SET_SIZE][2] = {
    7080             :   { 0, 0 }, { 0, 1 }, { 0, 2 }, { 1, 0 }, { 1, 1 },
    7081             :   { 1, 2 }, { 2, 0 }, { 2, 1 }, { 2, 2 },
    7082             : };
    7083             : /* Interpolation filter search for one inter candidate. Builds the prediction with a starting filter pair, models its rate/distortion with model_rd_for_sb(), then (only when the frame-level filter is SWITCHABLE) tries further pairs from filter_sets and leaves the lowest-RD-cost pair in candidate_ptr->interp_filters, or 0 when av1_is_interp_needed() returns 0. The final rd and switchable_rate values are not returned. NOTE(review): prediction_ptr is presumably overwritten by every trial prediction - confirm against the prediction function. */
    7084    22355900 : void interpolation_filter_search(
    7085             :     PictureControlSet *picture_control_set_ptr,
    7086             :     EbPictureBufferDesc *prediction_ptr,
    7087             :     ModeDecisionContext *md_context_ptr,
    7088             :     ModeDecisionCandidateBuffer *candidate_buffer_ptr,
    7089             :     MvUnit mv_unit,
    7090             :     EbPictureBufferDesc  *ref_pic_list0,
    7091             :     EbPictureBufferDesc  *ref_pic_list1,
    7092             :     uint8_t hbd_mode_decision,
    7093             :     uint8_t bit_depth)
    7094             : {
    7095    22355900 :     const Av1Common *cm = picture_control_set_ptr->parent_pcs_ptr->av1_cm;//&cpi->common;
    7096    22358300 :     EbBool use_uv = (md_context_ptr->blk_geom->has_uv && md_context_ptr->chroma_level <= CHROMA_MODE_1 &&
    7097    44714300 :         picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level != IT_SEARCH_FAST_LOOP_UV_BLIND) ? EB_TRUE : EB_FALSE;
    7098    22355900 :     const int32_t num_planes = use_uv ? MAX_MB_PLANE : 1; // luma only unless chroma takes part in the search
    7099             : 
    7100    22355900 :     int64_t rd = INT64_MAX;
    7101    22355900 :     int32_t switchable_rate = 0;
    7102             : 
    7103             :     int32_t i;
    7104             :     int32_t tmp_rate;
    7105             :     int64_t tmp_dist;
    7106             :     // Lambda shifted right by 2 * (bit_depth - 8) bits; used in every RDCOST() below
    7107    22355900 :     uint32_t full_lambda_8b = md_context_ptr->full_lambda >> (2 * (bit_depth - 8));
    7108             : 
    7109    22355900 :     InterpFilter assign_filter = SWITCHABLE;
    7110             :     // A frame-level filter other than SWITCHABLE is taken as-is; the per-block search further down is then skipped
    7111    22355900 :     if (cm->interp_filter != SWITCHABLE)
    7112           0 :         assign_filter = cm->interp_filter;
    7113             : 
    7114             :     //set_default_interp_filters(mbmi, assign_filter);
    7115    44709500 :     /*mbmi*/candidate_buffer_ptr->candidate_ptr->interp_filters =//EIGHTTAP_REGULAR ;
    7116    22355900 :         av1_broadcast_interp_filter(av1_unswitchable_filter(assign_filter));
    7117             :     // Rate term for the starting filters, as reported by eb_av1_get_switchable_rate()
    7118    22357300 :     switchable_rate = eb_av1_get_switchable_rate(
    7119             :         candidate_buffer_ptr,
    7120             :         cm,
    7121             :         md_context_ptr
    7122             :     );
    7123             :     // Build the prediction with the starting filters (inter-intra is disabled for every prediction in this search)
    7124    22342900 :     av1_inter_prediction_function_table[hbd_mode_decision](
    7125             :         picture_control_set_ptr,
    7126    22342900 :         candidate_buffer_ptr->candidate_ptr->interp_filters,
    7127             :         md_context_ptr->cu_ptr,
    7128    22342900 :         candidate_buffer_ptr->candidate_ptr->ref_frame_type,
    7129             :         &mv_unit,
    7130             :         0,
    7131             : #if OBMC_FLAG
    7132             :         SIMPLE_TRANSLATION,
    7133             :         0,
    7134             :         0,
    7135             : #endif
    7136    22342900 :         candidate_buffer_ptr->candidate_ptr->compound_idx,
    7137    22342900 :         &candidate_buffer_ptr->candidate_ptr->interinter_comp,
    7138             : #if II_COMP_FLAG
    7139    22342900 :         &md_context_ptr->sb_ptr->tile_info,
    7140             :         md_context_ptr->luma_recon_neighbor_array,
    7141             :         md_context_ptr->cb_recon_neighbor_array,
    7142             :         md_context_ptr->cr_recon_neighbor_array,
    7143             :         0, //No inter-intra for IFSearch
    7144    22342900 :         candidate_buffer_ptr->candidate_ptr->interintra_mode,
    7145    22342900 :         candidate_buffer_ptr->candidate_ptr->use_wedge_interintra,
    7146    22342900 :         candidate_buffer_ptr->candidate_ptr->interintra_wedge_index,
    7147             : #endif
    7148    22342900 :         md_context_ptr->cu_origin_x,
    7149    22342900 :         md_context_ptr->cu_origin_y,
    7150    22342900 :         md_context_ptr->blk_geom->bwidth,
    7151    22342900 :         md_context_ptr->blk_geom->bheight,
    7152             :         ref_pic_list0,
    7153             :         ref_pic_list1,
    7154             :         prediction_ptr,
    7155    22342900 :         md_context_ptr->blk_geom->origin_x,
    7156    22342900 :         md_context_ptr->blk_geom->origin_y,
    7157             :         use_uv,
    7158             : #if IFS_8BIT_MD
    7159             :         hbd_mode_decision ? EB_10BIT : EB_8BIT);
    7160             : #else
    7161             :         (uint8_t)picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->static_config.encoder_bit_depth);
    7162             : #endif
    7163             :     // Model rate and distortion of that prediction; the arguments 0 and num_planes - 1 presumably select the plane range - confirm against model_rd_for_sb()
    7164    22352000 :     model_rd_for_sb(
    7165             :         picture_control_set_ptr,
    7166             :         prediction_ptr,
    7167             :         md_context_ptr,
    7168             :         0,
    7169             :         num_planes - 1,
    7170             :         &tmp_rate,
    7171             :         &tmp_dist,
    7172             :         hbd_mode_decision ? EB_10BIT : EB_8BIT);
    7173             :     // Baseline RD cost that every other filter pair has to beat
    7174    22361200 :     rd = RDCOST(full_lambda_8b, switchable_rate + tmp_rate, tmp_dist);
    7175             : 
    7176    22361200 :     if (assign_filter == SWITCHABLE) {
    7177             :         // do interp_filter search
    7178    22361800 :         if (av1_is_interp_needed(candidate_buffer_ptr, picture_control_set_ptr, md_context_ptr->blk_geom->bsize) /*&& av1_is_interp_search_needed(xd)*/) {
    7179    22013800 :             const int32_t filter_set_size = DUAL_FILTER_SET_SIZE;
    7180    22013800 :             int32_t best_in_temp = 0; // toggled on every improvement; not read anywhere else in this function
    7181    22013800 :             uint32_t best_filters = 0;// mbmi->interp_filters;
    7182             :             // Two-stage search when the search level is non-zero and dual filters are enabled; otherwise a linear scan of filter_sets
    7183    22013800 :             if (picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level &&
    7184    22014500 :                 picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header.enable_dual_filter) {
    7185             :                 int32_t tmp_rs;
    7186             :                 int64_t tmp_rd;
    7187             : 
    7188             :                 // default to (R,R): EIGHTTAP_REGULARxEIGHTTAP_REGULAR
    7189           0 :                 int32_t best_dual_mode = 0;
    7190             :                 // Find best of {R}x{R,Sm,Sh}
    7191             :                 // EIGHTTAP_REGULAR mode is calculated beforehand
    7192           0 :                 for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
    7193             : 
    7194           0 :                     /*mbmi*/candidate_buffer_ptr->candidate_ptr->interp_filters = (InterpFilter)
    7195           0 :                         av1_make_interp_filters((InterpFilter)filter_sets[i][0], (InterpFilter)filter_sets[i][1]);
    7196             : 
    7197           0 :                     tmp_rs = eb_av1_get_switchable_rate(
    7198             :                         candidate_buffer_ptr,
    7199             :                         cm,
    7200             :                         md_context_ptr
    7201             :                     );
    7202             : 
    7203           0 :                     av1_inter_prediction_function_table[hbd_mode_decision](
    7204             :                         picture_control_set_ptr,
    7205           0 :                         candidate_buffer_ptr->candidate_ptr->interp_filters,
    7206             :                         md_context_ptr->cu_ptr,
    7207           0 :                         candidate_buffer_ptr->candidate_ptr->ref_frame_type,
    7208             :                         &mv_unit,
    7209             :                         0,
    7210             : #if OBMC_FLAG
    7211             :                         SIMPLE_TRANSLATION,
    7212             :                         0,
    7213             :                         0,
    7214             : #endif
    7215           0 :                         candidate_buffer_ptr->candidate_ptr->compound_idx,
    7216           0 :                         &candidate_buffer_ptr->candidate_ptr->interinter_comp,
    7217             : #if II_COMP_FLAG
    7218           0 :                         &md_context_ptr->sb_ptr->tile_info,
    7219             :                         md_context_ptr->luma_recon_neighbor_array,
    7220             :                         md_context_ptr->cb_recon_neighbor_array,
    7221             :                         md_context_ptr->cr_recon_neighbor_array,
    7222             :                         0, //No inter-intra for IFSearch
    7223           0 :                         candidate_buffer_ptr->candidate_ptr->interintra_mode,
    7224           0 :                         candidate_buffer_ptr->candidate_ptr->use_wedge_interintra,
    7225           0 :                         candidate_buffer_ptr->candidate_ptr->interintra_wedge_index,
    7226             : #endif
    7227           0 :                         md_context_ptr->cu_origin_x,
    7228           0 :                         md_context_ptr->cu_origin_y,
    7229           0 :                         md_context_ptr->blk_geom->bwidth,
    7230           0 :                         md_context_ptr->blk_geom->bheight,
    7231             :                         ref_pic_list0,
    7232             :                         ref_pic_list1,
    7233             :                         prediction_ptr,
    7234           0 :                         md_context_ptr->blk_geom->origin_x,
    7235           0 :                         md_context_ptr->blk_geom->origin_y,
    7236             :                         use_uv,
    7237             : #if IFS_8BIT_MD
    7238             :                         hbd_mode_decision ? EB_10BIT : EB_8BIT);
    7239             : #else
    7240             :                         (uint8_t)picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->static_config.encoder_bit_depth);
    7241             : #endif
    7242             : 
    7243           0 :                     model_rd_for_sb(
    7244             :                         picture_control_set_ptr,
    7245             :                         prediction_ptr,
    7246             :                         md_context_ptr,
    7247             :                         0,
    7248             :                         num_planes - 1,
    7249             :                         &tmp_rate,
    7250             :                         &tmp_dist,
    7251             :                         hbd_mode_decision ? EB_10BIT : EB_8BIT);
    7252           0 :                     tmp_rd = RDCOST(full_lambda_8b, tmp_rs + tmp_rate, tmp_dist);
    7253             :                     // Keep this pair, and remember its table index for the second stage, when it lowers the RD cost
    7254           0 :                     if (tmp_rd < rd) {
    7255           0 :                         best_dual_mode = i;
    7256           0 :                         rd = tmp_rd;
    7257           0 :                         switchable_rate = tmp_rs;
    7258           0 :                         best_filters = /*mbmi*/candidate_buffer_ptr->candidate_ptr->interp_filters;
    7259           0 :                         best_in_temp = !best_in_temp;
    7260             :                     }
    7261             :                 }
    7262             : 
    7263             :                 // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
    7264           0 :                 for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
    7265           0 :                     i += SWITCHABLE_FILTERS) {
    7266             : 
    7267           0 :                     /*mbmi*/candidate_buffer_ptr->candidate_ptr->interp_filters =
    7268           0 :                         av1_make_interp_filters((InterpFilter)filter_sets[i][0], (InterpFilter)filter_sets[i][1]);
    7269             : 
    7270           0 :                     tmp_rs = eb_av1_get_switchable_rate(
    7271             :                         candidate_buffer_ptr,
    7272             :                         cm,
    7273             :                         md_context_ptr
    7274             :                     );
    7275             : 
    7276           0 :                     av1_inter_prediction_function_table[hbd_mode_decision](
    7277             :                         picture_control_set_ptr,
    7278           0 :                         candidate_buffer_ptr->candidate_ptr->interp_filters,
    7279             :                         md_context_ptr->cu_ptr,
    7280           0 :                         candidate_buffer_ptr->candidate_ptr->ref_frame_type,
    7281             :                         &mv_unit,
    7282             :                         0,
    7283             : #if OBMC_FLAG
    7284             :                         SIMPLE_TRANSLATION,
    7285             :                         0,
    7286             :                         0,
    7287             : #endif
    7288           0 :                         candidate_buffer_ptr->candidate_ptr->compound_idx,
    7289           0 :                         &candidate_buffer_ptr->candidate_ptr->interinter_comp,
    7290             : #if II_COMP_FLAG
    7291           0 :                         &md_context_ptr->sb_ptr->tile_info,
    7292             :                         md_context_ptr->luma_recon_neighbor_array,
    7293             :                         md_context_ptr->cb_recon_neighbor_array,
    7294             :                         md_context_ptr->cr_recon_neighbor_array,
    7295             :                         0, //No inter-intra for IFSearch
    7296           0 :                         candidate_buffer_ptr->candidate_ptr->interintra_mode,
    7297           0 :                         candidate_buffer_ptr->candidate_ptr->use_wedge_interintra,
    7298           0 :                         candidate_buffer_ptr->candidate_ptr->interintra_wedge_index,
    7299             : #endif
    7300           0 :                         md_context_ptr->cu_origin_x,
    7301           0 :                         md_context_ptr->cu_origin_y,
    7302           0 :                         md_context_ptr->blk_geom->bwidth,
    7303           0 :                         md_context_ptr->blk_geom->bheight,
    7304             :                         ref_pic_list0,
    7305             :                         ref_pic_list1,
    7306             :                         prediction_ptr,
    7307           0 :                         md_context_ptr->blk_geom->origin_x,
    7308           0 :                         md_context_ptr->blk_geom->origin_y,
    7309             :                         use_uv,
    7310             : #if IFS_8BIT_MD
    7311             :                         hbd_mode_decision ? EB_10BIT : EB_8BIT);
    7312             : #else
    7313             :                         (uint8_t)picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->static_config.encoder_bit_depth);
    7314             : #endif
    7315             : 
    7316           0 :                     model_rd_for_sb(
    7317             :                         picture_control_set_ptr,
    7318             :                         prediction_ptr,
    7319             :                         md_context_ptr,
    7320             :                         0,
    7321             :                         num_planes - 1,
    7322             :                         &tmp_rate,
    7323             :                         &tmp_dist,
    7324             :                         hbd_mode_decision ? EB_10BIT : EB_8BIT);
    7325           0 :                     tmp_rd = RDCOST(full_lambda_8b, tmp_rs + tmp_rate, tmp_dist);
    7326             :                     // Keep this pair when it lowers the RD cost
    7327           0 :                     if (tmp_rd < rd) {
    7328           0 :                         rd = tmp_rd;
    7329           0 :                         switchable_rate = tmp_rs;
    7330           0 :                         best_filters = /*mbmi*/candidate_buffer_ptr->candidate_ptr->interp_filters;
    7331           0 :                         best_in_temp = !best_in_temp;
    7332             :                     }
    7333             :                 }
    7334             :             }
    7335             :             else {
    7336             :                 // EIGHTTAP_REGULAR mode is calculated beforehand
    7337   198001000 :                 for (i = 1; i < filter_set_size; ++i) {
    7338             :                     int32_t tmp_rs;
    7339             :                     int64_t tmp_rd;
    7340             :                     // With dual filters disabled, only pairs whose two components are equal are evaluated
    7341   175934000 :                     if (/*cm->seq_params.enable_dual_filter*/picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header.enable_dual_filter == 0)
    7342   175947000 :                         if (filter_sets[i][0] != filter_sets[i][1]) continue;
    7343             : 
    7344    43959200 :                     /*mbmi*/candidate_buffer_ptr->candidate_ptr->interp_filters = av1_make_interp_filters((InterpFilter)filter_sets[i][0], (InterpFilter)filter_sets[i][1]);
    7345             : 
    7346    43983200 :                     tmp_rs = eb_av1_get_switchable_rate(
    7347             :                         candidate_buffer_ptr,
    7348             :                         cm,
    7349             :                         md_context_ptr
    7350             :                     );
    7351             : 
    7352    43958700 :                     av1_inter_prediction_function_table[hbd_mode_decision](
    7353             :                         picture_control_set_ptr,
    7354    43958700 :                         candidate_buffer_ptr->candidate_ptr->interp_filters,
    7355             :                         md_context_ptr->cu_ptr,
    7356    43958700 :                         candidate_buffer_ptr->candidate_ptr->ref_frame_type,
    7357             :                         &mv_unit,
    7358             :                         0,
    7359             : #if OBMC_FLAG
    7360             :                         SIMPLE_TRANSLATION,
    7361             :                         0,
    7362             :                         0,
    7363             : #endif
    7364    43958700 :                         candidate_buffer_ptr->candidate_ptr->compound_idx,
    7365    43958700 :                         &candidate_buffer_ptr->candidate_ptr->interinter_comp,
    7366             : #if II_COMP_FLAG
    7367    43958700 :                         &md_context_ptr->sb_ptr->tile_info,
    7368             :                         md_context_ptr->luma_recon_neighbor_array,
    7369             :                         md_context_ptr->cb_recon_neighbor_array,
    7370             :                         md_context_ptr->cr_recon_neighbor_array,
    7371             :                         0, //No inter-intra for IFSearch
    7372    43958700 :                         candidate_buffer_ptr->candidate_ptr->interintra_mode,
    7373    43958700 :                         candidate_buffer_ptr->candidate_ptr->use_wedge_interintra,
    7374    43958700 :                         candidate_buffer_ptr->candidate_ptr->interintra_wedge_index,
    7375             : #endif
    7376    43958700 :                         md_context_ptr->cu_origin_x,
    7377    43958700 :                         md_context_ptr->cu_origin_y,
    7378    43958700 :                         md_context_ptr->blk_geom->bwidth,
    7379    43958700 :                         md_context_ptr->blk_geom->bheight,
    7380             :                         ref_pic_list0,
    7381             :                         ref_pic_list1,
    7382             :                         prediction_ptr,
    7383    43958700 :                         md_context_ptr->blk_geom->origin_x,
    7384    43958700 :                         md_context_ptr->blk_geom->origin_y,
    7385             :                         use_uv,
    7386             : #if IFS_8BIT_MD
    7387             :                         hbd_mode_decision ? EB_10BIT : EB_8BIT);
    7388             : #else
    7389             :                         (uint8_t)picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr->static_config.encoder_bit_depth);
    7390             : #endif
    7391             : 
    7392    43975600 :                     model_rd_for_sb(
    7393             :                         picture_control_set_ptr,
    7394             :                         prediction_ptr,
    7395             :                         md_context_ptr,
    7396             :                         0,
    7397             :                         num_planes - 1,
    7398             :                         &tmp_rate,
    7399             :                         &tmp_dist,
    7400             :                         hbd_mode_decision ? EB_10BIT : EB_8BIT);
    7401    44012200 :                     tmp_rd = RDCOST(full_lambda_8b, tmp_rs + tmp_rate, tmp_dist);
    7402             :                     // Keep this pair when it lowers the RD cost
    7403    44012200 :                     if (tmp_rd < rd) {
    7404     4501350 :                         rd = tmp_rd;
    7405     4501350 :                         switchable_rate = tmp_rs;
    7406     4501350 :                         best_filters = /*mbmi*/candidate_buffer_ptr->candidate_ptr->interp_filters;
    7407     4501350 :                         best_in_temp = !best_in_temp;
    7408             :                     }
    7409             :                 }
    7410             :             }
    7411             :             // Commit the winning pair; best_filters is still 0 when no trial beat the baseline
    7412    22066800 :             /*mbmi*/candidate_buffer_ptr->candidate_ptr->interp_filters = best_filters;
    7413             :         }
    7414             :         else {
    7415      348406 :             candidate_buffer_ptr->candidate_ptr->interp_filters = 0; // no search needed: store filter value 0
    7416             :         }
    7417             :     }
    7418    22414600 : }
    7419             : 
    7420   198599000 : EbErrorType inter_pu_prediction_av1(
    7421             :     ModeDecisionContext                  *md_context_ptr,
    7422             :     PictureControlSet                    *picture_control_set_ptr,
    7423             :     ModeDecisionCandidateBuffer          *candidate_buffer_ptr)
    7424             : {
    7425   198599000 :     EbErrorType            return_error = EB_ErrorNone;
    7426   198599000 :     EbPictureBufferDesc  *ref_pic_list0 = (EbPictureBufferDesc*)EB_NULL;
    7427   198599000 :     EbPictureBufferDesc  *ref_pic_list1 = (EbPictureBufferDesc*)EB_NULL;
    7428   198599000 :     ModeDecisionCandidate *const candidate_ptr = candidate_buffer_ptr->candidate_ptr;
    7429   198599000 :     SequenceControlSet* sequence_control_set_ptr = ((SequenceControlSet*)(picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr));
    7430             : 
    7431             :     Mv mv_0;
    7432             :     Mv mv_1;
    7433   198599000 :     mv_0.x = candidate_buffer_ptr->candidate_ptr->motion_vector_xl0;
    7434   198599000 :     mv_0.y = candidate_buffer_ptr->candidate_ptr->motion_vector_yl0;
    7435   198599000 :     mv_1.x = candidate_buffer_ptr->candidate_ptr->motion_vector_xl1;
    7436   198599000 :     mv_1.y = candidate_buffer_ptr->candidate_ptr->motion_vector_yl1;
    7437             :     MvUnit mv_unit;
    7438   198599000 :     mv_unit.pred_direction = candidate_buffer_ptr->candidate_ptr->prediction_direction[md_context_ptr->pu_itr];
    7439   198599000 :     mv_unit.mv[0] = mv_0;
    7440   198599000 :     mv_unit.mv[1] = mv_1;
    7441             : 
    7442   198599000 :     if (candidate_buffer_ptr->candidate_ptr->use_intrabc) {
    7443           0 :         if (!md_context_ptr->hbd_mode_decision)
    7444           0 :             ref_pic_list0 = ((EbReferenceObject*)picture_control_set_ptr->parent_pcs_ptr->reference_picture_wrapper_ptr->object_ptr)->reference_picture;
    7445             :         else
    7446           0 :             ref_pic_list0 = ((EbReferenceObject*)picture_control_set_ptr->parent_pcs_ptr->reference_picture_wrapper_ptr->object_ptr)->reference_picture16bit;
    7447             : 
    7448           0 :         av1_inter_prediction_function_table[md_context_ptr->hbd_mode_decision > EB_8_BIT_MD](
    7449             :             picture_control_set_ptr,
    7450           0 :             candidate_buffer_ptr->candidate_ptr->interp_filters,
    7451             :             md_context_ptr->cu_ptr,
    7452           0 :             candidate_buffer_ptr->candidate_ptr->ref_frame_type,
    7453             :             &mv_unit,
    7454             :             1,//use_intrabc
    7455             : #if OBMC_FLAG
    7456             :             SIMPLE_TRANSLATION,
    7457             :             0,
    7458             :             0,
    7459             : #endif
    7460             :             1,//1 for avg
    7461           0 :             &candidate_buffer_ptr->candidate_ptr->interinter_comp,
    7462             : #if II_COMP_FLAG
    7463             :             NULL,
    7464             :             NULL,
    7465             :             NULL,
    7466             :             NULL,
    7467             :             0,
    7468             :             0,
    7469             :             0,
    7470             :             0,
    7471             : #endif
    7472           0 :             md_context_ptr->cu_origin_x,
    7473           0 :             md_context_ptr->cu_origin_y,
    7474           0 :             md_context_ptr->blk_geom->bwidth,
    7475           0 :             md_context_ptr->blk_geom->bheight,
    7476             :             ref_pic_list0,
    7477             :             0,//ref_pic_list1,
    7478             :             candidate_buffer_ptr->prediction_ptr,
    7479           0 :             md_context_ptr->blk_geom->origin_x,
    7480           0 :             md_context_ptr->blk_geom->origin_y,
    7481           0 :             md_context_ptr->chroma_level <= CHROMA_MODE_1 && md_context_ptr->md_staging_skip_inter_chroma_pred == EB_FALSE,
    7482           0 :             sequence_control_set_ptr->static_config.encoder_bit_depth);
    7483             : 
    7484           0 :         return return_error;
    7485             :     }
    7486             : 
    7487   198599000 :     int8_t ref_idx_l0 = candidate_buffer_ptr->candidate_ptr->ref_frame_index_l0;
    7488   198599000 :     int8_t ref_idx_l1 = candidate_buffer_ptr->candidate_ptr->ref_frame_index_l1;
    7489             :     MvReferenceFrame rf[2];
    7490   198599000 :     av1_set_ref_frame(rf, candidate_buffer_ptr->candidate_ptr->ref_frame_type);
    7491             : 
    7492             :     uint8_t list_idx0, list_idx1;
    7493   199363000 :     list_idx0 = get_list_idx(rf[0]);
    7494   199541000 :     if (rf[1] == NONE_FRAME)
    7495   115795000 :         list_idx1 = get_list_idx(rf[0]);
    7496             :     else
    7497    83746000 :         list_idx1 = get_list_idx(rf[1]);
    7498   199058000 :     assert(list_idx0 < MAX_NUM_OF_REF_PIC_LIST);
    7499   199058000 :     assert(list_idx1 < MAX_NUM_OF_REF_PIC_LIST);
    7500             : 
    7501   199058000 :     if (ref_idx_l0 >= 0) {
    7502   153871000 :         ref_pic_list0 = md_context_ptr->hbd_mode_decision ?
    7503           0 :             ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx0][ref_idx_l0]->object_ptr)->reference_picture16bit
    7504   153871000 :             : ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx0][ref_idx_l0]->object_ptr)->reference_picture;
    7505             :     }
    7506             : 
    7507   199058000 :     if (ref_idx_l1 >= 0) {
    7508   129146000 :         ref_pic_list1 =  md_context_ptr->hbd_mode_decision ?
    7509           0 :             ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx1][ref_idx_l1]->object_ptr)->reference_picture16bit
    7510   129146000 :             : ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx1][ref_idx_l1]->object_ptr)->reference_picture;
    7511             :     }
    7512             : 
    7513   199058000 :     if (picture_control_set_ptr->parent_pcs_ptr->frm_hdr.allow_warped_motion
    7514   134364000 :         && candidate_ptr->motion_mode != WARPED_CAUSAL)
    7515             :     {
    7516   130559000 :         wm_count_samples(
    7517             :             md_context_ptr->cu_ptr,
    7518             :             md_context_ptr->blk_geom,
    7519   130559000 :             md_context_ptr->cu_origin_x,
    7520   130559000 :             md_context_ptr->cu_origin_y,
    7521   130559000 :             candidate_ptr->ref_frame_type,
    7522             :             picture_control_set_ptr,
    7523             :             &candidate_ptr->num_proj_ref);
    7524             :     }
    7525             : 
    7526   198724000 :     uint8_t bit_depth = EB_8BIT;
    7527   198724000 :     if (sequence_control_set_ptr->static_config.encoder_bit_depth > EB_8BIT && md_context_ptr->hbd_mode_decision)
    7528           0 :         bit_depth = sequence_control_set_ptr->static_config.encoder_bit_depth;
    7529             : 
    7530             : 
    7531   198724000 :     if (candidate_ptr->motion_mode == WARPED_CAUSAL) {
    7532     5053160 :         assert(ref_pic_list0 != NULL);
    7533             : 
    7534     5053160 :         warped_motion_prediction(
    7535             :             picture_control_set_ptr,
    7536             :             &mv_unit,
    7537     5053160 :             candidate_ptr->ref_frame_type,
    7538     5053160 :             candidate_ptr->compound_idx,
    7539             :             &candidate_ptr->interinter_comp,
    7540     5053160 :             md_context_ptr->cu_origin_x,
    7541     5053160 :             md_context_ptr->cu_origin_y,
    7542             :             md_context_ptr->cu_ptr,
    7543             :             md_context_ptr->blk_geom,
    7544             :             ref_pic_list0,
    7545             :             ref_pic_list1,
    7546             :             candidate_buffer_ptr->prediction_ptr,
    7547     5053160 :             md_context_ptr->blk_geom->origin_x,
    7548     5053160 :             md_context_ptr->blk_geom->origin_y,
    7549             :             &candidate_ptr->wm_params_l0,
    7550             :             &candidate_ptr->wm_params_l1,
    7551             :             bit_depth,
    7552     5053160 :             md_context_ptr->chroma_level <= CHROMA_MODE_1 && md_context_ptr->md_staging_skip_inter_chroma_pred == EB_FALSE);
    7553             : 
    7554     5053420 :         return return_error;
    7555             :     }
    7556             : 
    7557             : 
    7558   193671000 :     if (picture_control_set_ptr->parent_pcs_ptr->interpolation_search_level == IT_SEARCH_OFF)
    7559      619811 :         candidate_buffer_ptr->candidate_ptr->interp_filters = 0;
    7560             :     else {
    7561             : 
    7562   193051000 :         if (md_context_ptr->md_staging_skip_interpolation_search == EB_FALSE) {
    7563    26466200 :             uint16_t capped_size = md_context_ptr->interpolation_filter_search_blk_size == 0 ? 4 :
    7564           0 :                                    md_context_ptr->interpolation_filter_search_blk_size == 1 ? 8 : 16 ;
    7565             : 
    7566    26466200 :             if (md_context_ptr->blk_geom->bwidth > capped_size && md_context_ptr->blk_geom->bheight > capped_size)
    7567             : #if IFS_8BIT_MD
    7568             :             {
    7569    22354600 :                 if (md_context_ptr->hbd_mode_decision == EB_DUAL_BIT_MD) {
    7570             : 
    7571           0 :                     if (ref_idx_l0 >= 0)
    7572           0 :                         ref_pic_list0 = ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx0][ref_idx_l0]->object_ptr)->reference_picture;
    7573             : 
    7574           0 :                     if (ref_idx_l1 >= 0)
    7575           0 :                         ref_pic_list1 = ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx1][ref_idx_l1]->object_ptr)->reference_picture;
    7576             :                 }
    7577             : #endif
    7578    22354600 :                 interpolation_filter_search(
    7579             :                     picture_control_set_ptr,
    7580             :                     candidate_buffer_ptr->prediction_ptr_temp,
    7581             :                     md_context_ptr,
    7582             :                     candidate_buffer_ptr,
    7583             :                     mv_unit,
    7584             :                     ref_pic_list0,
    7585             :                     ref_pic_list1,
    7586    22354600 :                     md_context_ptr->hbd_mode_decision == EB_DUAL_BIT_MD ? EB_8_BIT_MD: md_context_ptr->hbd_mode_decision,
    7587             :                     bit_depth);
    7588             : #if IFS_8BIT_MD
    7589    22359900 :                 if (md_context_ptr->hbd_mode_decision == EB_DUAL_BIT_MD) {
    7590           0 :                     if (ref_idx_l0 >= 0)
    7591           0 :                         ref_pic_list0 = ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx0][ref_idx_l0]->object_ptr)->reference_picture16bit;
    7592             : 
    7593           0 :                     if (ref_idx_l1 >= 0)
    7594           0 :                         ref_pic_list1 = ((EbReferenceObject*)picture_control_set_ptr->ref_pic_ptr_array[list_idx1][ref_idx_l1]->object_ptr)->reference_picture16bit;
    7595             :                 }
    7596             :             }
    7597             : #endif
    7598             :         }
    7599             :     }
    7600             : 
    7601             :     NeighborArrayUnit            *luma_recon_neighbor_array;
    7602             :     NeighborArrayUnit            *cb_recon_neighbor_array;
    7603             :     NeighborArrayUnit            *cr_recon_neighbor_array;
    7604             : 
    7605   193676000 :     if (!md_context_ptr->hbd_mode_decision) {
    7606   193713000 :         luma_recon_neighbor_array = md_context_ptr->luma_recon_neighbor_array;
    7607   193713000 :         cb_recon_neighbor_array = md_context_ptr->cb_recon_neighbor_array;
    7608   193713000 :         cr_recon_neighbor_array = md_context_ptr->cr_recon_neighbor_array;
    7609             :     }
    7610             :     else {
    7611           0 :         luma_recon_neighbor_array = md_context_ptr->luma_recon_neighbor_array16bit;
    7612           0 :         cb_recon_neighbor_array = md_context_ptr->cb_recon_neighbor_array16bit;
    7613           0 :         cr_recon_neighbor_array = md_context_ptr->cr_recon_neighbor_array16bit;
    7614             : 
    7615             :     }
    7616             : 
    7617   387352000 :     av1_inter_prediction_function_table[md_context_ptr->hbd_mode_decision > EB_8_BIT_MD](
    7618             :         picture_control_set_ptr,
    7619   193676000 :         candidate_buffer_ptr->candidate_ptr->interp_filters,
    7620             :         md_context_ptr->cu_ptr,
    7621   193676000 :         candidate_buffer_ptr->candidate_ptr->ref_frame_type,
    7622             :         &mv_unit,
    7623   193676000 :         candidate_buffer_ptr->candidate_ptr->use_intrabc,
    7624             : #if OBMC_FLAG
    7625   193676000 :         candidate_buffer_ptr->candidate_ptr->motion_mode,//MD
    7626             :         1,
    7627             :         md_context_ptr,
    7628             : #endif
    7629   193676000 :         candidate_buffer_ptr->candidate_ptr->compound_idx,
    7630   193676000 :         &candidate_buffer_ptr->candidate_ptr->interinter_comp,
    7631             : #if II_COMP_FLAG
    7632   193676000 :         &md_context_ptr->sb_ptr->tile_info,
    7633             :         luma_recon_neighbor_array,
    7634             :         cb_recon_neighbor_array,
    7635             :         cr_recon_neighbor_array,
    7636   193676000 :         candidate_ptr->is_interintra_used,
    7637   193676000 :         candidate_ptr->interintra_mode,
    7638   193676000 :         candidate_ptr->use_wedge_interintra,
    7639             :         candidate_ptr->interintra_wedge_index,
    7640             : #endif
    7641   193676000 :         md_context_ptr->cu_origin_x,
    7642   193676000 :         md_context_ptr->cu_origin_y,
    7643   193676000 :         md_context_ptr->blk_geom->bwidth,
    7644   193676000 :         md_context_ptr->blk_geom->bheight,
    7645             :         ref_pic_list0,
    7646             :         ref_pic_list1,
    7647             :         candidate_buffer_ptr->prediction_ptr,
    7648   193676000 :         md_context_ptr->blk_geom->origin_x,
    7649   193676000 :         md_context_ptr->blk_geom->origin_y,
    7650   193676000 :         md_context_ptr->chroma_level <= CHROMA_MODE_1 && md_context_ptr->md_staging_skip_inter_chroma_pred == EB_FALSE,
    7651   193676000 :         sequence_control_set_ptr->static_config.encoder_bit_depth);
    7652             : 
    7653   193619000 :     return return_error;
    7654             : }
    7655             : 
    7656             : /***************************************************
    7657             : *  Pre-load a reference luma block for 16-bit mode: takes the 16-bit luma samples of refFramePic starting at ((pos_x >> 2) - 2, (pos_y >> 2) - 2), covering (pu_width + 4) x (pu_height + 4) samples, and hands them to extract8_bitdata_safe_sub() together with dst->buffer_y (presumably the 8-bit destination - confirm against that helper). Both strides are shifted left by sub_pred and the row count is shifted right by sub_pred.
    7658             : ***************************************************/
    7659           0 : void UnPackReferenceLumaBlock(
    7660             :     EbPictureBufferDesc *refFramePic, // source picture; buffer_y is read as uint16_t samples
    7661             :     uint32_t                 pos_x, // horizontal position; shifted right by 2 before use (presumably quarter-sample units - confirm)
    7662             :     uint32_t                 pos_y, // vertical position; same treatment as pos_x
    7663             :     uint32_t                 pu_width, // block width before the 4-sample extension
    7664             :     uint32_t                 pu_height, // block height before the 4-sample extension
    7665             :     EbPictureBufferDesc *dst, // destination picture; buffer_y and stride_y are passed to the helper
    7666             :     EbBool                sub_pred) // used directly as a shift amount for strides and height
    7667             : {
    7668           0 :     pu_width += 4; // 2 extra samples on the left plus 2 on the right
    7669           0 :     pu_height += 4; // 2 extra rows above plus 2 below
    7670           0 :     uint32_t inPosx = (pos_x >> 2) - 2; // NOTE(review): unsigned, so this wraps when pos_x < 8; presumably callers pass positions that already include picture padding - confirm
    7671           0 :     uint32_t inPosy = (pos_y >> 2) - 2; // NOTE(review): same wrap-around concern when pos_y < 8
    7672           0 :     uint16_t *ptr16 = (uint16_t *)refFramePic->buffer_y + inPosx + inPosy * refFramePic->stride_y; // top-left sample of the extended block
    7673             : 
    7674           0 :     extract8_bitdata_safe_sub(
    7675             :         ptr16,
    7676           0 :         refFramePic->stride_y << sub_pred, // with sub_pred == 1 every other source row is skipped
    7677             :         dst->buffer_y,
    7678           0 :         dst->stride_y << sub_pred, // destination rows are spaced the same way
    7679             :         pu_width,
    7680             :         pu_height >> sub_pred, // fewer rows are processed when sub_pred is set
    7681             :         sub_pred
    7682             :     );
    7683           0 : }
    7684             : 
    7685             : /** choose_mvp_idx_v2 clips the candidate motion vector(s) with clip_mv() and, for each reference list used by the prediction direction, records which AMVP predictor to use.
    7686             :     @param *candidate_ptr(input/output)
    7687             :         motion vectors are clipped in place; motion_vector_pred_idx / motion_vector_pred_x / motion_vector_pred_y are written per list.
    7688             :     @param cu_origin_x, cu_origin_y, tb_size(input)
    7689             :         forwarded unchanged to clip_mv().
    7690             :     @param pu_index(input)
    7691             :         index into candidate_ptr->prediction_direction[] that selects UNI_PRED_LIST_0, UNI_PRED_LIST_1 or BI_PRED handling.
    7692             :     @param ref0_amvp_cand_array_x, ref0_amvp_cand_array_y, ref0_num_available_amvp_cand(input)
    7693             :         list 0 predictor components and their count; ref1_amvp_cand_array_x, ref1_amvp_cand_array_y, ref1NumAvailableAMVPCand(input) are the list 1 equivalents.
    7694             :     @param picture_control_set_ptr(input)
    7695             :         source of seq_header.max_frame_width / max_frame_height, which are passed to clip_mv(). The function returns EB_ErrorNone on every path.
    7696             :  */
    7697           0 : EbErrorType choose_mvp_idx_v2(
    7698             :     ModeDecisionCandidate  *candidate_ptr,
    7699             :     uint32_t                    cu_origin_x,
    7700             :     uint32_t                    cu_origin_y,
    7701             :     uint32_t                    pu_index,
    7702             :     uint32_t                    tb_size,
    7703             :     int16_t                   *ref0_amvp_cand_array_x,
    7704             :     int16_t                   *ref0_amvp_cand_array_y,
    7705             :     uint32_t                    ref0_num_available_amvp_cand,
    7706             :     int16_t                   *ref1_amvp_cand_array_x,
    7707             :     int16_t                   *ref1_amvp_cand_array_y,
    7708             :     uint32_t                    ref1NumAvailableAMVPCand,
    7709             :     PictureControlSet      *picture_control_set_ptr)
    7710             : {
    7711           0 :     EbErrorType  return_error = EB_ErrorNone; // never reassigned below
    7712             :     uint8_t         mvpRef0Idx;
    7713             :     uint8_t         mvpRef1Idx;
    7714             : 
    7715           0 :     uint32_t        picture_width = ((SequenceControlSet*)picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr)->seq_header.max_frame_width;
    7716           0 :     uint32_t        picture_height = ((SequenceControlSet*)picture_control_set_ptr->sequence_control_set_wrapper_ptr->object_ptr)->seq_header.max_frame_height;
    7717             : 
    7718             :     uint32_t   mvd0, mvd1; // distance of the clipped MV to predictor 0 and predictor 1
    7719             : 
    7720           0 :     switch (candidate_ptr->prediction_direction[pu_index]) {
    7721           0 :     case UNI_PRED_LIST_0:
    7722             :         // Clip the input MV
    7723           0 :         clip_mv(
    7724             :             cu_origin_x,
    7725             :             cu_origin_y,
    7726             :             &candidate_ptr->motion_vector_xl0,
    7727             :             &candidate_ptr->motion_vector_yl0,
    7728             :             picture_width,
    7729             :             picture_height,
    7730             :             tb_size);
    7731             : 
    7732             :         // Choose the AMVP candidate
    7733             :         switch (ref0_num_available_amvp_cand) {
    7734           0 :         case 0: // shares the single-candidate path; NOTE(review): entry [0] is read even when the count is 0 - confirm callers always fill it
    7735             :         case 1:
    7736             :             //mvpRef0Idx = 0;
    7737           0 :             candidate_ptr->motion_vector_pred_idx[REF_LIST_0] = 0;
    7738           0 :             candidate_ptr->motion_vector_pred_x[REF_LIST_0] = ref0_amvp_cand_array_x[0];
    7739           0 :             candidate_ptr->motion_vector_pred_y[REF_LIST_0] = ref0_amvp_cand_array_y[0];
    7740           0 :             break;
    7741           0 :         case 2:
    7742             : 
    7743           0 :             mvd0 = EB_ABS_DIFF(ref0_amvp_cand_array_x[0], candidate_ptr->motion_vector_xl0) + // x term plus y term for predictor 0 (EB_ABS_DIFF is presumably an absolute difference - confirm)
    7744           0 :                 EB_ABS_DIFF(ref0_amvp_cand_array_y[0], candidate_ptr->motion_vector_yl0);
    7745             : 
    7746           0 :             mvd1 = EB_ABS_DIFF(ref0_amvp_cand_array_x[1], candidate_ptr->motion_vector_xl0) + // same measure for predictor 1
    7747           0 :                 EB_ABS_DIFF(ref0_amvp_cand_array_y[1], candidate_ptr->motion_vector_yl0);
    7748             : 
    7749           0 :             mvpRef0Idx = ((mvd0) <= (mvd1)) ? 0 : 1; // smaller distance wins; a tie keeps predictor 0
    7750             : 
    7751           0 :             candidate_ptr->motion_vector_pred_idx[REF_LIST_0] = mvpRef0Idx;
    7752           0 :             candidate_ptr->motion_vector_pred_x[REF_LIST_0] = ref0_amvp_cand_array_x[mvpRef0Idx];
    7753           0 :             candidate_ptr->motion_vector_pred_y[REF_LIST_0] = ref0_amvp_cand_array_y[mvpRef0Idx];
    7754           0 :             break;
    7755           0 :         default: // any count above 2 leaves the list 0 predictor fields untouched
    7756           0 :             break;
    7757             :         }
    7758             : 
    7759           0 :         break;
    7760             : 
    7761           0 :     case UNI_PRED_LIST_1:
    7762             : 
    7763             :         // Clip the input MV
    7764           0 :         clip_mv(
    7765             :             cu_origin_x,
    7766             :             cu_origin_y,
    7767             :             &candidate_ptr->motion_vector_xl1,
    7768             :             &candidate_ptr->motion_vector_yl1,
    7769             :             picture_width,
    7770             :             picture_height,
    7771             :             tb_size);
    7772             : 
    7773             :         // Choose the AMVP candidate
    7774             :         switch (ref1NumAvailableAMVPCand) {
    7775           0 :         case 0: // same handling as the list 0 case: predictor 0 is used for counts 0 and 1
    7776             :         case 1:
    7777             :             //mvpRef1Idx = 0;
    7778           0 :             candidate_ptr->motion_vector_pred_idx[REF_LIST_1] = 0;
    7779           0 :             candidate_ptr->motion_vector_pred_x[REF_LIST_1] = ref1_amvp_cand_array_x[0];
    7780           0 :             candidate_ptr->motion_vector_pred_y[REF_LIST_1] = ref1_amvp_cand_array_y[0];
    7781           0 :             break;
    7782           0 :         case 2:
    7783             : 
    7784           0 :             mvd0 = EB_ABS_DIFF(ref1_amvp_cand_array_x[0], candidate_ptr->motion_vector_xl1) +
    7785           0 :                 EB_ABS_DIFF(ref1_amvp_cand_array_y[0], candidate_ptr->motion_vector_yl1);
    7786             : 
    7787           0 :             mvd1 = EB_ABS_DIFF(ref1_amvp_cand_array_x[1], candidate_ptr->motion_vector_xl1) +
    7788           0 :                 EB_ABS_DIFF(ref1_amvp_cand_array_y[1], candidate_ptr->motion_vector_yl1);
    7789             : 
    7790           0 :             mvpRef1Idx = ((mvd0) <= (mvd1)) ? 0 : 1; // a tie keeps predictor 0
    7791             : 
    7792           0 :             candidate_ptr->motion_vector_pred_idx[REF_LIST_1] = mvpRef1Idx;
    7793           0 :             candidate_ptr->motion_vector_pred_x[REF_LIST_1] = ref1_amvp_cand_array_x[mvpRef1Idx];
    7794           0 :             candidate_ptr->motion_vector_pred_y[REF_LIST_1] = ref1_amvp_cand_array_y[mvpRef1Idx];
    7795           0 :             break;
    7796           0 :         default: // any count above 2 leaves the list 1 predictor fields untouched
    7797           0 :             break;
    7798             :         }
    7799             : 
    7800             :         // MVP in ref_pic_list0
    7801             :         //mvpRef0Idx = 0;
    7802             :         //candidate_ptr->motion_vector_pred_idx[REF_LIST_0][pu_index] = mvpRef0Idx;
    7803             :         //candidate_ptr->motion_vector_pred_x[REF_LIST_0][pu_index]  = 0;
    7804             :         //candidate_ptr->motion_vector_pred_y[REF_LIST_0][pu_index]  = 0;
    7805             : 
    7806           0 :         break;
    7807             : 
    7808           0 :     case BI_PRED: // runs the list 0 selection followed by the list 1 selection
    7809             : 
    7810             :         // Choose the MVP in list0
    7811             :         // Clip the input MV
    7812           0 :         clip_mv(
    7813             :             cu_origin_x,
    7814             :             cu_origin_y,
    7815             :             &candidate_ptr->motion_vector_xl0,
    7816             :             &candidate_ptr->motion_vector_yl0,
    7817             :             picture_width,
    7818             :             picture_height,
    7819             :             tb_size);
    7820             : 
    7821             :         // Choose the AMVP candidate
    7822             :         switch (ref0_num_available_amvp_cand) {
    7823           0 :         case 0:
    7824             :         case 1:
    7825             :             //mvpRef0Idx = 0;
    7826           0 :             candidate_ptr->motion_vector_pred_idx[REF_LIST_0] = 0;
    7827           0 :             candidate_ptr->motion_vector_pred_x[REF_LIST_0] = ref0_amvp_cand_array_x[0];
    7828           0 :             candidate_ptr->motion_vector_pred_y[REF_LIST_0] = ref0_amvp_cand_array_y[0];
    7829           0 :             break;
    7830           0 :         case 2:
    7831             : 
    7832           0 :             mvd0 = EB_ABS_DIFF(ref0_amvp_cand_array_x[0], candidate_ptr->motion_vector_xl0) +
    7833           0 :                 EB_ABS_DIFF(ref0_amvp_cand_array_y[0], candidate_ptr->motion_vector_yl0);
    7834             : 
    7835           0 :             mvd1 = EB_ABS_DIFF(ref0_amvp_cand_array_x[1], candidate_ptr->motion_vector_xl0) +
    7836           0 :                 EB_ABS_DIFF(ref0_amvp_cand_array_y[1], candidate_ptr->motion_vector_yl0);
    7837             : 
    7838           0 :             mvpRef0Idx = ((mvd0) <= (mvd1)) ? 0 : 1;
    7839             : 
    7840           0 :             candidate_ptr->motion_vector_pred_idx[REF_LIST_0] = mvpRef0Idx;
    7841           0 :             candidate_ptr->motion_vector_pred_x[REF_LIST_0] = ref0_amvp_cand_array_x[mvpRef0Idx];
    7842           0 :             candidate_ptr->motion_vector_pred_y[REF_LIST_0] = ref0_amvp_cand_array_y[mvpRef0Idx];
    7843           0 :             break;
    7844           0 :         default:
    7845           0 :             break;
    7846             :         }
    7847             : 
    7848             :         // Choose the MVP in list1
    7849             :         // Clip the input MV
    7850           0 :         clip_mv(
    7851             :             cu_origin_x,
    7852             :             cu_origin_y,
    7853             :             &candidate_ptr->motion_vector_xl1,
    7854             :             &candidate_ptr->motion_vector_yl1,
    7855             :             picture_width,
    7856             :             picture_height,
    7857             :             tb_size);
    7858             : 
    7859             :         // Choose the AMVP candidate
    7860             :         switch (ref1NumAvailableAMVPCand) {
    7861           0 :         case 0:
    7862             :         case 1:
    7863             :             //mvpRef1Idx = 0;
    7864           0 :             candidate_ptr->motion_vector_pred_idx[REF_LIST_1] = 0;
    7865           0 :             candidate_ptr->motion_vector_pred_x[REF_LIST_1] = ref1_amvp_cand_array_x[0];
    7866           0 :             candidate_ptr->motion_vector_pred_y[REF_LIST_1] = ref1_amvp_cand_array_y[0];
    7867           0 :             break;
    7868           0 :         case 2:
    7869             : 
    7870           0 :             mvd0 = EB_ABS_DIFF(ref1_amvp_cand_array_x[0], candidate_ptr->motion_vector_xl1) +
    7871           0 :                 EB_ABS_DIFF(ref1_amvp_cand_array_y[0], candidate_ptr->motion_vector_yl1);
    7872             : 
    7873           0 :             mvd1 = EB_ABS_DIFF(ref1_amvp_cand_array_x[1], candidate_ptr->motion_vector_xl1) +
    7874           0 :                 EB_ABS_DIFF(ref1_amvp_cand_array_y[1], candidate_ptr->motion_vector_yl1);
    7875             : 
    7876           0 :             mvpRef1Idx = ((mvd0) <= (mvd1)) ? 0 : 1;
    7877             : 
    7878           0 :             candidate_ptr->motion_vector_pred_idx[REF_LIST_1] = mvpRef1Idx;
    7879           0 :             candidate_ptr->motion_vector_pred_x[REF_LIST_1] = ref1_amvp_cand_array_x[mvpRef1Idx];
    7880           0 :             candidate_ptr->motion_vector_pred_y[REF_LIST_1] = ref1_amvp_cand_array_y[mvpRef1Idx];
    7881           0 :             break;
    7882           0 :         default:
    7883           0 :             break;
    7884             :         }
    7885             : 
    7886           0 :         break;
    7887             : 
    7888           0 :     default: // any other prediction direction: nothing is clipped or written
    7889           0 :         break;
    7890             :     }
    7891             : 
    7892           0 :     return return_error;
    7893             : }

Generated by: LCOV version 1.14