LCOV - code coverage report
Current view: top level - Codec - EbTemporalFiltering.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 425 996 42.7 %
Date: 2019-11-25 17:38:06 Functions: 14 28 50.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Netflix, Inc.
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : /*
       6             : * Copyright(c) 2019 Intel Corporation
       7             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       8             : */
       9             : /*
      10             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
      11             :  *
      12             :  * This source code is subject to the terms of the BSD 2 Clause License and
      13             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      14             :  * was not distributed with this source code in the LICENSE file, you can
      15             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      16             :  * Media Patent License 1.0 was not distributed with this source code in the
      17             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      18             :  */
      19             : 
      20             : #include <stdlib.h>
      21             : #include <stdio.h>
      22             : #include <string.h>
      23             : #include <assert.h>
      24             : #include "EbTemporalFiltering.h"
      25             : #include "EbComputeSAD.h"
      26             : #include "EbMotionEstimation.h"
      27             : #include "EbMotionEstimationProcess.h"
      28             : #include "EbMotionEstimationContext.h"
      29             : #include "EbDefinitions.h"
      30             : #include "EbLambdaRateTables.h"
      31             : #include "EbPictureAnalysisProcess.h"
      32             : #include "EbMcp.h"
      33             : #include "av1me.h"
      34             : #include "EbTemporalFiltering_sse4.h"
      35             : #include "EbObject.h"
      36             : #include "EbPictureOperators.h"
      37             : #include "EbInterPrediction.h"
      38             : #include "aom_dsp_rtcd.h"
      39             : #include "EbComputeVariance_C.h"
      40             : 
      41             : #undef _MM_HINT_T2
      42             : #define _MM_HINT_T2  1
      43             : // Inter-prediction entry points: slot 0 is av1_inter_prediction and slot 1 is av1_inter_prediction_hbd (presumably selected by an is_highbd flag; TODO confirm at the call sites, which are outside this listing).
      44             : static EB_AV1_INTER_PREDICTION_FUNC_PTR   av1_inter_prediction_function_table[2] =
      45             : {
      46             :     av1_inter_prediction,
      47             :     av1_inter_prediction_hbd
      48             : };
      49             : // Fixed-point multipliers: every non-zero entry i equals ceil(3 * 2^16 / i); entries 0-3 and 12 are zero. NOTE(review): presumably indexed by a sample count to replace a division; confirm at the use site.
      50             : static unsigned int index_mult[14] = {
      51             :         0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124
      52             : };
      53             : // Wider counterpart of index_mult: every non-zero entry i equals ceil(3 * 2^32 / i), with the same zero slots.
      54             : static int64_t index_mult_highbd[14] = { 0U,          0U,          0U,
      55             :                                          0U,          3221225472U, 2576980378U,
      56             :                                          2147483648U, 1840700270U, 1610612736U,
      57             :                                          1431655766U, 1288490189U, 1171354718U,
      58             :                                          0U,          991146300U };
      59             : 
      60             : // {row, col} of each 32x32 sub-block in the 2x2 grid, indexed by the 32x32 sub-block index
      61             : static const uint32_t subblock_xy_32x32[4][2] = { {0,0}, {0,1}, {1,0}, {1,1} };
      62             : // {row, col} of each 16x16 sub-block in the 4x4 grid, indexed by the 16x16 sub-block index (raster order)
      63             : static const uint32_t subblock_xy_16x16[N_16X16_BLOCKS][2] = { {0,0}, {0,1}, {0,2}, {0,3},
      64             :                                                                {1,0}, {1,1}, {1,2}, {1,3},
      65             :                                                                {2,0}, {2,1}, {2,2}, {2,3},
      66             :                                                                {3,0}, {3,1}, {3,2}, {3,3} };
      67             : // for each 16x16 sub-block index, the index of the 32x32 sub-block that contains it
      68             : static const uint32_t subblocks_from32x32_to_16x16[N_16X16_BLOCKS] = { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 };
      69             : // inverse mapping: the four 16x16 sub-block indexes contained in each 32x32 sub-block
      70             : static const uint32_t index_16x16_from_subindexes[4][4] = { {0, 1, 4, 5}, {2, 3, 6, 7}, {8, 9, 12, 13}, {10, 11, 14, 15} };
      71             : // per-block-size function table defined in another translation unit; the code in this listing only calls its vf member
      72             : extern aom_variance_fn_ptr_t mefn_ptr[BlockSizeS_ALL];
      73             : 
      74             : #if DEBUG_TF
      75             : // Debug helper (only compiled when DEBUG_TF is set): write the visible area of an 8-bit planar picture to filename as raw bytes - all Y rows, then all U rows, then all V rows. Nothing is written if the file cannot be opened.
      76             : void save_YUV_to_file(char *filename, EbByte buffer_y, EbByte buffer_u, EbByte buffer_v,
      77             :                       uint16_t width, uint16_t height,
      78             :                       uint16_t stride_y, uint16_t stride_u, uint16_t stride_v,
      79             :                       uint16_t origin_y, uint16_t origin_x,
      80             :                       uint32_t ss_x, uint32_t ss_y){
      81             :     FILE *fid = NULL;
      82             :     EbByte pic_point;
      83             :     int h;
      84             :     // width/height are luma dimensions; origin_x/origin_y locate the first visible luma sample; ss_x/ss_y are the chroma subsampling shifts
      85             :     // open the output file for binary writing (FOPEN is a project macro; a NULL fid is treated as failure below)
      86             :     FOPEN(fid, filename, "wb");
      87             : 
      88             :     if (!fid){
      89             :         printf("Unable to open file %s to write.\n", filename); // report the path that was actually requested
      90             :     }else{
      91             :         // the buffers contain a border in x and y dimensions: start at the first visible luma sample
      92             :         pic_point = buffer_y + (origin_y*stride_y) + origin_x;
      93             :         for (h = 0; h < height; h++) {
      94             :             fwrite(pic_point, 1, (size_t)width, fid);
      95             :             pic_point = pic_point + stride_y;
      96             :         }
      97             :         pic_point = buffer_u + ((origin_y >> ss_y)*stride_u) + (origin_x >> ss_x);
      98             :         for (h = 0; h < height >> ss_y; h++) {
      99             :             fwrite(pic_point, 1, (size_t)width >> ss_x, fid);
     100             :             pic_point = pic_point + stride_u;
     101             :         }
     102             :         pic_point = buffer_v + ((origin_y >> ss_y)*stride_v) + (origin_x >> ss_x);
     103             :         for (h = 0; h < height >> ss_y; h++) {
     104             :             fwrite(pic_point, 1, (size_t)width >> ss_x, fid);
     105             :             pic_point = pic_point + stride_v;
     106             :         }
     107             :         fclose(fid); // NOTE(review): fwrite/fclose results are not checked, so a short write goes unnoticed
     108             :     }
     109             : }
     110             : 
     111             : // Debug helper (only compiled when DEBUG_TF is set): write the visible area of a 16-bit planar picture to filename, two bytes per sample exactly as stored in memory - all Y rows, then all U rows, then all V rows. Nothing is written if the file cannot be opened.
     112             : void save_YUV_to_file_highbd(char *filename, uint16_t* buffer_y, uint16_t* buffer_u, uint16_t* buffer_v,
     113             :                       uint16_t width, uint16_t height,
     114             :                       uint16_t stride_y, uint16_t stride_u, uint16_t stride_v,
     115             :                       uint16_t origin_y, uint16_t origin_x,
     116             :                       uint32_t ss_x, uint32_t ss_y){
     117             :     FILE *fid = NULL;
     118             :     uint16_t *pic_point;
     119             :     int h;
     120             :     // width/height are luma dimensions and strides are counted in samples; origin_x/origin_y locate the first visible luma sample; ss_x/ss_y are the chroma subsampling shifts
     121             :     // open the output file for binary writing (FOPEN is a project macro; a NULL fid is treated as failure below)
     122             :     FOPEN(fid, filename, "wb");
     123             : 
     124             :     if (!fid){
     125             :         printf("Unable to open file %s to write.\n", filename); // report the path that was actually requested
     126             :     }else{
     127             :         // the buffers contain a border in x and y dimensions: start at the first visible luma sample
     128             :         pic_point = buffer_y + (origin_y*stride_y) + origin_x;
     129             :         for (h = 0; h < height; h++) {
     130             :             fwrite(pic_point, 2, (size_t)width, fid);
     131             :             pic_point = pic_point + stride_y;
     132             :         }
     133             :         pic_point = buffer_u + ((origin_y >> ss_y)*stride_u) + (origin_x >> ss_x);
     134             :         for (h = 0; h < height >> ss_y; h++) {
     135             :             fwrite(pic_point, 2, (size_t)width >> ss_x, fid);
     136             : 
     137             :             pic_point = pic_point + stride_u;
     138             :         }
     139             :         pic_point = buffer_v + ((origin_y >> ss_y)*stride_v) + (origin_x >> ss_x);
     140             :         for (h = 0; h < height >> ss_y; h++) {
     141             :             fwrite(pic_point, 2, (size_t)width >> ss_x, fid);
     142             :             pic_point = pic_point + stride_v;
     143             :         }
     144             :         fclose(fid); // NOTE(review): fwrite/fclose results are not checked, so a short write goes unnoticed
     145             :     }
     146             : }
     147             : #endif
     148             : // Pack the Y, Cb and Cr planes of pic_ptr into buffer_16bit[C_Y], buffer_16bit[C_U] and buffer_16bit[C_V] with pack2d_src, feeding it each 8-bit plane together with its bit-increment plane (pack2d_src is defined outside this listing; presumably it merges the two into 16-bit samples).
     149           0 : static void pack_highbd_pic(EbPictureBufferDesc *pic_ptr,
     150             :                             uint16_t *buffer_16bit[3],
     151             :                             uint32_t ss_x,
     152             :                             uint32_t ss_y,
     153             :                             EbBool include_padding)
     154             : {
     155             :     // defaults cover the include_padding case: no input offset, stride_y luma columns and height + 2 * origin_y luma rows
     156           0 :     uint32_t input_y_offset = 0;
     157           0 :     uint32_t input_bit_inc_y_offset = 0;
     158           0 :     uint32_t input_cb_offset = 0;
     159           0 :     uint32_t input_bit_inc_cb_offset = 0;
     160           0 :     uint32_t input_cr_offset = 0;
     161           0 :     uint32_t input_bit_inc_cr_offset = 0;
     162           0 :     uint16_t width = pic_ptr->stride_y;
     163           0 :     uint16_t height = (uint16_t)(pic_ptr->origin_y*2 + pic_ptr->height);
     164             :     // without padding: move every input pointer to the first visible sample and shrink the area to width x height
     165           0 :     if(!include_padding){
     166           0 :         input_y_offset = ((pic_ptr->origin_y) * pic_ptr->stride_y) + (pic_ptr->origin_x);
     167           0 :         input_bit_inc_y_offset = ((pic_ptr->origin_y)      * pic_ptr->stride_bit_inc_y) + (pic_ptr->origin_x);
     168           0 :         input_cb_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_cb) + ((pic_ptr->origin_x) >> ss_x);
     169           0 :         input_bit_inc_cb_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_bit_inc_cb) + ((pic_ptr->origin_x) >> ss_x);
     170           0 :         input_cr_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_cr) + ((pic_ptr->origin_x) >> ss_x);
     171           0 :         input_bit_inc_cr_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_bit_inc_cr) + ((pic_ptr->origin_x) >> ss_x);
     172             : 
     173           0 :         width = pic_ptr->width;
     174           0 :         height = pic_ptr->height;
     175             :     }
     176             :     // destination pointers are never offset, and each destination plane uses the stride of its 8-bit source plane
     177           0 :     pack2d_src(pic_ptr->buffer_y + input_y_offset,
     178           0 :                pic_ptr->stride_y,
     179           0 :                pic_ptr->buffer_bit_inc_y + input_bit_inc_y_offset,
     180           0 :                pic_ptr->stride_bit_inc_y,
     181             :                buffer_16bit[C_Y],
     182           0 :                pic_ptr->stride_y,
     183             :                width,
     184             :                height);
     185             :     // chroma area is the luma area shifted right by ss_x / ss_y
     186           0 :     pack2d_src(pic_ptr->buffer_cb + input_cb_offset,
     187           0 :                pic_ptr->stride_cb,
     188           0 :                pic_ptr->buffer_bit_inc_cb + input_bit_inc_cb_offset,
     189           0 :                pic_ptr->stride_bit_inc_cb,
     190           0 :                buffer_16bit[C_U],
     191           0 :                pic_ptr->stride_cb,
     192           0 :                width >> ss_x,
     193           0 :                height >> ss_y);
     194             : 
     195           0 :     pack2d_src(pic_ptr->buffer_cr + input_cr_offset,
     196           0 :                pic_ptr->stride_cr,
     197           0 :                pic_ptr->buffer_bit_inc_cr + input_bit_inc_cr_offset,
     198           0 :                pic_ptr->stride_bit_inc_cr,
     199           0 :                buffer_16bit[C_V],
     200           0 :                pic_ptr->stride_cr,
     201           0 :                width >> ss_x,
     202           0 :                height >> ss_y);
     203             : 
     204           0 : }
     205             : // Counterpart of pack_highbd_pic: hand each 16-bit plane in buffer_highbd[] to un_pack2d together with the matching 8-bit plane and bit-increment plane of pic_ptr (un_pack2d is defined outside this listing; presumably it splits every 16-bit sample into those two planes).
     206           0 : static void unpack_highbd_pic(uint16_t *buffer_highbd[3],
     207             :                               EbPictureBufferDesc *pic_ptr,
     208             :                               uint32_t ss_x,
     209             :                               uint32_t ss_y,
     210             :                               EbBool include_padding)
     211             : {
     212             :     // defaults cover the include_padding case: no offset into pic_ptr, stride_y luma columns and height + 2 * origin_y luma rows
     213           0 :     uint32_t input_y_offset = 0;
     214           0 :     uint32_t input_bit_inc_y_offset = 0;
     215           0 :     uint32_t input_cb_offset = 0;
     216           0 :     uint32_t input_bit_inc_cb_offset = 0;
     217           0 :     uint32_t input_cr_offset = 0;
     218           0 :     uint32_t input_bit_inc_cr_offset = 0;
     219           0 :     uint16_t width = pic_ptr->stride_y;
     220           0 :     uint16_t height = (uint16_t)(pic_ptr->origin_y*2 + pic_ptr->height);
     221             :     // without padding: point into pic_ptr at the first visible sample of every plane and shrink the area to width x height
     222           0 :     if(!include_padding){
     223           0 :         input_y_offset = ((pic_ptr->origin_y) * pic_ptr->stride_y) + (pic_ptr->origin_x);
     224           0 :         input_bit_inc_y_offset = ((pic_ptr->origin_y)      * pic_ptr->stride_bit_inc_y) + (pic_ptr->origin_x);
     225           0 :         input_cb_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_cb) + ((pic_ptr->origin_x) >> ss_x);
     226           0 :         input_bit_inc_cb_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_bit_inc_cb) + ((pic_ptr->origin_x) >> ss_x);
     227           0 :         input_cr_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_cr) + ((pic_ptr->origin_x) >> ss_x);
     228           0 :         input_bit_inc_cr_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_bit_inc_cr) + ((pic_ptr->origin_x) >> ss_x);
     229             : 
     230           0 :         width = pic_ptr->width;
     231           0 :         height = pic_ptr->height;
     232             :     }
     233             :     // the 16-bit pointers are never offset, and each 16-bit plane is read with the stride of the matching 8-bit plane
     234           0 :     un_pack2d(buffer_highbd[C_Y],
     235           0 :               pic_ptr->stride_y,
     236           0 :               pic_ptr->buffer_y + input_y_offset,
     237           0 :               pic_ptr->stride_y,
     238           0 :               pic_ptr->buffer_bit_inc_y + input_bit_inc_y_offset,
     239           0 :               pic_ptr->stride_bit_inc_y,
     240             :               width,
     241             :               height);
     242             :     // chroma area is the luma area shifted right by ss_x / ss_y
     243           0 :     un_pack2d(buffer_highbd[C_U],
     244           0 :               pic_ptr->stride_cb,
     245           0 :               pic_ptr->buffer_cb + input_cb_offset,
     246           0 :               pic_ptr->stride_cb,
     247           0 :               pic_ptr->buffer_bit_inc_cb + input_bit_inc_cb_offset,
     248           0 :               pic_ptr->stride_bit_inc_cb,
     249           0 :               width >> ss_x,
     250           0 :               height >> ss_y);
     251             : 
     252           0 :     un_pack2d(buffer_highbd[C_V],
     253           0 :               pic_ptr->stride_cr,
     254           0 :               pic_ptr->buffer_cr + input_cr_offset,
     255           0 :               pic_ptr->stride_cr,
     256           0 :               pic_ptr->buffer_bit_inc_cr + input_bit_inc_cr_offset,
     257           0 :               pic_ptr->stride_bit_inc_cr,
     258           0 :               width >> ss_x,
     259           0 :               height >> ss_y);
     260           0 : }
     261             : // Call generate_padding on the chroma planes of pic_ptr: Cb and Cr always, plus their bit-increment planes when is_highbd is set. Dimensions and origin are the luma ones shifted right by ss_x / ss_y; luma is not touched here.
     262          48 : void generate_padding_pic(EbPictureBufferDesc *pic_ptr,
     263             :                           uint32_t ss_x,
     264             :                           uint32_t ss_y,
     265             :                           EbBool is_highbd){
     266             :     // 8-bit pictures: only the two 8-bit chroma planes exist
     267          48 :     if(!is_highbd){
     268          48 :         generate_padding(pic_ptr->buffer_cb,
     269          48 :                          pic_ptr->stride_cb,
     270          48 :                          pic_ptr->width >> ss_x,
     271          48 :                          pic_ptr->height >> ss_y,
     272          48 :                          pic_ptr->origin_x >> ss_x,
     273          48 :                          pic_ptr->origin_y >> ss_y);
     274             : 
     275          48 :         generate_padding(pic_ptr->buffer_cr,
     276          48 :                          pic_ptr->stride_cr,
     277          48 :                          pic_ptr->width >> ss_x,
     278          48 :                          pic_ptr->height >> ss_y,
     279          48 :                          pic_ptr->origin_x >> ss_x,
     280          48 :                          pic_ptr->origin_y >> ss_y);
     281             :     }else{
     282           0 :         generate_padding(pic_ptr->buffer_cb,
     283           0 :                          pic_ptr->stride_cb,
     284           0 :                          pic_ptr->width >> ss_x,
     285           0 :                          pic_ptr->height >> ss_y,
     286           0 :                          pic_ptr->origin_x >> ss_x,
     287           0 :                          pic_ptr->origin_y >> ss_y);
     288             : 
     289           0 :         generate_padding(pic_ptr->buffer_cr,
     290           0 :                          pic_ptr->stride_cr,
     291           0 :                          pic_ptr->width >> ss_x,
     292           0 :                          pic_ptr->height >> ss_y,
     293           0 :                          pic_ptr->origin_x >> ss_x,
     294           0 :                          pic_ptr->origin_y >> ss_y);
     295             :         // each bit-increment plane is walked with its own stride, which need not equal the stride of the 8-bit Cr plane
     296           0 :         generate_padding(pic_ptr->buffer_bit_inc_cb,
     297           0 :                          pic_ptr->stride_bit_inc_cb,
     298           0 :                          pic_ptr->width >> ss_x,
     299           0 :                          pic_ptr->height >> ss_y,
     300           0 :                          pic_ptr->origin_x >> ss_x,
     301           0 :                          pic_ptr->origin_y >> ss_y);
     302             : 
     303           0 :         generate_padding(pic_ptr->buffer_bit_inc_cr,
     304           0 :                          pic_ptr->stride_bit_inc_cr,
     305           0 :                          pic_ptr->width >> ss_x,
     306           0 :                          pic_ptr->height >> ss_y,
     307           0 :                          pic_ptr->origin_x >> ss_x,
     308           0 :                          pic_ptr->origin_y >> ss_y);
     309             :     }
     310          48 : }
     311             : 
     312             : // Set list[0] .. list[nelements - 1] to value; does nothing when nelements is not positive
     313        4320 : static void populate_list_with_value(int *list,
     314             :                                      int nelements,
     315             :                                      const int value){
     316       33120 :     for(int i=0; i<nelements; i++)
     317       28800 :         list[i] = value;
     318        4320 : }
     319             : 
     320             : // Derive the per-16x16 filter weights blk_fw[] from ME distortions: 2 when the distortion is below the low threshold, 1 when it is below the high threshold, 0 otherwise. Both thresholds are multiplied by 16 when is_highbd is set. me_32x32_subblock_vf is only read when use_16x16_subblocks_only is false.
     321        2880 : static void get_blk_fw_using_dist(int const *me_32x32_subblock_vf,
     322             :                                   int const *me_16x16_subblock_vf,
     323             :                                   EbBool use_16x16_subblocks_only,
     324             :                                   int *blk_fw,
     325             :                                   EbBool is_highbd){
     326             :     uint32_t blk_idx, idx_32x32;
     327             :     // per-32x32 sum, maximum and minimum of the 16x16 distortions; only filled in when 32x32 weights are allowed
     328        2880 :     int me_sum_16x16_subblock_vf[4] = {0};
     329        2880 :     int max_me_vf[4] = {INT_MIN_TF, INT_MIN_TF, INT_MIN_TF, INT_MIN_TF}, min_me_vf[4] = {INT_MAX_TF, INT_MAX_TF, INT_MAX_TF, INT_MAX_TF};
     330             : 
     331             :     int threshold_low, threshold_high;
     332             : 
     333        2880 :     if(!is_highbd){
     334        2880 :         threshold_low = THRES_LOW;
     335        2880 :         threshold_high = THRES_HIGH;
     336             :     }else{
     337           0 :         threshold_low = THRES_LOW*16;
     338           0 :         threshold_high = THRES_HIGH*16;
     339             :     }
     340             :     // 16x16-only mode: every 16x16 block is classified from its own distortion
     341        2880 :     if(use_16x16_subblocks_only) {
     342       14400 :         for (idx_32x32 = 0; idx_32x32 < 4; idx_32x32++) {
     343             :             // split into 16x16 sub-blocks
     344             : 
     345      195840 :             for (blk_idx = 0; blk_idx < N_16X16_BLOCKS; blk_idx++) {
     346      184320 :                 if (subblocks_from32x32_to_16x16[blk_idx] == idx_32x32) {
     347       46080 :                     blk_fw[blk_idx] = me_16x16_subblock_vf[blk_idx] < threshold_low
     348             :                                       ? 2
     349       46080 :                                       : me_16x16_subblock_vf[blk_idx] < threshold_high ? 1 : 0;
     350             :                 }
     351             :             }
     352             :         }
     353             :     }else {
     354           0 :         for (blk_idx = 0; blk_idx < N_16X16_BLOCKS; blk_idx++) {
     355           0 :             idx_32x32 = subblocks_from32x32_to_16x16[blk_idx];
     356             : 
     357           0 :             if (min_me_vf[idx_32x32] > me_16x16_subblock_vf[blk_idx])
     358           0 :                 min_me_vf[idx_32x32] = me_16x16_subblock_vf[blk_idx];
     359           0 :             if (max_me_vf[idx_32x32] < me_16x16_subblock_vf[blk_idx])
     360           0 :                 max_me_vf[idx_32x32] = me_16x16_subblock_vf[blk_idx];
     361             : 
     362           0 :             me_sum_16x16_subblock_vf[idx_32x32] += me_16x16_subblock_vf[blk_idx];
     363             :         }
     364             :         // per 32x32 sub-block: share one weight when 15x (or 14x) its distortion is below 16x the summed 16x16 distortions and the max-min spread of its own 16x16 distortions is below THRES_DIFF_HIGH (or THRES_DIFF_LOW)
     365           0 :         for (idx_32x32 = 0; idx_32x32 < 4; idx_32x32++) {
     366           0 :             if (((me_32x32_subblock_vf[idx_32x32] * 15 < (me_sum_16x16_subblock_vf[idx_32x32] << 4)) &&
     367           0 :                  max_me_vf[idx_32x32] - min_me_vf[idx_32x32] < THRES_DIFF_HIGH) ||
     368           0 :                 ((me_32x32_subblock_vf[idx_32x32] * 14 < (me_sum_16x16_subblock_vf[idx_32x32] << 4)) &&
     369           0 :                  max_me_vf[idx_32x32] - min_me_vf[idx_32x32] < THRES_DIFF_LOW)) {
     370             :                 // split into 32x32 sub-blocks
     371             :                 // the 32x32 thresholds are the 16x16 ones shifted left by THR_SHIFT
     372           0 :                 int weight = me_32x32_subblock_vf[idx_32x32] < (threshold_low << THR_SHIFT)
     373             :                              ? 2
     374           0 :                              : me_32x32_subblock_vf[idx_32x32] < (threshold_high << THR_SHIFT) ? 1 : 0;
     375             : 
     376           0 :                 for (blk_idx = 0; blk_idx < N_16X16_BLOCKS; blk_idx++) {
     377           0 :                     if (subblocks_from32x32_to_16x16[blk_idx] == idx_32x32)
     378           0 :                         blk_fw[blk_idx] = weight;
     379             :                 }
     380             :             } else {
     381             :                 // split into 16x16 sub-blocks
     382             : 
     383           0 :                 for (blk_idx = 0; blk_idx < N_16X16_BLOCKS; blk_idx++) {
     384           0 :                     if (subblocks_from32x32_to_16x16[blk_idx] == idx_32x32) {
     385           0 :                         blk_fw[blk_idx] = me_16x16_subblock_vf[blk_idx] < threshold_low
     386             :                                           ? 2
     387           0 :                                           : me_16x16_subblock_vf[blk_idx] < threshold_high ? 1 : 0;
     388             :                     }
     389             :                 }
     390             :             }
     391             :         }
     392             :     }
     393        2880 : }
     394             : 
     395             : // Fill me_32x32_subblock_vf[0..3] and me_16x16_subblock_vf[0..15] with the value returned by the mefn_ptr vf function for every 32x32 and 16x16 luma sub-block of pred_y versus src_y (presumably the variance of the MC residual; vf is defined outside this listing).
     396        2880 : static void get_ME_distortion(int *me_32x32_subblock_vf,
     397             :                               int *me_16x16_subblock_vf,
     398             :                               uint8_t *pred_y,
     399             :                               int stride_pred_y,
     400             :                               uint8_t *src_y,
     401             :                               int stride_src_y){
     402             :     unsigned int sse; // passed to vf as its last argument; the value written to it is not used here
     403             : 
     404             :     uint8_t * pred_y_ptr;
     405             :     uint8_t * src_y_ptr;
     406             :     // 32x32 pass: sub-blocks are addressed through the {row, col} pairs of subblock_xy_32x32
     407       14400 :     for(uint32_t index_32x32 = 0; index_32x32 < 4; index_32x32++) {
     408       11519 :         int row = subblock_xy_32x32[index_32x32][0];
     409       11519 :         int col = subblock_xy_32x32[index_32x32][1];
     410             : 
     411       11519 :         pred_y_ptr = pred_y + 32*row*stride_pred_y + 32*col;
     412       11519 :         src_y_ptr = src_y + 32*row*stride_src_y + 32*col;
     413             : 
     414       11519 :         const aom_variance_fn_ptr_t *fn_ptr = &mefn_ptr[BLOCK_32X32];
     415             : 
     416       11519 :         me_32x32_subblock_vf[index_32x32] = fn_ptr->vf(pred_y_ptr, stride_pred_y, src_y_ptr, stride_src_y, &sse );
     417             :     }
     418             :     // 16x16 pass: same computation over the 4x4 grid described by subblock_xy_16x16
     419       48959 :     for(uint32_t index_16x16 = 0; index_16x16 < 16; index_16x16++) {
     420       46078 :         int row = subblock_xy_16x16[index_16x16][0];
     421       46078 :         int col = subblock_xy_16x16[index_16x16][1];
     422             : 
     423       46078 :         pred_y_ptr = pred_y + 16*row*stride_pred_y + 16*col;
     424       46078 :         src_y_ptr = src_y + 16*row*stride_src_y + 16*col;
     425             : 
     426       46078 :         const aom_variance_fn_ptr_t *fn_ptr = &mefn_ptr[BLOCK_16X16];
     427             : 
     428       46078 :         me_16x16_subblock_vf[index_16x16] = fn_ptr->vf(pred_y_ptr, stride_pred_y, src_y_ptr, stride_src_y, &sse );
     429             :     }
     430        2881 : }
     431             : 
     432             : // 16-bit counterpart of get_ME_distortion: fill me_32x32_subblock_vf[0..3] and me_16x16_subblock_vf[0..15] with the value returned by variance_highbd_c for every 32x32 and 16x16 luma sub-block of pred_y versus src_y (strides are applied to uint16_t pointers, so they count samples).
     433           0 : static void get_ME_distortion_highbd(int *me_32x32_subblock_vf,
     434             :                                      int *me_16x16_subblock_vf,
     435             :                                      uint16_t *pred_y,
     436             :                                      int stride_pred_y,
     437             :                                      uint16_t *src_y,
     438             :                                      int stride_src_y){
     439             :     unsigned int sse; // passed to variance_highbd_c as its last argument; the value written to it is not used here
     440             : 
     441             :     uint16_t *pred_Y_ptr;
     442             :     uint16_t *src_Y_ptr;
     443             :     // 32x32 pass: sub-blocks are addressed through the {row, col} pairs of subblock_xy_32x32
     444           0 :     for(uint32_t index_32x32 = 0; index_32x32 < 4; index_32x32++) {
     445           0 :         int row = subblock_xy_32x32[index_32x32][0];
     446           0 :         int col = subblock_xy_32x32[index_32x32][1];
     447             : 
     448           0 :         pred_Y_ptr = pred_y + 32*row*stride_pred_y + 32*col;
     449           0 :         src_Y_ptr = src_y + 32*row*stride_src_y + 32*col;
     450             : 
     451           0 :         me_32x32_subblock_vf[index_32x32] = variance_highbd_c(pred_Y_ptr, stride_pred_y, src_Y_ptr, stride_src_y, 32, 32, &sse );
     452             :     }
     453             :     // 16x16 pass: same computation over the 4x4 grid described by subblock_xy_16x16
     454           0 :     for(uint32_t index_16x16 = 0; index_16x16 < 16; index_16x16++) {
     455           0 :         int row = subblock_xy_16x16[index_16x16][0];
     456           0 :         int col = subblock_xy_16x16[index_16x16][1];
     457             : 
     458           0 :         pred_Y_ptr = pred_y + 16*row*stride_pred_y + 16*col;
     459           0 :         src_Y_ptr = src_y + 16*row*stride_src_y + 16*col;
     460             : 
     461           0 :         me_16x16_subblock_vf[index_16x16] = variance_highbd_c(pred_Y_ptr, stride_pred_y, src_Y_ptr, stride_src_y, 16, 16, &sse );
     462             :     }
     463           0 : }
     464             : 
     465             : // Create and initialize all necessary ME context structures
     466        2880 : static void create_ME_context_and_picture_control(MotionEstimationContext_t *context_ptr,
     467             :                                                   PictureParentControlSet *picture_control_set_ptr_frame,
     468             :                                                   PictureParentControlSet *picture_control_set_ptr_central,
     469             :                                                   EbPictureBufferDesc *input_picture_ptr_central,
     470             :                                                   int blk_row,
     471             :                                                   int blk_col,
     472             :                                                   uint32_t ss_x,
     473             :                                                   uint32_t ss_y){
     474             :     uint32_t lcuRow;
     475             : 
     476             :     // set reference picture for alt-refs
     477        2880 :     context_ptr->me_context_ptr->alt_ref_reference_ptr = (EbPaReferenceObject*)picture_control_set_ptr_frame->pa_reference_picture_wrapper_ptr->object_ptr;
     478        2880 :     context_ptr->me_context_ptr->me_alt_ref = EB_TRUE;
     479             : 
     480             :     // set the buffers with the original, quarter and sixteenth pixels version of the source frame
     481        2880 :     EbPaReferenceObject *src_object = (EbPaReferenceObject*)picture_control_set_ptr_central->pa_reference_picture_wrapper_ptr->object_ptr;
     482        2880 :     EbPictureBufferDesc *padded_pic_ptr = src_object->input_padded_picture_ptr;
     483        2880 :     SequenceControlSet *sequence_control_set_ptr = (SequenceControlSet*)picture_control_set_ptr_central->sequence_control_set_wrapper_ptr->object_ptr;
     484             :     // Set 1/4 and 1/16 ME reference buffer(s); filtered or decimated
     485        5760 :     EbPictureBufferDesc * quarter_pic_ptr = (sequence_control_set_ptr->down_sampling_method_me_search == ME_FILTERED_DOWNSAMPLED) ?
     486        2880 :         src_object->quarter_filtered_picture_ptr :
     487             :         src_object->quarter_decimated_picture_ptr;
     488             : 
     489        5760 :     EbPictureBufferDesc *sixteenth_pic_ptr = (sequence_control_set_ptr->down_sampling_method_me_search == ME_FILTERED_DOWNSAMPLED) ?
     490        2880 :         src_object->sixteenth_filtered_picture_ptr :
     491             :         src_object->sixteenth_decimated_picture_ptr;
     492             :     // Parts from MotionEstimationKernel()
     493        2880 :     uint32_t sb_origin_x = (uint32_t)(blk_col * BW);
     494        2880 :     uint32_t sb_origin_y = (uint32_t)(blk_row * BH);
     495             : 
     496        2880 :     uint32_t sb_width = (input_picture_ptr_central->width - sb_origin_x) < BLOCK_SIZE_64 ? input_picture_ptr_central->width - sb_origin_x : BLOCK_SIZE_64;
     497        2880 :     uint32_t sb_height = (input_picture_ptr_central->height - sb_origin_y) < BLOCK_SIZE_64 ? input_picture_ptr_central->height - sb_origin_y : BLOCK_SIZE_64;
     498             : 
     499             :     // Load the SB from the input to the intermediate SB buffer
     500        2880 :     int bufferIndex = (input_picture_ptr_central->origin_y + sb_origin_y) * input_picture_ptr_central->stride_y + input_picture_ptr_central->origin_x + sb_origin_x;
     501             : 
     502             :     // set search type
     503        2880 :     context_ptr->me_context_ptr->hme_search_type = HME_RECTANGULAR;
     504             : 
     505             :     // set search method
     506        2880 :     context_ptr->me_context_ptr->hme_search_method = FULL_SAD_SEARCH;
     507             : 
     508             :     // set Lambda
     509        2880 :     context_ptr->me_context_ptr->lambda = lambda_mode_decision_ra_sad[picture_control_set_ptr_central->picture_qp];
     510             : 
     511             :     // populate src block buffers: sb_buffer, quarter_sb_buffer and sixteenth_sb_buffer
     512      187119 :     for (lcuRow = 0; lcuRow < BLOCK_SIZE_64; lcuRow++) {
     513      184244 :         EB_MEMCPY((&(context_ptr->me_context_ptr->sb_buffer[lcuRow * BLOCK_SIZE_64])), (&(input_picture_ptr_central->buffer_y[bufferIndex + lcuRow * input_picture_ptr_central->stride_y])), BLOCK_SIZE_64 * sizeof(uint8_t));
     514             :     }
     515             : 
     516             :     {
     517        2875 :         uint8_t * src_ptr = &(padded_pic_ptr->buffer_y[bufferIndex]);
     518             : 
     519             :         //_MM_HINT_T0     //_MM_HINT_T1    //_MM_HINT_T2    //_MM_HINT_NTA
     520             :         uint32_t i;
     521      175670 :         for (i = 0; i < sb_height; i++)
     522             :         {
     523      172799 :             char const* p = (char const*)(src_ptr + i * padded_pic_ptr->stride_y);
     524      172799 :             _mm_prefetch(p, _MM_HINT_T2);
     525             :         }
     526             :     }
     527             : 
     528        2871 :     context_ptr->me_context_ptr->sb_src_ptr = &(padded_pic_ptr->buffer_y[bufferIndex]);
     529        2871 :     context_ptr->me_context_ptr->sb_src_stride = padded_pic_ptr->stride_y;
     530             : 
     531             :     // Load the 1/4 decimated SB from the 1/4 decimated input to the 1/4 intermediate SB buffer
     532        2871 :     bufferIndex = (quarter_pic_ptr->origin_y + (sb_origin_y >> ss_y)) * quarter_pic_ptr->stride_y + quarter_pic_ptr->origin_x + (sb_origin_x >> ss_x);
     533             : 
     534       89271 :     for (lcuRow = 0; lcuRow < (sb_height >> ss_y); lcuRow++) {
     535       86400 :         EB_MEMCPY((&(context_ptr->me_context_ptr->quarter_sb_buffer[lcuRow * context_ptr->me_context_ptr->quarter_sb_buffer_stride])), (&(quarter_pic_ptr->buffer_y[bufferIndex + lcuRow * quarter_pic_ptr->stride_y])), (sb_width >> ss_x) * sizeof(uint8_t));
     536             :     }
     537             : 
     538             :     // Load the 1/16 decimated SB from the 1/16 decimated input to the 1/16 intermediate SB buffer
     539        2871 :     bufferIndex = (sixteenth_pic_ptr->origin_y + (sb_origin_y >> 2)) * sixteenth_pic_ptr->stride_y + sixteenth_pic_ptr->origin_x + (sb_origin_x >> 2);
     540             : 
     541             :     {
     542        2871 :         uint8_t *framePtr = &(sixteenth_pic_ptr->buffer_y[bufferIndex]);
     543        2871 :         uint8_t *localPtr = context_ptr->me_context_ptr->sixteenth_sb_buffer;
     544             : 
     545        2871 :         if (context_ptr->me_context_ptr->hme_search_method == FULL_SAD_SEARCH) {
     546       46080 :             for (lcuRow = 0; lcuRow < (sb_height >> 2); lcuRow += 1) {
     547       43200 :                 EB_MEMCPY(localPtr, framePtr, (sb_width >> 2) * sizeof(uint8_t));
     548       43200 :                 localPtr += 16;
     549       43200 :                 framePtr += sixteenth_pic_ptr->stride_y;
     550             :             }
     551             :         }
     552             :         else {
     553           0 :             for (lcuRow = 0; lcuRow < (sb_height >> 2); lcuRow += 2) {
     554           0 :                 EB_MEMCPY(localPtr, framePtr, (sb_width >> 2) * sizeof(uint8_t));
     555           0 :                 localPtr += 16;
     556           0 :                 framePtr += sixteenth_pic_ptr->stride_y << 1;
     557             :             }
     558             :         }
     559             :     }
     560        2871 : }
     561             : 
     562             : // Get sub-block filter weights for the 16 subblocks case
     563           0 : static INLINE int get_subblock_filter_weight_16subblocks(unsigned int y,
     564             :                                                          unsigned int x,
     565             :                                                          unsigned int block_height,
     566             :                                                          unsigned int block_width,
     567             :                                                          const int *blk_fw) {
     568           0 :     const unsigned int block_width_div4 = block_width / 4;
     569           0 :     const unsigned int block_height_div4 = block_height / 4;
     570             : 
     571           0 :     int filter_weight = 0;
     572           0 :     if (y < block_height_div4) {
     573           0 :         if (x < block_width_div4)
     574           0 :             filter_weight = blk_fw[0];
     575           0 :         else if(x < block_width_div4*2)
     576           0 :             filter_weight = blk_fw[1];
     577           0 :         else if(x < block_width_div4*3)
     578           0 :             filter_weight = blk_fw[2];
     579             :         else
     580           0 :             filter_weight = blk_fw[3];
     581           0 :     } else if(y < block_height_div4*2){
     582           0 :         if (x < block_width_div4)
     583           0 :             filter_weight = blk_fw[4];
     584           0 :         else if(x < block_width_div4*2)
     585           0 :             filter_weight = blk_fw[5];
     586           0 :         else if(x < block_width_div4*3)
     587           0 :             filter_weight = blk_fw[6];
     588             :         else
     589           0 :             filter_weight = blk_fw[7];
     590           0 :     } else if(y < block_height_div4*3){
     591           0 :         if (x < block_width_div4)
     592           0 :             filter_weight = blk_fw[8];
     593           0 :         else if(x < block_width_div4*2)
     594           0 :             filter_weight = blk_fw[9];
     595           0 :         else if(x < block_width_div4*3)
     596           0 :             filter_weight = blk_fw[10];
     597             :         else
     598           0 :             filter_weight = blk_fw[11];
     599             :     } else {
     600           0 :         if (x < block_width_div4)
     601           0 :             filter_weight = blk_fw[12];
     602           0 :         else if(x < block_width_div4*2)
     603           0 :             filter_weight = blk_fw[13];
     604           0 :         else if(x < block_width_div4*3)
     605           0 :             filter_weight = blk_fw[14];
     606             :         else
     607           0 :             filter_weight = blk_fw[15];
     608             :     }
     609             : 
     610           0 :     return filter_weight;
     611             : }
     612             : 
     613             : // Get sub-block filter weights for the 4 subblocks case
     614           0 : static INLINE int get_subblock_filter_weight_4subblocks(unsigned int y,
     615             :                                                         unsigned int x,
     616             :                                                         unsigned int block_height,
     617             :                                                         unsigned int block_width,
     618             :                                                         const int *blk_fw) {
     619           0 :     int filter_weight = 0;
     620           0 :     if (y < block_height / 2) {
     621           0 :         if (x < block_width / 2)
     622           0 :             filter_weight = blk_fw[0];
     623             :         else
     624           0 :             filter_weight = blk_fw[1];
     625             :     } else {
     626           0 :         if (x < block_width / 2)
     627           0 :             filter_weight = blk_fw[2];
     628             :         else
     629           0 :             filter_weight = blk_fw[3];
     630             :     }
     631           0 :     return filter_weight;
     632             : }
     633             : 
     634             : // Adjust value of the modified (weight of filtering) based on the distortion and strength parameter
     635           0 : static INLINE int adjust_modifier(int sum_dist,
     636             :                                   int index,
     637             :                                   int rounding,
     638             :                                   int strength,
     639             :                                   int filter_weight) {
     640           0 :     assert(index >= 0 && index <= 13);
     641           0 :     assert(index_mult[index] != 0);
     642             : 
     643             :     //mod = (sum_dist / index) * 3;
     644           0 :     int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
     645             : 
     646           0 :     mod += rounding;
     647           0 :     mod >>= strength;
     648             : 
     649           0 :     mod = AOMMIN(16, mod);
     650             : 
     651           0 :     mod = 16 - mod;
     652           0 :     mod *= filter_weight;
     653             : 
     654           0 :     return mod;
     655             : }
     656             : 
     657             : // Adjust value of the modified (weight of filtering) based on the distortion and strength parameter - highbd
     658           0 : static INLINE int adjust_modifier_highbd(int64_t sum_dist,
     659             :                                          int index,
     660             :                                          int rounding,
     661             :                                          int strength,
     662             :                                          int filter_weight) {
     663           0 :     assert(index >= 0 && index <= 13);
     664           0 :     assert(index_mult_highbd[index] != 0);
     665             : 
     666             :     //mod = (sum_dist / index) * 3;
     667           0 :     int mod = (int)((AOMMIN(sum_dist, INT32_MAX) * index_mult_highbd[index]) >> 32);
     668             : 
     669           0 :     mod += rounding;
     670           0 :     mod >>= strength;
     671             : 
     672           0 :     mod = AOMMIN(16, mod);
     673             : 
     674           0 :     mod = 16 - mod;
     675           0 :     mod *= filter_weight;
     676             : 
     677           0 :     return mod;
     678             : }
     679             : 
     680           0 : static INLINE void calculate_squared_errors(const uint8_t *s,
     681             :                                             int s_stride,
     682             :                                             const uint8_t *p,
     683             :                                             int p_stride,
     684             :                                             uint16_t *diff_sse,
     685             :                                             unsigned int w,
     686             :                                             unsigned int h) {
     687           0 :     int idx = 0;
     688             :     unsigned int i, j;
     689             : 
     690           0 :     for (i = 0; i < h; i++) {
     691           0 :         for (j = 0; j < w; j++) {
     692           0 :             const int16_t diff = s[i * s_stride + j] - p[i * p_stride + j];
     693           0 :             diff_sse[idx] = (uint16_t)(diff * diff);
     694           0 :             idx++;
     695             :         }
     696             :     }
     697           0 : }
     698             : 
     699           0 : static INLINE void calculate_squared_errors_highbd(const uint16_t *s,
     700             :                                                    int s_stride,
     701             :                                                    const uint16_t *p,
     702             :                                                    int p_stride,
     703             :                                                    uint32_t *diff_sse,
     704             :                                                    unsigned int w,
     705             :                                                    unsigned int h) {
     706           0 :     int idx = 0;
     707             :     unsigned int i, j;
     708             : 
     709           0 :     for (i = 0; i < h; i++) {
     710           0 :         for (j = 0; j < w; j++) {
     711           0 :             const int32_t diff = s[i * s_stride + j] - p[i * p_stride + j];
     712           0 :             diff_sse[idx] = (uint32_t)(diff * diff);
     713           0 :             idx++;
     714             :         }
     715             :     }
     716           0 : }
     717             : 
     718             : // Main function that applies filtering to a block according to the weights
     719           0 : void svt_av1_apply_filtering_c(const uint8_t *y_src,
     720             :                                int y_src_stride,
     721             :                                const uint8_t *y_pre,
     722             :                                int y_pre_stride,
     723             :                                const uint8_t *u_src,
     724             :                                const uint8_t *v_src,
     725             :                                int uv_src_stride,
     726             :                                const uint8_t *u_pre,
     727             :                                const uint8_t *v_pre,
     728             :                                int uv_pre_stride,
     729             :                                unsigned int block_width,
     730             :                                unsigned int block_height,
     731             :                                int ss_x,
     732             :                                int ss_y,
     733             :                                int strength,
     734             :                                const int *blk_fw,
     735             :                                int use_whole_blk,
     736             :                                uint32_t *y_accum,
     737             :                                uint16_t *y_count,
     738             :                                uint32_t *u_accum,
     739             :                                uint16_t *u_count,
     740             :                                uint32_t *v_accum,
     741             :                                uint16_t *v_count){ // sub-block filter weights
     742             : 
     743             :     unsigned int i, j, k, m;
     744             :     int idx, idy;
     745             :     int modifier;
     746           0 :     const int rounding = (1 << strength) >> 1;
     747           0 :     const unsigned int uv_block_width = block_width >> ss_x;
     748           0 :     const unsigned int uv_block_height = block_height >> ss_y;
     749             :     DECLARE_ALIGNED(16, uint16_t, y_diff_se[BLK_PELS]);
     750             :     DECLARE_ALIGNED(16, uint16_t, u_diff_se[BLK_PELS]);
     751             :     DECLARE_ALIGNED(16, uint16_t, v_diff_se[BLK_PELS]);
     752             : 
     753           0 :     memset(y_diff_se, 0, BLK_PELS * sizeof(uint16_t));
     754           0 :     memset(u_diff_se, 0, BLK_PELS * sizeof(uint16_t));
     755           0 :     memset(v_diff_se, 0, BLK_PELS * sizeof(uint16_t));
     756             : 
     757           0 :     assert(use_whole_blk == 0);
     758             :     UNUSED(use_whole_blk);
     759             : 
     760             :     // Calculate squared differences for each pixel of the block (pred-orig)
     761           0 :     calculate_squared_errors(y_src, y_src_stride, y_pre, y_pre_stride, y_diff_se,
     762             :                              block_width, block_height);
     763           0 :     calculate_squared_errors(u_src, uv_src_stride, u_pre, uv_pre_stride,
     764             :                              u_diff_se, uv_block_width, uv_block_height);
     765           0 :     calculate_squared_errors(v_src, uv_src_stride, v_pre, uv_pre_stride,
     766             :                              v_diff_se, uv_block_width, uv_block_height);
     767             : 
     768           0 :     for (i = 0; i < block_height; i++) {
     769           0 :         for (j = 0; j < block_width; j++) {
     770           0 :             const int pixel_value = y_pre[i * y_pre_stride + j];
     771             : 
     772             :             int filter_weight;
     773             : 
     774           0 :             if(block_width == (BW>>1)){
     775           0 :                 filter_weight = get_subblock_filter_weight_4subblocks(i, j, block_height, block_width, blk_fw);
     776             :             }else{
     777           0 :                 filter_weight = get_subblock_filter_weight_16subblocks(i, j, block_height, block_width, blk_fw);
     778             :             }
     779             : 
     780             :             // non-local mean approach
     781           0 :             int y_index = 0;
     782             : 
     783           0 :             const int uv_r = i >> ss_y;
     784           0 :             const int uv_c = j >> ss_x;
     785           0 :             modifier = 0;
     786             : 
     787           0 :             for (idy = -1; idy <= 1; ++idy) {
     788           0 :                 for (idx = -1; idx <= 1; ++idx) {
     789           0 :                     const int row = (int)i + idy;
     790           0 :                     const int col = (int)j + idx;
     791             : 
     792           0 :                     if (row >= 0 && row < (int)block_height && col >= 0 &&
     793           0 :                         col < (int)block_width) {
     794           0 :                         modifier += y_diff_se[row * (int)block_width + col];
     795           0 :                         ++y_index;
     796             :                     }
     797             :                 }
     798             :             }
     799             : 
     800           0 :             assert(y_index > 0);
     801             : 
     802           0 :             modifier += u_diff_se[uv_r * uv_block_width + uv_c];
     803           0 :             modifier += v_diff_se[uv_r * uv_block_width + uv_c];
     804             : 
     805           0 :             y_index += 2;
     806             : 
     807           0 :             modifier = adjust_modifier(modifier, y_index, rounding, strength, filter_weight);
     808             : 
     809           0 :             k = i * y_pre_stride + j;
     810             : 
     811           0 :             y_count[k] += modifier;
     812           0 :             y_accum[k] += modifier * pixel_value;
     813             : 
     814             :             // Process chroma component
     815           0 :             if (!(i & ss_y) && !(j & ss_x)) {
     816           0 :                 const int u_pixel_value = u_pre[uv_r * uv_pre_stride + uv_c];
     817           0 :                 const int v_pixel_value = v_pre[uv_r * uv_pre_stride + uv_c];
     818             : 
     819             :                 // non-local mean approach
     820           0 :                 int cr_index = 0;
     821           0 :                 int u_mod = 0, v_mod = 0;
     822           0 :                 int y_diff = 0;
     823             : 
     824           0 :                 for (idy = -1; idy <= 1; ++idy) {
     825           0 :                     for (idx = -1; idx <= 1; ++idx) {
     826           0 :                         const int row = uv_r + idy;
     827           0 :                         const int col = uv_c + idx;
     828             : 
     829           0 :                         if (row >= 0 && row < (int)uv_block_height && col >= 0 &&
     830           0 :                             col < (int)uv_block_width) {
     831           0 :                             u_mod += u_diff_se[row * uv_block_width + col];
     832           0 :                             v_mod += v_diff_se[row * uv_block_width + col];
     833           0 :                             ++cr_index;
     834             :                         }
     835             :                     }
     836             :                 }
     837             : 
     838           0 :                 assert(cr_index > 0);
     839             : 
     840           0 :                 for (idy = 0; idy < 1 + ss_y; ++idy) {
     841           0 :                     for (idx = 0; idx < 1 + ss_x; ++idx) {
     842           0 :                         const int row = (uv_r << ss_y) + idy;
     843           0 :                         const int col = (uv_c << ss_x) + idx;
     844           0 :                         y_diff += y_diff_se[row * (int)block_width + col];
     845           0 :                         ++cr_index;
     846             :                     }
     847             :                 }
     848             : 
     849           0 :                 u_mod += y_diff;
     850           0 :                 v_mod += y_diff;
     851             : 
     852           0 :                 u_mod = adjust_modifier(u_mod, cr_index, rounding, strength, filter_weight);
     853           0 :                 v_mod = adjust_modifier(v_mod, cr_index, rounding, strength, filter_weight);
     854             : 
     855           0 :                 m = (i>>ss_y) * uv_pre_stride + (j>>ss_x);
     856             : 
     857           0 :                 u_count[m] += u_mod;
     858           0 :                 u_accum[m] += u_mod * u_pixel_value;
     859             : 
     860           0 :                 m = (i>>ss_y) * uv_pre_stride + (j>>ss_x);
     861             : 
     862           0 :                 v_count[m] += v_mod;
     863           0 :                 v_accum[m] += v_mod * v_pixel_value;
     864             :             }
     865             :         }
     866             :     }
     867           0 : }
     868             : 
     869             : // Main function that applies filtering to a block according to the weights - highbd
     870           0 : void svt_av1_apply_filtering_highbd_c(const uint16_t *y_src,
     871             :                                       int y_src_stride,
     872             :                                       const uint16_t *y_pre,
     873             :                                       int y_pre_stride,
     874             :                                       const uint16_t *u_src,
     875             :                                       const uint16_t *v_src,
     876             :                                       int uv_src_stride,
     877             :                                       const uint16_t *u_pre,
     878             :                                       const uint16_t *v_pre,
     879             :                                       int uv_pre_stride,
     880             :                                       unsigned int block_width,
     881             :                                       unsigned int block_height,
     882             :                                       int ss_x,
     883             :                                       int ss_y,
     884             :                                       int strength,
     885             :                                       const int *blk_fw,
     886             :                                       int use_whole_blk,
     887             :                                       uint32_t *y_accum,
     888             :                                       uint16_t *y_count,
     889             :                                       uint32_t *u_accum,
     890             :                                       uint16_t *u_count,
     891             :                                       uint32_t *v_accum,
     892             :                                       uint16_t *v_count){ // sub-block filter weights
     893             : 
     894             :     unsigned int i, j, k, m;
     895             :     int idx, idy;
     896           0 :     const int rounding = (1 << strength) >> 1;
     897           0 :     const unsigned int uv_block_width = block_width >> ss_x;
     898           0 :     const unsigned int uv_block_height = block_height >> ss_y;
     899             :     DECLARE_ALIGNED(16, uint32_t, y_diff_se[BLK_PELS]);
     900             :     DECLARE_ALIGNED(16, uint32_t, u_diff_se[BLK_PELS]);
     901             :     DECLARE_ALIGNED(16, uint32_t, v_diff_se[BLK_PELS]);
     902             : 
     903           0 :     memset(y_diff_se, 0, BLK_PELS * sizeof(uint32_t));
     904           0 :     memset(u_diff_se, 0, BLK_PELS * sizeof(uint32_t));
     905           0 :     memset(v_diff_se, 0, BLK_PELS * sizeof(uint32_t));
     906             : 
     907           0 :     assert(use_whole_blk == 0);
     908             :     UNUSED(use_whole_blk);
     909             : 
     910             :     // Calculate squared differences for each pixel of the block (pred-orig)
     911           0 :     calculate_squared_errors_highbd(y_src, y_src_stride, y_pre, y_pre_stride, y_diff_se,
     912             :                              block_width, block_height);
     913           0 :     calculate_squared_errors_highbd(u_src, uv_src_stride, u_pre, uv_pre_stride,
     914             :                              u_diff_se, uv_block_width, uv_block_height);
     915           0 :     calculate_squared_errors_highbd(v_src, uv_src_stride, v_pre, uv_pre_stride,
     916             :                              v_diff_se, uv_block_width, uv_block_height);
     917             : 
     918           0 :     for (i = 0; i < block_height; i++) {
     919           0 :         for (j = 0; j < block_width; j++) {
     920           0 :             const int pixel_value = y_pre[i * y_pre_stride + j];
     921             : 
     922             :             int filter_weight;
     923             : 
     924           0 :             if(block_width == (BW>>1)){
     925           0 :                 filter_weight = get_subblock_filter_weight_4subblocks(i, j, block_height, block_width, blk_fw);
     926             :             }else{
     927           0 :                 filter_weight = get_subblock_filter_weight_16subblocks(i, j, block_height, block_width, blk_fw);
     928             :             }
     929             : 
     930             :             // non-local mean approach
     931           0 :             int y_index = 0;
     932             : 
     933           0 :             const int uv_r = i >> ss_y;
     934           0 :             const int uv_c = j >> ss_x;
     935             :             int final_y_mod;
     936           0 :             int64_t y_mod = 0;
     937             : 
     938           0 :             for (idy = -1; idy <= 1; ++idy) {
     939           0 :                 for (idx = -1; idx <= 1; ++idx) {
     940           0 :                     const int row = (int)i + idy;
     941           0 :                     const int col = (int)j + idx;
     942             : 
     943           0 :                     if (row >= 0 && row < (int)block_height && col >= 0 &&
     944           0 :                         col < (int)block_width) {
     945           0 :                         y_mod += y_diff_se[row * (int)block_width + col];
     946           0 :                         ++y_index;
     947             :                     }
     948             :                 }
     949             :             }
     950             : 
     951           0 :             assert(y_index > 0);
     952             : 
     953           0 :             y_mod += u_diff_se[uv_r * uv_block_width + uv_c];
     954           0 :             y_mod += v_diff_se[uv_r * uv_block_width + uv_c];
     955             : 
     956           0 :             y_index += 2;
     957             : 
     958           0 :             final_y_mod = adjust_modifier_highbd(y_mod, y_index, rounding, strength, filter_weight);
     959             : 
     960           0 :             k = i * y_pre_stride + j;
     961             : 
     962           0 :             y_count[k] += final_y_mod;
     963           0 :             y_accum[k] += final_y_mod * pixel_value;
     964             : 
     965             :             // Process chroma component
     966           0 :             if (!(i & ss_y) && !(j & ss_x)) {
     967           0 :                 const int u_pixel_value = u_pre[uv_r * uv_pre_stride + uv_c];
     968           0 :                 const int v_pixel_value = v_pre[uv_r * uv_pre_stride + uv_c];
     969             : 
     970             :                 // non-local mean approach
     971           0 :                 int cr_index = 0;
     972           0 :                 int64_t u_mod = 0, v_mod = 0;
     973             :                 int final_u_mod, final_v_mod;
     974           0 :                 int y_diff = 0;
     975             : 
     976           0 :                 for (idy = -1; idy <= 1; ++idy) {
     977           0 :                     for (idx = -1; idx <= 1; ++idx) {
     978           0 :                         const int row = uv_r + idy;
     979           0 :                         const int col = uv_c + idx;
     980             : 
     981           0 :                         if (row >= 0 && row < (int)uv_block_height && col >= 0 &&
     982           0 :                             col < (int)uv_block_width) {
     983           0 :                             u_mod += u_diff_se[row * uv_block_width + col];
     984           0 :                             v_mod += v_diff_se[row * uv_block_width + col];
     985           0 :                             ++cr_index;
     986             :                         }
     987             :                     }
     988             :                 }
     989             : 
     990           0 :                 assert(cr_index > 0);
     991             : 
     992           0 :                 for (idy = 0; idy < 1 + ss_y; ++idy) {
     993           0 :                     for (idx = 0; idx < 1 + ss_x; ++idx) {
     994           0 :                         const int row = (uv_r << ss_y) + idy;
     995           0 :                         const int col = (uv_c << ss_x) + idx;
     996           0 :                         y_diff += y_diff_se[row * (int)block_width + col];
     997           0 :                         ++cr_index;
     998             :                     }
     999             :                 }
    1000             : 
    1001           0 :                 u_mod += y_diff;
    1002           0 :                 v_mod += y_diff;
    1003             : 
    1004           0 :                 final_u_mod = adjust_modifier_highbd(u_mod, cr_index, rounding, strength, filter_weight);
    1005           0 :                 final_v_mod = adjust_modifier_highbd(v_mod, cr_index, rounding, strength, filter_weight);
    1006             : 
    1007           0 :                 m = (i>>ss_y) * uv_pre_stride + (j>>ss_x);
    1008             : 
    1009           0 :                 u_count[m] += final_u_mod;
    1010           0 :                 u_accum[m] += final_u_mod * u_pixel_value;
    1011             : 
    1012           0 :                 m = (i>>ss_y) * uv_pre_stride + (j>>ss_x);
    1013             : 
    1014           0 :                 v_count[m] += final_v_mod;
    1015           0 :                 v_accum[m] += final_v_mod * v_pixel_value;
    1016             :             }
    1017             :         }
    1018             :     }
    1019           0 : }
    1020             : // Filters one sub-block: offsets every plane pointer to (block_row, block_col) and calls svt_av1_apply_filtering (8-bit) or svt_av1_apply_filtering_highbd (is_highbd) with the offset accum/count pointers.
    1021       11520 : static void apply_filtering_block(int block_row,
    1022             :                                   int block_col,
    1023             :                                   EbByte *src, // per-plane 8-bit source; only read when !is_highbd
    1024             :                                   uint16_t **src_16bit, // per-plane 16-bit source; only read when is_highbd
    1025             :                                   EbByte *pred, // per-plane 8-bit prediction; only read when !is_highbd
    1026             :                                   uint16_t **pred_16bit, // per-plane 16-bit prediction; only read when is_highbd
    1027             :                                   uint32_t **accum,
    1028             :                                   uint16_t **count,
    1029             :                                   uint32_t *stride, // per-plane strides used for the source offsets
    1030             :                                   uint32_t *stride_pred, // per-plane strides used for the pred/accum/count offsets
    1031             :                                   int block_width,
    1032             :                                   int block_height,
    1033             :                                   uint32_t ss_x, // chroma sub-sampling in x
    1034             :                                   uint32_t ss_y, // chroma sub-sampling in y
    1035             :                                   int altref_strength,
    1036             :                                   const int *blk_fw,
    1037             :                                   EbBool is_highbd) {
    1038             : 
    1039       11520 :     int blk_h = BH >> 1; int blk_w = BW >> 1; // fixed 32x32 blocks for now
    1040             :     // Element offsets of this sub-block inside the source planes (chroma sizes are shifted by ss_y / ss_x).
    1041       11520 :     int offset_src_buffer_Y = block_row * blk_h * stride[C_Y] + block_col * blk_w;
    1042       11520 :     int offset_src_buffer_U = block_row * (blk_h >> ss_y) * stride[C_U] + block_col * (blk_w >> ss_x);
    1043       11520 :     int offset_src_buffer_V = block_row * (blk_h >> ss_y) * stride[C_V] + block_col * (blk_w >> ss_x);
    1044             :     // Same offsets computed with the prediction strides; they are applied to pred, accum and count below.
    1045       11520 :     int offset_block_buffer_Y = block_row * blk_h * stride_pred[C_Y] + block_col * blk_w;
    1046       11520 :     int offset_block_buffer_U = block_row * (blk_h >> ss_y) * stride_pred[C_U] + block_col * (blk_w >> ss_x);
    1047       11520 :     int offset_block_buffer_V = block_row * (blk_h >> ss_y) * stride_pred[C_V] + block_col * (blk_w >> ss_x);
    1048             : 
    1049             :     int blk_fw_32x32[4]; // the four blk_fw entries selected for this sub-block
    1050             : 
    1051       11520 :     int idx_32x32 = block_row * 2 + block_col; // row-major sub-block index, assuming block_row and block_col are 0 or 1 - TODO confirm
    1052             : 
    1053             :     uint8_t *src_ptr[COLOR_CHANNELS];
    1054             :     uint8_t *pred_ptr[COLOR_CHANNELS];
    1055             :     uint32_t *accum_ptr[COLOR_CHANNELS];
    1056             :     uint16_t *count_ptr[COLOR_CHANNELS];
    1057             : 
    1058             :     uint16_t *src_ptr_16bit[COLOR_CHANNELS];
    1059             :     uint16_t *pred_ptr_16bit[COLOR_CHANNELS];
    1060             :     // Gather the blk_fw entries that index_16x16_from_subindexes lists for this sub-block.
    1061       57600 :     for (int ifw = 0; ifw < 4; ifw++) {
    1062       46080 :         int ifw_index = index_16x16_from_subindexes[idx_32x32][ifw];
    1063             : 
    1064       46080 :         blk_fw_32x32[ifw] = blk_fw[ifw_index];
    1065             :     }
    1066             :     // accum and count are advanced with the prediction-stride offsets, not the source ones.
    1067       11520 :     accum_ptr[C_Y] = accum[C_Y] + offset_block_buffer_Y;
    1068       11520 :     accum_ptr[C_U] = accum[C_U] + offset_block_buffer_U;
    1069       11520 :     accum_ptr[C_V] = accum[C_V] + offset_block_buffer_V;
    1070             : 
    1071       11520 :     count_ptr[C_Y] = count[C_Y] + offset_block_buffer_Y;
    1072       11520 :     count_ptr[C_U] = count[C_U] + offset_block_buffer_U;
    1073       11520 :     count_ptr[C_V] = count[C_V] + offset_block_buffer_V;
    1074             : 
    1075       11520 :     if(!is_highbd){
    1076       11520 :         src_ptr[C_Y] = src[C_Y] + offset_src_buffer_Y;
    1077       11520 :         src_ptr[C_U] = src[C_U] + offset_src_buffer_U;
    1078       11520 :         src_ptr[C_V] = src[C_V] + offset_src_buffer_V;
    1079             : 
    1080       11520 :         pred_ptr[C_Y] = pred[C_Y] + offset_block_buffer_Y;
    1081       11520 :         pred_ptr[C_U] = pred[C_U] + offset_block_buffer_U;
    1082       11520 :         pred_ptr[C_V] = pred[C_V] + offset_block_buffer_V;
    1083             : 
    1084             :         // Apply the temporal filtering strategy
    1085       11520 :         svt_av1_apply_filtering(src_ptr[C_Y],
    1086       11520 :                                    stride[C_Y],
    1087       11520 :                                    pred_ptr[C_Y],
    1088       11520 :                                    stride_pred[C_Y],
    1089       11520 :                                    src_ptr[C_U],
    1090       11520 :                                    src_ptr[C_V],
    1091       11520 :                                    stride[C_U], // the U stride is the only chroma source stride passed (covers U and V)
    1092       11520 :                                    pred_ptr[C_U],
    1093       11520 :                                    pred_ptr[C_V],
    1094       11520 :                                    stride_pred[C_U], // the U stride is the only chroma prediction stride passed
    1095             :                                    (unsigned int)block_width,
    1096             :                                    (unsigned int)block_height,
    1097             :                                    ss_x,
    1098             :                                    ss_y,
    1099             :                                    altref_strength,
    1100             :                                    blk_fw_32x32,
    1101             :                                    0, // use_32x32
    1102             :                                    accum_ptr[C_Y],
    1103             :                                    count_ptr[C_Y],
    1104             :                                    accum_ptr[C_U],
    1105             :                                    count_ptr[C_U],
    1106             :                                    accum_ptr[C_V],
    1107             :                                    count_ptr[C_V]);
    1108             :     }else{
    1109           0 :         src_ptr_16bit[C_Y] = src_16bit[C_Y] + offset_src_buffer_Y;
    1110           0 :         src_ptr_16bit[C_U] = src_16bit[C_U] + offset_src_buffer_U;
    1111           0 :         src_ptr_16bit[C_V] = src_16bit[C_V] + offset_src_buffer_V;
    1112             : 
    1113           0 :         pred_ptr_16bit[C_Y] = pred_16bit[C_Y] + offset_block_buffer_Y;
    1114           0 :         pred_ptr_16bit[C_U] = pred_16bit[C_U] + offset_block_buffer_U;
    1115           0 :         pred_ptr_16bit[C_V] = pred_16bit[C_V] + offset_block_buffer_V;
    1116             : 
    1117             :         // Apply the temporal filtering strategy
    1118           0 :         svt_av1_apply_filtering_highbd(src_ptr_16bit[C_Y],
    1119           0 :                                  stride[C_Y],
    1120           0 :                                  pred_ptr_16bit[C_Y],
    1121           0 :                                  stride_pred[C_Y],
    1122           0 :                                  src_ptr_16bit[C_U],
    1123           0 :                                  src_ptr_16bit[C_V],
    1124           0 :                                  stride[C_U], // the U stride is the only chroma source stride passed (covers U and V)
    1125           0 :                                  pred_ptr_16bit[C_U],
    1126           0 :                                  pred_ptr_16bit[C_V],
    1127           0 :                                  stride_pred[C_U], // the U stride is the only chroma prediction stride passed
    1128             :                                  (unsigned int)block_width,
    1129             :                                  (unsigned int)block_height,
    1130             :                                  ss_x,
    1131             :                                  ss_y,
    1132             :                                  altref_strength,
    1133             :                                  blk_fw_32x32,
    1134             :                                  0, // use_32x32
    1135             :                                  accum_ptr[C_Y],
    1136             :                                  count_ptr[C_Y],
    1137             :                                  accum_ptr[C_U],
    1138             :                                  count_ptr[C_U],
    1139             :                                  accum_ptr[C_V],
    1140             :                                  count_ptr[C_V]);
    1141             :     }
    1142             : 
    1143       11520 : }
    1144             : 
    1145             : // Apply filtering to the central picture: each 8-bit pred sample is added to accum scaled by modifier, and modifier is added to the matching count entry.
    1146         479 : static void apply_filtering_central(EbByte *pred,
    1147             :                                     uint32_t **accum,
    1148             :                                     uint16_t **count,
    1149             :                                     uint16_t blk_width,
    1150             :                                     uint16_t blk_height,
    1151             :                                     uint32_t ss_x,
    1152             :                                     uint32_t ss_y) {
    1153             :     // accum/count are indexed linearly by k; pred is indexed with a stride equal to the (chroma-shifted) block width.
    1154             :     uint16_t i, j, k; // NOTE(review): k is 16-bit and would wrap if a plane held more than 65536 samples
    1155         479 :     uint16_t blk_height_y = blk_height;
    1156         479 :     uint16_t blk_width_y = blk_width;
    1157         479 :     uint16_t blk_height_ch = blk_height >> ss_y;
    1158         479 :     uint16_t blk_width_ch = blk_width >> ss_x;
    1159         479 :     uint16_t blk_stride_y = blk_width;
    1160         479 :     uint16_t blk_stride_ch = blk_width >> ss_x;
    1161             :     // Every sample of the central picture gets the same weight, INIT_WEIGHT * WEIGHT_MULTIPLIER.
    1162         479 :     int filter_weight = INIT_WEIGHT;
    1163         479 :     const int modifier = filter_weight * WEIGHT_MULTIPLIER;
    1164             : 
    1165             :     // Luma
    1166         479 :     k = 0;
    1167       31146 :     for (i = 0; i < blk_height_y; i++) {
    1168     1979690 :         for (j = 0; j < blk_width_y; j++) {
    1169     1949030 :             accum[C_Y][k] += modifier * pred[C_Y][i * blk_stride_y + j];
    1170     1949030 :             count[C_Y][k] += modifier;
    1171     1949030 :             ++k;
    1172             :         }
    1173             :     }
    1174             : 
    1175             :     // Chroma (U and V share the index k and the chroma stride)
    1176         479 :     k = 0;
    1177       15833 :     for (i = 0; i < blk_height_ch; i++) {
    1178      505235 :         for (j = 0; j < blk_width_ch; j++) {
    1179      489881 :             accum[C_U][k] += modifier * pred[C_U][i * blk_stride_ch + j];
    1180      489881 :             count[C_U][k] += modifier;
    1181             : 
    1182      489881 :             accum[C_V][k] += modifier * pred[C_V][i * blk_stride_ch + j];
    1183      489881 :             count[C_V][k] += modifier;
    1184      489881 :             ++k;
    1185             :         }
    1186             :     }
    1187         479 : }
    1188             : 
    1189             : // Apply filtering to the central picture (16-bit samples): each pred_16bit sample is added to accum scaled by modifier, and modifier is added to the matching count entry.
    1190           0 : static void apply_filtering_central_highbd(uint16_t **pred_16bit,
    1191             :                                            uint32_t **accum,
    1192             :                                            uint16_t **count,
    1193             :                                            uint16_t blk_width,
    1194             :                                            uint16_t blk_height,
    1195             :                                            uint32_t ss_x,
    1196             :                                            uint32_t ss_y) {
    1197             :     // accum/count are indexed linearly by k; pred_16bit is indexed with a stride equal to the (chroma-shifted) block width.
    1198             :     uint16_t i, j, k; // NOTE(review): k is 16-bit and would wrap if a plane held more than 65536 samples
    1199           0 :     uint16_t blk_height_y = blk_height;
    1200           0 :     uint16_t blk_width_y = blk_width;
    1201           0 :     uint16_t blk_height_ch= blk_height >> ss_y;
    1202           0 :     uint16_t blk_width_ch = blk_width >> ss_x;
    1203           0 :     uint16_t blk_stride_y = blk_width;
    1204           0 :     uint16_t blk_stride_ch = blk_width >> ss_x;
    1205             :     // Every sample of the central picture gets the same weight, INIT_WEIGHT * WEIGHT_MULTIPLIER.
    1206           0 :     int filter_weight = INIT_WEIGHT;
    1207           0 :     const int modifier = filter_weight * WEIGHT_MULTIPLIER;
    1208             : 
    1209             :     // Luma
    1210           0 :     k = 0;
    1211           0 :     for (i = 0; i < blk_height_y; i++) {
    1212           0 :         for (j = 0; j < blk_width_y; j++) {
    1213           0 :             accum[C_Y][k] += modifier * pred_16bit[C_Y][i * blk_stride_y + j];
    1214           0 :             count[C_Y][k] += modifier;
    1215           0 :             ++k;
    1216             :         }
    1217             :     }
    1218             : 
    1219             :     // Chroma (U and V share the index k and the chroma stride)
    1220           0 :     k = 0;
    1221           0 :     for (i = 0; i < blk_height_ch; i++) {
    1222           0 :         for (j = 0; j < blk_width_ch; j++) {
    1223           0 :             accum[C_U][k] += modifier * pred_16bit[C_U][i * blk_stride_ch + j];
    1224           0 :             count[C_U][k] += modifier;
    1225             : 
    1226           0 :             accum[C_V][k] += modifier * pred_16bit[C_V][i * blk_stride_ch + j];
    1227           0 :             count[C_V][k] += modifier;
    1228           0 :             ++k;
    1229             :         }
    1230             :     }
    1231           0 : }
    1232             : 
    1233             : uint32_t get_mds_idx(uint32_t  orgx, uint32_t  orgy, uint32_t  size, uint32_t use_128x128); // forward declaration only; the definition is not in the visible part of this file
    1234             : // Builds the motion-compensated prediction from pic_ptr_ref: for every 32x32 quadrant whose use_16x16_subblocks entry is non-zero, each 16x16 block's MV is refined over a 3x3 neighbourhood by luma variance and then used for the final prediction into pred / pred_16bit.
    1235        2880 : static void tf_inter_prediction(PictureParentControlSet *picture_control_set_ptr,
    1236             :                                 MeContext *context_ptr,
    1237             :                                 EbPictureBufferDesc *pic_ptr_ref,
    1238             :                                 EbByte *pred,
    1239             :                                 uint16_t **pred_16bit,
    1240             :                                 uint32_t *stride_pred,
    1241             :                                 EbByte *src,
    1242             :                                 uint16_t **src_16bit,
    1243             :                                 uint32_t *stride_src,
    1244             :                                 uint32_t sb_origin_x,
    1245             :                                 uint32_t sb_origin_y,
    1246             :                                 uint32_t ss_x,
    1247             :                                 uint32_t ss_y,
    1248             :                                 const int* use_16x16_subblocks,
    1249             :                                 int encoder_bit_depth)
    1250             : {
    1251             :     const InterpFilters interp_filters =
    1252        2880 :         av1_make_interp_filters(MULTITAP_SHARP, MULTITAP_SHARP);
    1253             : 
    1254        2880 :     EbBool is_highbd = (encoder_bit_depth == 8) ? (uint8_t)EB_FALSE : (uint8_t)EB_TRUE; // any bit depth other than 8 selects the 16-bit path
    1255             : 
    1256             :     CodingUnit       cu_ptr;
    1257             :     MacroBlockD      av1xd;
    1258        2880 :     cu_ptr.av1xd = &av1xd;
    1259             :     MvUnit   mv_unit;
    1260        2880 :     mv_unit.pred_direction = UNI_PRED_LIST_0;
    1261             : 
    1262             :     EbPictureBufferDesc      reference_ptr; // only filled in on the high-bit-depth path
    1263             :     EbPictureBufferDesc      prediction_ptr;
    1264             :     // NOTE(review): ss_x is marked UNUSED here although it is read just below for the chroma strides.
    1265             :     UNUSED(ss_x);
    1266             :     // Only the fields assigned below are initialised in prediction_ptr; its luma stride is BW.
    1267        2880 :     prediction_ptr.origin_x = 0;
    1268        2880 :     prediction_ptr.origin_y = 0;
    1269        2880 :     prediction_ptr.stride_y = BW;
    1270        2880 :     prediction_ptr.stride_cb = (uint16_t)BW >> ss_x;
    1271        2880 :     prediction_ptr.stride_cr = (uint16_t)BW >> ss_x;
    1272             : 
    1273        2880 :     if(!is_highbd){
    1274        2880 :         assert(src[C_Y] != NULL);
    1275        2880 :         assert(src[C_U] != NULL);
    1276        2880 :         assert(src[C_V] != NULL);
    1277        2880 :         prediction_ptr.buffer_y = pred[C_Y];
    1278        2880 :         prediction_ptr.buffer_cb = pred[C_U];
    1279        2880 :         prediction_ptr.buffer_cr = pred[C_V];
    1280             :     }else{
    1281           0 :         assert(src_16bit[C_Y] != NULL);
    1282           0 :         assert(src_16bit[C_U] != NULL);
    1283           0 :         assert(src_16bit[C_V] != NULL);
    1284           0 :         prediction_ptr.buffer_y = (uint8_t*) pred_16bit[C_Y];
    1285           0 :         prediction_ptr.buffer_cb = (uint8_t*) pred_16bit[C_U];
    1286           0 :         prediction_ptr.buffer_cr = (uint8_t*) pred_16bit[C_V];
    1287             :         // Temporary 16-bit copies of the reference planes, freed at the end of this function. NOTE(review): the malloc results are not checked before they are written to.
    1288           0 :         reference_ptr.buffer_y = (uint8_t*)malloc(pic_ptr_ref->luma_size * sizeof(uint16_t));
    1289           0 :         reference_ptr.buffer_cb = (uint8_t*)malloc(pic_ptr_ref->chroma_size * sizeof(uint16_t));
    1290           0 :         reference_ptr.buffer_cr = (uint8_t*)malloc(pic_ptr_ref->chroma_size * sizeof(uint16_t));
    1291             : 
    1292           0 :         reference_ptr.origin_x = pic_ptr_ref->origin_x;
    1293           0 :         reference_ptr.origin_y = pic_ptr_ref->origin_y;
    1294           0 :         reference_ptr.stride_y = pic_ptr_ref->stride_y;
    1295           0 :         reference_ptr.stride_cb = pic_ptr_ref->stride_cb;
    1296           0 :         reference_ptr.stride_cr = pic_ptr_ref->stride_cr;
    1297           0 :         reference_ptr.width = pic_ptr_ref->width;
    1298           0 :         reference_ptr.height = pic_ptr_ref->height;
    1299             : 
    1300           0 :         uint32_t height_y = (uint32_t)(2*reference_ptr.origin_y + reference_ptr.height); // presumably the picture height plus top and bottom padding - TODO confirm
    1301             :         // pack2d_src presumably merges each 8-bit plane with its bit-increment plane into the 16-bit buffer - TODO confirm.
    1302           0 :         pack2d_src(pic_ptr_ref->buffer_y,
    1303           0 :                    reference_ptr.stride_y,
    1304             :                    pic_ptr_ref->buffer_bit_inc_y,
    1305           0 :                    pic_ptr_ref->stride_bit_inc_y,
    1306           0 :                    (uint16_t*)reference_ptr.buffer_y,
    1307           0 :                    reference_ptr.stride_y,
    1308           0 :                    reference_ptr.stride_y,
    1309             :                    height_y);
    1310             : 
    1311           0 :         pack2d_src(pic_ptr_ref->buffer_cb,
    1312           0 :                    reference_ptr.stride_cb,
    1313             :                    pic_ptr_ref->buffer_bit_inc_cb,
    1314           0 :                    pic_ptr_ref->stride_bit_inc_cb,
    1315           0 :                    (uint16_t*)reference_ptr.buffer_cb,
    1316           0 :                    reference_ptr.stride_cb,
    1317           0 :                    reference_ptr.stride_cb,
    1318             :                    height_y >> ss_y);
    1319             : 
    1320           0 :         pack2d_src(pic_ptr_ref->buffer_cr,
    1321           0 :                    reference_ptr.stride_cr,
    1322             :                    pic_ptr_ref->buffer_bit_inc_cr,
    1323           0 :                    pic_ptr_ref->stride_bit_inc_cr,
    1324           0 :                    (uint16_t*)reference_ptr.buffer_cr,
    1325           0 :                    reference_ptr.stride_cr,
    1326           0 :                    reference_ptr.stride_cr,
    1327             :                    height_y >> ss_y);
    1328             :     }
    1329             :     // Only quadrants whose use_16x16_subblocks entry is non-zero are predicted; there is no branch for the others.
    1330       14399 :     for (uint32_t idx_32x32 = 0; idx_32x32 < 4; idx_32x32++) {
    1331       11520 :         if (use_16x16_subblocks[idx_32x32] != 0) {
    1332       11520 :             uint32_t    bsize = 16;
    1333             : 
    1334       57586 :             for (uint32_t idx_16x16 = 0; idx_16x16 < 4; idx_16x16++) {
    1335       46067 :                 uint32_t pu_index = index_16x16_from_subindexes[idx_32x32][idx_16x16];
    1336             :                 // local_origin_* are relative to the block being filtered; pu_origin_* add sb_origin_* to them.
    1337       46067 :                 uint32_t idx_y = subblock_xy_16x16[pu_index][0];
    1338       46067 :                 uint32_t idx_x = subblock_xy_16x16[pu_index][1];
    1339       46067 :                 uint16_t local_origin_x = idx_x * bsize;
    1340       46067 :                 uint16_t local_origin_y = idx_y * bsize;
    1341       46067 :                 uint16_t pu_origin_x = sb_origin_x + local_origin_x;
    1342       46067 :                 uint16_t pu_origin_y = sb_origin_y + local_origin_y;
    1343       46067 :                 uint32_t mirow = pu_origin_y >> MI_SIZE_LOG2;
    1344       46067 :                 uint32_t micol = pu_origin_x >> MI_SIZE_LOG2;
    1345       46067 :                 cu_ptr.mds_idx = get_mds_idx(local_origin_x, local_origin_y, bsize, picture_control_set_ptr->sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128);
    1346             :                 // NOTE(review): bw (a width) is subtracted from mi_rows and bh (a height) from mi_cols below; both come from BLOCK_16X16.
    1347       46065 :                 const int32_t bw = mi_size_wide[BLOCK_16X16];
    1348       46065 :                 const int32_t bh = mi_size_high[BLOCK_16X16];
    1349       46065 :                 cu_ptr.av1xd->mb_to_top_edge = -(int32_t)((mirow * MI_SIZE) * 8);
    1350       46065 :                 cu_ptr.av1xd->mb_to_bottom_edge = ((picture_control_set_ptr->av1_cm->mi_rows - bw - mirow) * MI_SIZE) * 8;
    1351       46065 :                 cu_ptr.av1xd->mb_to_left_edge = -(int32_t)((micol * MI_SIZE) * 8);
    1352       46065 :                 cu_ptr.av1xd->mb_to_right_edge = ((picture_control_set_ptr->av1_cm->mi_cols - bh - micol) * MI_SIZE) * 8;
    1353             :                 // NOTE(review): the four mv_unit.mv stores below are overwritten inside the search loop before mv_unit is used.
    1354       46065 :                 uint32_t mv_index = tab16x16[pu_index];
    1355       46065 :                 mv_unit.mv->x = _MVXT(context_ptr->p_best_mv16x16[mv_index]);
    1356       46065 :                 mv_unit.mv->y = _MVYT(context_ptr->p_best_mv16x16[mv_index]);
    1357             :                 //AV1 MVs are always in 1/8th pel precision.
    1358       46065 :                 mv_unit.mv->x = mv_unit.mv->x << 1;
    1359       46065 :                 mv_unit.mv->y = mv_unit.mv->y << 1;
    1360       46065 :                 uint64_t best_distortion = (uint64_t)~0; // all bits set: largest possible starting value
    1361       46065 :                 signed short best_mv_x = 0;
    1362       46065 :                 signed short best_mv_y = 0;
    1363       46065 :                 signed short mv_x = (_MVXT(context_ptr->p_best_mv16x16[mv_index])) << 1;
    1364       46065 :                 signed short mv_y = (_MVYT(context_ptr->p_best_mv16x16[mv_index])) << 1;
    1365             :                 // Try the nine MVs at offsets -1..1 around (mv_x, mv_y): i is added to x, j to y.
    1366      184380 :                 for (signed short i = -1; i <= 1; i++) {
    1367      552455 :                     for (signed short j = -1; j <= 1; j++) {
    1368             : 
    1369      414140 :                         mv_unit.mv->x = mv_x + i;
    1370      414140 :                         mv_unit.mv->y = mv_y + j;
    1371             : 
    1372      414140 :                         av1_inter_prediction_function_table[is_highbd](
    1373             :                             NULL,  //picture_control_set_ptr,
    1374             :                             (uint32_t)interp_filters,
    1375             :                             &cu_ptr,
    1376             :                             0,//ref_frame_type,
    1377             :                             &mv_unit,
    1378             :                             0,//use_intrabc,
    1379             : #if OBMC_FLAG
    1380             :                             SIMPLE_TRANSLATION,
    1381             :                             0,
    1382             :                             0,
    1383             : #endif
    1384             :                             1,//compound_idx not used
    1385             :                             NULL,// interinter_comp not used
    1386             : #if II_COMP_FLAG
    1387             :                             NULL,
    1388             :                             NULL,
    1389             :                             NULL,
    1390             :                             NULL,
    1391             :                             0,
    1392             :                             0,
    1393             :                             0,
    1394             :                             0,
    1395             : #endif
    1396             :                             pu_origin_x,
    1397             :                             pu_origin_y,
    1398             :                             bsize,
    1399             :                             bsize,
    1400             :                             !is_highbd ? pic_ptr_ref : &reference_ptr,
    1401             :                             NULL,//ref_pic_list1,
    1402             :                             &prediction_ptr,
    1403             :                             local_origin_x,
    1404             :                             local_origin_y,
    1405             :                             1,//perform_chroma,
    1406      414140 :                             (uint8_t)encoder_bit_depth);
    1407             :                         // Score the candidate on the 16x16 luma block: the variance function's return value is the distortion; sse is written but not used.
    1408             :                         uint64_t distortion;
    1409      413835 :                         if(!is_highbd){
    1410      413854 :                             uint8_t *pred_y_ptr = pred[C_Y] + bsize * idx_y*stride_pred[C_Y] + bsize * idx_x;
    1411      413854 :                             uint8_t *src_y_ptr = src[C_Y] + bsize * idx_y*stride_src[C_Y] + bsize * idx_x;
    1412             : 
    1413      413854 :                             const aom_variance_fn_ptr_t *fn_ptr = &mefn_ptr[BLOCK_16X16];
    1414             : 
    1415             :                             unsigned int sse;
    1416      413854 :                             distortion = fn_ptr->vf(pred_y_ptr, stride_pred[C_Y], src_y_ptr, stride_src[C_Y], &sse);
    1417             :                         }else{
    1418           0 :                             uint16_t *pred_y_ptr = pred_16bit[C_Y] + bsize * idx_y*stride_pred[C_Y] + bsize * idx_x;
    1419           0 :                             uint16_t *src_y_ptr = src_16bit[C_Y] + bsize * idx_y*stride_src[C_Y] + bsize * idx_x;; // NOTE(review): the second ';' is a stray empty statement
    1420             : 
    1421             :                             unsigned int sse;
    1422           0 :                             distortion = variance_highbd_c(pred_y_ptr, stride_pred[C_Y], src_y_ptr, stride_src[C_Y], 16, 16, &sse);
    1423             :                         }
    1424             :                         // Strict '<' keeps the earliest candidate when distortions tie.
    1425      414290 :                         if (distortion < best_distortion) {
    1426      106024 :                             best_distortion = distortion;
    1427      106024 :                             best_mv_x = mv_unit.mv->x;
    1428      106024 :                             best_mv_y = mv_unit.mv->y;
    1429             :                         }
    1430             :                     }
    1431             :                 }
    1432             : 
    1433             :                 // Perform final pass using the 1/8 MV
    1434             :                 //AV1 MVs are always in 1/8th pel precision.
    1435       46215 :                 mv_unit.mv->x = best_mv_x;
    1436       46215 :                 mv_unit.mv->y = best_mv_y;
    1437             :                 // Re-run the prediction with the winning MV so the prediction buffer holds the best candidate, not the last one tried.
    1438       46215 :                 av1_inter_prediction_function_table[is_highbd](
    1439             :                     NULL,  //picture_control_set_ptr,
    1440             :                     (uint32_t)interp_filters,
    1441             :                     &cu_ptr,
    1442             :                     0,//ref_frame_type,
    1443             :                     &mv_unit,
    1444             :                     0,//use_intrabc,
    1445             : #if OBMC_FLAG
    1446             :                     SIMPLE_TRANSLATION,
    1447             :                     0,
    1448             :                     0,
    1449             : #endif
    1450             :                     1,//compound_idx not used
    1451             :                     NULL,// interinter_comp not used
    1452             : #if II_COMP_FLAG
    1453             :                     NULL,
    1454             :                     NULL,
    1455             :                     NULL,
    1456             :                     NULL,
    1457             :                     0,
    1458             :                     0,
    1459             :                     0,
    1460             :                     0,
    1461             : #endif
    1462             :                     pu_origin_x,
    1463             :                     pu_origin_y,
    1464             :                     bsize,
    1465             :                     bsize,
    1466             :                     !is_highbd ? pic_ptr_ref : &reference_ptr,
    1467             :                     NULL,//ref_pic_list1,
    1468             :                     &prediction_ptr,
    1469             :                     local_origin_x,
    1470             :                     local_origin_y,
    1471             :                     1,//perform_chroma,
    1472       46215 :                     (uint8_t)encoder_bit_depth);
    1473             : 
    1474             :             }
    1475             :         }
    1476             :     }
    1477             :     // Release the 16-bit reference copies allocated on the high-bit-depth path.
    1478        2879 :     if(is_highbd){
    1479           0 :         free(reference_ptr.buffer_y);
    1480           0 :         free(reference_ptr.buffer_cb);
    1481           0 :         free(reference_ptr.buffer_cr);
    1482             :     }
    1483             : 
    1484        2879 : }
    1485             : // Normalize one block: divide each accum[] sum by its count[] (OD_DIVU on accum + count/2, presumably a rounded division), overwrite the center-picture samples with the result, and add the squared change of every sample to *filtered_sse (luma) or *filtered_sse_uv (U and V).
    1486         480 : static void get_final_filtered_pixels(EbByte *src_center_ptr_start, // per-plane 8-bit buffers; read and overwritten when !is_highbd
    1487             :                                       uint16_t **altref_buffer_highbd_start, // per-plane 16-bit buffers; read and overwritten when is_highbd
    1488             :                                       uint32_t **accum, // per-plane sums, indexed linearly by k in row-major block order
    1489             :                                       uint16_t **count, // per-plane divisors, same indexing as accum
    1490             :                                       const uint32_t *stride, // picture strides; only stride[C_Y] and stride[C_U] are read here
    1491             :                                       int blk_y_src_offset, // offset of the block's first luma sample from the plane start
    1492             :                                       int blk_ch_src_offset, // offset of the block's first chroma sample (shared by U and V)
    1493             :                                       uint16_t blk_width_ch, // chroma block width in samples
    1494             :                                       uint16_t blk_height_ch, // chroma block height in samples
    1495             :                                       uint64_t *filtered_sse, // in/out: luma squared-difference total (added to, not reset here)
    1496             :                                       uint64_t *filtered_sse_uv, // in/out: U + V squared-difference total (added to, not reset here)
    1497             :                                       EbBool is_highbd){ // selects the 16-bit branch below
    1498             : 
    1499             :             int i, j, k; // i/j walk block rows/columns; k is the linear index into accum/count
    1500             : 
    1501         480 :             if(!is_highbd){
    1502             :                 // Process luma (BH rows of BW samples). The (uint64_t) cast covers only the first factor of the square; the product is evaluated in unsigned 64-bit arithmetic, so a negative difference wraps but diff*diff is still correct modulo 2^64.
    1503         480 :                 int pos = blk_y_src_offset;
    1504       31116 :                 for (i = 0, k = 0; i < BH; i++) {
    1505     1981440 :                     for (j = 0; j < BW; j++, k++) {
    1506     1950800 :                         (*filtered_sse) += (uint64_t)((int32_t)src_center_ptr_start[C_Y][pos] - (int32_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]))* ((int32_t)src_center_ptr_start[C_Y][pos] - (int32_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]));
    1507     1950750 :                         src_center_ptr_start[C_Y][pos] = (uint8_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]); // the SSE above must be taken before this overwrite
    1508     1950740 :                         pos++;
    1509             :                     }
    1510       30636 :                     pos += stride[C_Y] - BW; // jump to the start of the block's next row in the picture
    1511             :                 }
    1512             :                 // Process chroma: U and V share pos and k, and both feed *filtered_sse_uv
    1513         415 :                 pos = blk_ch_src_offset;
    1514       15776 :                 for (i = 0, k = 0; i < blk_height_ch; i++) {
    1515      506341 :                     for (j = 0; j < blk_width_ch; j++, k++) {
    1516      490980 :                         (*filtered_sse_uv) += (uint64_t)((int32_t)src_center_ptr_start[C_U][pos] - (int32_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]))* ((int32_t)src_center_ptr_start[C_U][pos] - (int32_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]));
    1517      490980 :                         (*filtered_sse_uv) += (uint64_t)((int32_t)src_center_ptr_start[C_V][pos] - (int32_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]))* ((int32_t)src_center_ptr_start[C_V][pos] - (int32_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]));
    1518      490981 :                         src_center_ptr_start[C_U][pos] = (uint8_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]);
    1519      490981 :                         src_center_ptr_start[C_V][pos] = (uint8_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]);
    1520      490981 :                         pos++;
    1521             :                     }
    1522       15361 :                     pos += stride[C_U] - blk_width_ch; // NOTE(review): stride[C_U] is applied to the V plane too - assumes both chroma strides match, confirm
    1523             :                 }
    1524             :             }else{
    1525             :                 // Process luma - same steps as the 8-bit branch, on the 16-bit buffers with a uint16_t store
    1526           0 :                 int pos = blk_y_src_offset;
    1527           0 :                 for (i = 0, k = 0; i < BH; i++) {
    1528           0 :                     for (j = 0; j < BW; j++, k++) {
    1529           0 :                         (*filtered_sse) += (uint64_t)((int32_t)altref_buffer_highbd_start[C_Y][pos] - (int32_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]))* ((int32_t)altref_buffer_highbd_start[C_Y][pos] - (int32_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]));
    1530           0 :                         altref_buffer_highbd_start[C_Y][pos] = (uint16_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]);
    1531           0 :                         pos++;
    1532             :                     }
    1533           0 :                     pos += stride[C_Y] - BW;
    1534             :                 }
    1535             :                 // Process chroma - same steps as the 8-bit branch, on the 16-bit buffers
    1536           0 :                 pos = blk_ch_src_offset;
    1537           0 :                 for (i = 0, k = 0; i < blk_height_ch; i++) {
    1538           0 :                     for (j = 0; j < blk_width_ch; j++, k++) {
    1539           0 :                         (*filtered_sse_uv) += (uint64_t)((int32_t)altref_buffer_highbd_start[C_U][pos] - (int32_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]))* ((int32_t)altref_buffer_highbd_start[C_U][pos] - (int32_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]));
    1540           0 :                         (*filtered_sse_uv) += (uint64_t)((int32_t)altref_buffer_highbd_start[C_V][pos] - (int32_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]))* ((int32_t)altref_buffer_highbd_start[C_V][pos] - (int32_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]));
    1541           0 :                         altref_buffer_highbd_start[C_U][pos] = (uint16_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]);
    1542           0 :                         altref_buffer_highbd_start[C_V][pos] = (uint16_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]);
    1543           0 :                         pos++;
    1544             :                     }
    1545           0 :                     pos += stride[C_U] - blk_width_ch;
    1546             :                 }
    1547             :             }
    1548         416 : }
    1549             : 
    1550             : // Produce the filtered alt-ref picture for one segment: for every BW x BH block in the segment, accumulate the center-frame block and a motion-compensated block from each other listed frame, then write the normalized result back over the center picture.
    1551             : // - core function; resets *filtered_sse and *filtered_sse_uv to 0 and lets get_final_filtered_pixels add to them. The only return statement visible here yields EB_ErrorNone (the EB_MALLOC_ALIGNED_ARRAY macro may add an error path - not visible in this section).
    1552         480 : static EbErrorType produce_temporally_filtered_pic(PictureParentControlSet **list_picture_control_set_ptr, // picture control sets, indexed by frame_index / index_center
    1553             :                                                    EbPictureBufferDesc **list_input_picture_ptr, // input pictures, same indexing
    1554             :                                                    uint8_t altref_strength, // forwarded unchanged to apply_filtering_block
    1555             :                                                    uint8_t index_center, // position of the center frame in both lists
    1556             :                                                    uint64_t *filtered_sse, // out: luma squared-difference total for this segment
    1557             :                                                    uint64_t *filtered_sse_uv, // out: chroma (U + V) squared-difference total for this segment
    1558             :                                                    MotionEstimationContext_t *me_context_ptr, // its me_context_ptr member is handed to the ME and MC calls
    1559             :                                                    int32_t segment_index, // converted to a segment x/y that selects the block range
    1560             :                                                    EbBool is_highbd) { // selects the 16-bit buffers and *_highbd helpers
    1561             :     int frame_index;
    1562             :     DECLARE_ALIGNED(16, uint32_t, accumulator[BLK_PELS * COLOR_CHANNELS]); // per-block sums, all planes back to back
    1563             :     DECLARE_ALIGNED(16, uint16_t, counter[BLK_PELS * COLOR_CHANNELS]); // per-block divisors, same layout
    1564         480 :     uint32_t *accum[COLOR_CHANNELS] = { accumulator, accumulator + BLK_PELS, accumulator + (BLK_PELS<<1) }; // per-plane views, BLK_PELS apart
    1565         480 :     uint16_t *count[COLOR_CHANNELS] = { counter, counter + BLK_PELS, counter + (BLK_PELS<<1) };
    1566             : 
    1567         480 :     EbByte predictor = { NULL };
    1568         480 :     uint16_t *predictor_16bit = { NULL };
    1569         480 :     if(!is_highbd){ // only the buffer matching the bit depth is allocated; the other pointer stays NULL
    1570         480 :         EB_MALLOC_ALIGNED_ARRAY(predictor, BLK_PELS * COLOR_CHANNELS);
    1571             :     }else{
    1572           0 :         EB_MALLOC_ALIGNED_ARRAY(predictor_16bit, BLK_PELS * COLOR_CHANNELS);
    1573             :     }
    1574         480 :     EbByte pred[COLOR_CHANNELS] = { predictor, predictor + BLK_PELS, predictor + (BLK_PELS<<1) }; // NOTE(review): when is_highbd, predictor is NULL and these are offsets from NULL - confirm callees never dereference them
    1575         480 :     uint16_t* pred_16bit[COLOR_CHANNELS] = { predictor_16bit, predictor_16bit + BLK_PELS, predictor_16bit + (BLK_PELS<<1) }; // NOTE(review): same concern for the 8-bit path
    1576             : 
    1577         480 :     EbByte src_center_ptr_start[COLOR_CHANNELS], src_center_ptr[COLOR_CHANNELS] = { NULL }; // *_start: plane origin; src_center_ptr: current block (the initializer applies to src_center_ptr only)
    1578         480 :     uint16_t* altref_buffer_highbd_start[COLOR_CHANNELS], *altref_buffer_highbd_ptr[COLOR_CHANNELS] = { NULL };
    1579             : 
    1580             :     uint32_t blk_row, blk_col;
    1581         480 :     int blk_y_src_offset = 0, blk_ch_src_offset = 0;
    1582             : 
    1583         480 :     PictureParentControlSet *picture_control_set_ptr_central = list_picture_control_set_ptr[index_center];
    1584         480 :     EbPictureBufferDesc *input_picture_ptr_central = list_input_picture_ptr[index_center];
    1585             : 
    1586         480 :     int encoder_bit_depth = (int)picture_control_set_ptr_central->sequence_control_set_ptr->static_config.encoder_bit_depth;
    1587             : 
    1588             :     // chroma subsampling shifts and the resulting chroma block dimensions
    1589         480 :     uint32_t ss_x = picture_control_set_ptr_central->sequence_control_set_ptr->subsampling_x;
    1590         480 :     uint32_t ss_y = picture_control_set_ptr_central->sequence_control_set_ptr->subsampling_y;
    1591         480 :     uint16_t blk_width_ch = (uint16_t)BW >> ss_x;
    1592         480 :     uint16_t blk_height_ch = (uint16_t)BH >> ss_y;
    1593             : 
    1594         480 :     uint32_t blk_cols = (uint32_t)(input_picture_ptr_central->width + BW - 1) / BW; // block columns, rounded up; NOTE(review): the original author was unsure whether
    1595         480 :     uint32_t blk_rows = (uint32_t)(input_picture_ptr_central->height + BH - 1) / BH; // partial edge blocks are actually filtered - confirm how they are handled
    1596             : 
    1597         480 :     uint32_t stride[COLOR_CHANNELS] = { input_picture_ptr_central->stride_y,
    1598         480 :                                         input_picture_ptr_central->stride_cb,
    1599         480 :                                         input_picture_ptr_central->stride_cr };
    1600         480 :     uint32_t stride_pred[COLOR_CHANNELS] = {BW, blk_width_ch, blk_width_ch}; // prediction blocks are stored packed, one block row per stride
    1601             : 
    1602         480 :     MeContext *context_ptr = me_context_ptr->me_context_ptr;
    1603             : 
    1604             :     uint32_t  x_seg_idx;
    1605             :     uint32_t  y_seg_idx;
    1606         480 :     uint32_t picture_width_in_b64 = blk_cols; // NOTE(review): named *_in_b64 but holds the BW x BH block counts computed above
    1607         480 :     uint32_t picture_height_in_b64 = blk_rows;
    1608         480 :     SEGMENT_CONVERT_IDX_TO_XY(segment_index, x_seg_idx, y_seg_idx, picture_control_set_ptr_central->tf_segments_column_count); // presumably splits segment_index into column/row indices - macro defined elsewhere
    1609         480 :     uint32_t x_b64_start_idx = SEGMENT_START_IDX(x_seg_idx, picture_width_in_b64,  picture_control_set_ptr_central->tf_segments_column_count);
    1610         480 :     uint32_t x_b64_end_idx   = SEGMENT_END_IDX  (x_seg_idx, picture_width_in_b64,  picture_control_set_ptr_central->tf_segments_column_count);
    1611         480 :     uint32_t y_b64_start_idx = SEGMENT_START_IDX(y_seg_idx, picture_height_in_b64, picture_control_set_ptr_central->tf_segments_row_count);
    1612         480 :     uint32_t y_b64_end_idx   = SEGMENT_END_IDX  (y_seg_idx, picture_height_in_b64, picture_control_set_ptr_central->tf_segments_row_count);
    1613             : 
    1614             :     // first sample of each plane of the center frame: buffer base advanced by origin_y rows and origin_x columns (chroma origins shifted by ss_y / ss_x)
    1615         480 :     src_center_ptr_start[C_Y] = input_picture_ptr_central->buffer_y +
    1616         480 :                             input_picture_ptr_central->origin_y*input_picture_ptr_central->stride_y +
    1617         480 :                             input_picture_ptr_central->origin_x;
    1618             : 
    1619         480 :     src_center_ptr_start[C_U] = input_picture_ptr_central->buffer_cb +
    1620         480 :                             (input_picture_ptr_central->origin_y>>ss_y)*input_picture_ptr_central->stride_cb +
    1621         480 :                             (input_picture_ptr_central->origin_x>>ss_x);
    1622             : 
    1623         480 :     src_center_ptr_start[C_V] = input_picture_ptr_central->buffer_cr +
    1624         480 :                             (input_picture_ptr_central->origin_y>>ss_y)*input_picture_ptr_central->stride_cr +
    1625         480 :                             (input_picture_ptr_central->origin_x>>ss_x);
    1626             : 
    1627         480 :     altref_buffer_highbd_start[C_Y] = picture_control_set_ptr_central->altref_buffer_highbd[C_Y] + // NOTE(review): computed even when !is_highbd - whether altref_buffer_highbd is set in 8-bit mode is not visible here
    1628         480 :                                 input_picture_ptr_central->origin_y*input_picture_ptr_central->stride_y +
    1629         480 :                                 input_picture_ptr_central->origin_x;
    1630             : 
    1631         480 :     altref_buffer_highbd_start[C_U] = picture_control_set_ptr_central->altref_buffer_highbd[C_U] +
    1632         480 :                                 (input_picture_ptr_central->origin_y>>ss_y)*input_picture_ptr_central->stride_bit_inc_cb + // NOTE(review): stride_bit_inc_cb here, but block offsets below use stride[C_U] (stride_cb) - confirm they agree
    1633         480 :                                 (input_picture_ptr_central->origin_x>>ss_x);
    1634             : 
    1635         480 :     altref_buffer_highbd_start[C_V] = picture_control_set_ptr_central->altref_buffer_highbd[C_V] +
    1636         480 :                                 (input_picture_ptr_central->origin_y>>ss_y)*input_picture_ptr_central->stride_bit_inc_cr +
    1637         480 :                                 (input_picture_ptr_central->origin_x>>ss_x);
    1638             : 
    1639         480 :     *filtered_sse       = 0;
    1640         480 :     *filtered_sse_uv    = 0;
    1641             : 
    1642         960 :     for (blk_row = y_b64_start_idx; blk_row < y_b64_end_idx; blk_row++) {
    1643         960 :         for (blk_col = x_b64_start_idx; blk_col < x_b64_end_idx; blk_col++) {
    1644             : 
    1645         480 :             blk_y_src_offset  = (blk_col * BW) + (blk_row * BH) * stride[C_Y];
    1646         480 :             blk_ch_src_offset  = (blk_col * blk_width_ch) + (blk_row * blk_height_ch) * stride[C_U]; // the same offset is used for U and V
    1647             : 
    1648             :             // reset accumulator and count for this block
    1649         480 :             memset(accumulator, 0, BLK_PELS * COLOR_CHANNELS * sizeof(accumulator[0]));
    1650         480 :             memset(counter, 0, BLK_PELS * COLOR_CHANNELS * sizeof(counter[0]));
    1651             : 
    1652             :             int blk_fw[N_16X16_BLOCKS];
    1653         480 :             int use_16x16_subblocks[N_32X32_BLOCKS] = {0};
    1654             :             int me_16x16_subblock_vf[N_16X16_BLOCKS];
    1655             :             int me_32x32_subblock_vf[N_32X32_BLOCKS];
    1656             : 
    1657         480 :             populate_list_with_value(blk_fw, 16, INIT_WEIGHT); // NOTE(review): literal 16 here, N_16X16_BLOCKS elsewhere - presumably equal, confirm
    1658             : 
    1659             :             // for every frame to filter (past + future + the center frame itself)
    1660        3840 :             for (frame_index = 0; frame_index < (picture_control_set_ptr_central->past_altref_nframes + picture_control_set_ptr_central->future_altref_nframes + 1); frame_index++) {
    1661             : 
    1662        3360 :                 if(!is_highbd){
    1663        3360 :                     src_center_ptr[C_Y] = src_center_ptr_start[C_Y] + blk_y_src_offset;
    1664        3360 :                     src_center_ptr[C_U] = src_center_ptr_start[C_U] + blk_ch_src_offset;
    1665        3360 :                     src_center_ptr[C_V] = src_center_ptr_start[C_V] + blk_ch_src_offset;
    1666             :                 }else{
    1667           0 :                     altref_buffer_highbd_ptr[C_Y] = altref_buffer_highbd_start[C_Y] + blk_y_src_offset;
    1668           0 :                     altref_buffer_highbd_ptr[C_U] = altref_buffer_highbd_start[C_U] + blk_ch_src_offset;
    1669           0 :                     altref_buffer_highbd_ptr[C_V] = altref_buffer_highbd_start[C_V] + blk_ch_src_offset;
    1670             :                 }
    1671             : 
    1672             :                 // ------------
    1673             :                 // Step 1: motion estimation + compensation
    1674             :                 // ------------
    1675             : 
    1676             :                 // if frame to process is the center frame
    1677        3360 :                 if (frame_index == index_center) {
    1678             :                     // skip MC (central frame): the prediction is a plain copy of the center block
    1679             : 
    1680         480 :                     populate_list_with_value(blk_fw, N_16X16_BLOCKS, 2);
    1681         480 :                     populate_list_with_value(use_16x16_subblocks, N_32X32_BLOCKS, 0);
    1682             : 
    1683         480 :                     if(!is_highbd){
    1684         480 :                         pic_copy_kernel_8bit(src_center_ptr[C_Y], stride[C_Y], pred[C_Y], stride_pred[C_Y], BW, BH);
    1685         480 :                         pic_copy_kernel_8bit(src_center_ptr[C_U], stride[C_U], pred[C_U], stride_pred[C_U], blk_width_ch, blk_height_ch);
    1686         480 :                         pic_copy_kernel_8bit(src_center_ptr[C_V], stride[C_V], pred[C_V], stride_pred[C_V], blk_width_ch, blk_height_ch);
    1687             :                     }else{
    1688           0 :                         pic_copy_kernel_16bit(altref_buffer_highbd_ptr[C_Y], stride[C_Y], pred_16bit[C_Y], stride_pred[C_Y], BW, BH);
    1689           0 :                         pic_copy_kernel_16bit(altref_buffer_highbd_ptr[C_U], stride[C_U], pred_16bit[C_U], stride_pred[C_U], blk_width_ch, blk_height_ch);
    1690           0 :                         pic_copy_kernel_16bit(altref_buffer_highbd_ptr[C_V], stride[C_V], pred_16bit[C_V], stride_pred[C_V], blk_width_ch, blk_height_ch);
    1691             :                     }
    1692             : 
    1693             :                 }else{
    1694             :                     // Initialize ME context
    1695        2880 :                     create_ME_context_and_picture_control(me_context_ptr,
    1696        2880 :                                                           list_picture_control_set_ptr[frame_index],
    1697        2880 :                                                           list_picture_control_set_ptr[index_center],
    1698             :                                                           input_picture_ptr_central,
    1699             :                                                           blk_row,
    1700             :                                                           blk_col,
    1701             :                                                           ss_x,
    1702             :                                                           ss_y);
    1703             : 
    1704             :                     // Perform ME - context_ptr will store the outputs (MVs, buffers, etc)
    1705             :                     // Block-based MC using open-loop HME + refinement
    1706        2880 :                     motion_estimate_lcu( picture_control_set_ptr_central, // source picture control set -> references come from here
    1707        2880 :                                         (uint32_t)blk_row*blk_cols + blk_col, // raster index of the block within the picture
    1708             :                                         (uint32_t)blk_col*BW, // x block
    1709             :                                         (uint32_t)blk_row*BH, // y block
    1710             :                                         context_ptr,
    1711             :                                         input_picture_ptr_central); // source picture
    1712             : 
    1713        2880 :                     EbBool use_16x16_subblocks_only = EB_TRUE; // TODO: hardcoded to use 16x16 subblocks only, however,
    1714             :                                                                // the support for the use of 32x32 subblocks as well is almost complete
    1715             :                                                                // experiments have shown low gains by adding this possibility
    1716        2880 :                     populate_list_with_value(use_16x16_subblocks,N_32X32_BLOCKS,1);
    1717             : 
    1718             :                     // Perform MC using the information acquired using the ME step
    1719        2880 :                     tf_inter_prediction(picture_control_set_ptr_central,
    1720             :                                         context_ptr,
    1721        2880 :                                         list_input_picture_ptr[frame_index],
    1722             :                                         pred,
    1723             :                                         pred_16bit,
    1724             :                                         stride_pred,
    1725             :                                         src_center_ptr,
    1726             :                                         altref_buffer_highbd_ptr,
    1727             :                                         stride,
    1728             :                                         (uint32_t)blk_col*BW,
    1729             :                                         (uint32_t)blk_row*BH,
    1730             :                                         ss_x,
    1731             :                                         ss_y,
    1732             :                                         use_16x16_subblocks,
    1733             :                                         encoder_bit_depth);
    1734             : 
    1735             :                     // Retrieve distortion (variance) on 32x32 and 16x16 sub-blocks, luma only
    1736        2880 :                     if(!is_highbd)
    1737        2880 :                         get_ME_distortion(me_32x32_subblock_vf,
    1738             :                                           me_16x16_subblock_vf,
    1739             :                                           pred[C_Y],
    1740        2880 :                                           stride_pred[C_Y],
    1741             :                                           src_center_ptr[C_Y],
    1742        2880 :                                           stride[C_Y]);
    1743             :                     else
    1744           0 :                         get_ME_distortion_highbd(me_32x32_subblock_vf,
    1745             :                                                  me_16x16_subblock_vf,
    1746             :                                                  pred_16bit[C_Y],
    1747           0 :                                                  stride_pred[C_Y],
    1748             :                                                  altref_buffer_highbd_ptr[C_Y],
    1749           0 :                                                  stride[C_Y]);
    1750             : 
    1751             :                     // Get sub-block filter weights depending on the variance (written to blk_fw)
    1752        2880 :                     get_blk_fw_using_dist(me_32x32_subblock_vf,
    1753             :                                           me_16x16_subblock_vf,
    1754             :                                           use_16x16_subblocks_only,
    1755             :                                           blk_fw,
    1756             :                                           is_highbd);
    1757             :                 }
    1758             : 
    1759             :                 // ------------
    1760             :                 // Step 2: temporal filtering using the motion compensated blocks
    1761             :                 // ------------
    1762             : 
    1763             :                 // if frame to process is the center frame
    1764        3360 :                 if (frame_index == index_center) {
    1765         480 :                     if(!is_highbd)
    1766         480 :                         apply_filtering_central(pred,
    1767             :                                                 accum,
    1768             :                                                 count,
    1769             :                                                 BW,
    1770             :                                                 BH,
    1771             :                                                 ss_x,
    1772             :                                                 ss_y);
    1773             :                     else
    1774           0 :                         apply_filtering_central_highbd(pred_16bit,
    1775             :                                                        accum,
    1776             :                                                        count,
    1777             :                                                        BW,
    1778             :                                                        BH,
    1779             :                                                        ss_x,
    1780             :                                                        ss_y);
    1781             :                 }else{
    1782             :                     // split filtering function into 32x32 blocks (a 2 x 2 grid of BW/2 x BH/2 quadrants)
    1783             :                     // TODO: implement a 64x64 SIMD version
    1784        8640 :                     for(int block_row = 0; block_row<2; block_row++){
    1785       17280 :                         for(int block_col = 0; block_col<2; block_col++) {
    1786       11520 :                             apply_filtering_block(block_row,
    1787             :                                                   block_col,
    1788             :                                                   src_center_ptr,
    1789             :                                                   altref_buffer_highbd_ptr,
    1790             :                                                   pred,
    1791             :                                                   pred_16bit,
    1792             :                                                   accum,
    1793             :                                                   count,
    1794             :                                                   stride,
    1795             :                                                   stride_pred,
    1796             :                                                   BW >> 1, // fixed 32x32
    1797             :                                                   BH >> 1, // fixed 32x32
    1798             :                                                   ss_x, // chroma sub-sampling in x
    1799             :                                                   ss_y, // chroma sub-sampling in y
    1800             :                                                   altref_strength,
    1801             :                                                   blk_fw,
    1802             :                                                   is_highbd);
    1803             :                         }
    1804             :                     }
    1805             :                 }
    1806             :             }
    1807             : 
    1808             :             // Normalize filter output to produce temporally filtered frame (overwrites the center picture block and adds to the SSE totals)
    1809         480 :             get_final_filtered_pixels(src_center_ptr_start,
    1810             :                                       altref_buffer_highbd_start,
    1811             :                                       accum,
    1812             :                                       count,
    1813             :                                       stride,
    1814             :                                       blk_y_src_offset,
    1815             :                                       blk_ch_src_offset,
    1816             :                                       blk_width_ch,
    1817             :                                       blk_height_ch,
    1818             :                                       filtered_sse,
    1819             :                                       filtered_sse_uv,
    1820             :                                       is_highbd);
    1821             :         }
    1822             :     }
    1823             : 
    1824         480 :     if(!is_highbd) // release whichever prediction buffer was allocated at the top
    1825         480 :         EB_FREE_ALIGNED_ARRAY(predictor);
    1826             :     else
    1827           0 :         EB_FREE_ALIGNED_ARRAY(predictor_16bit);
    1828             : 
    1829         480 :     return EB_ErrorNone;
    1830             : }
    1831             : 
    1832             : // This is an adaptation of the method in the following paper:
    1833             : // Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
    1834             : // estimation using Laplacian operator and adaptive edge detection,"
    1835             : // Proc. 3rd International Symposium on Communications, Control and
    1836             : // Signal Processing, 2008, St Julians, Malta.
    1837             : // Returns the noise estimate, or -1.0 when fewer than SMOOTH_THRESHOLD non-edge pixels were found
    1838             : // function from libaom
    1839             : // Standard bit depth input (= 8 bits); a separate estimate_noise_highbd exists for 16-bit samples
    1840             : // Operates on the Y component only: src is expected to be the luma plane and stride_y its row pitch; the outermost rows and columns are skipped
    1841           8 : static double estimate_noise(const uint8_t *src,
    1842             :                              uint16_t width,
    1843             :                              uint16_t height,
    1844             :                              uint16_t stride_y) {
    1845           8 :     int64_t sum = 0; // running total of |Laplacian| over accepted pixels
    1846           8 :     int64_t num = 0; // number of accepted (non-edge) pixels
    1847             : 
    1848        2872 :     for (int i = 1; i < height - 1; ++i) { // interior pixels only: every 3x3 neighbourhood read below stays inside the plane
    1849     1830100 :         for (int j = 1; j < width - 1; ++j) {
    1850     1827230 :             const int k = i * stride_y + j;
    1851             :             // Sobel gradients
    1852     1827230 :             const int Gx = (src[k - stride_y - 1] - src[k - stride_y + 1]) +
    1853     1827230 :                            (src[k + stride_y - 1] - src[k + stride_y + 1]) +
    1854     1827230 :                            2 * (src[k - 1] - src[k + 1]);
    1855     1827230 :             const int Gy = (src[k - stride_y - 1] - src[k + stride_y - 1]) +
    1856     1827230 :                            (src[k - stride_y + 1] - src[k + stride_y + 1]) +
    1857     1827230 :                            2 * (src[k - stride_y] - src[k + stride_y]);
    1858     1827230 :             const int Ga = abs(Gx) + abs(Gy); // gradient magnitude approximated as |Gx| + |Gy|
    1859     1827230 :             if (Ga < EDGE_THRESHOLD) {  // Do not consider edge pixels to estimate the noise
    1860             :                 // Find Laplacian (3x3 kernel: centre weight 4, edge neighbours -2, corner neighbours +1)
    1861     1284960 :                 const int v =
    1862     1284960 :                         4 * src[k] -
    1863     1284960 :                         2 * (src[k - 1] + src[k + 1] + src[k - stride_y] + src[k + stride_y]) +
    1864     1284960 :                         (src[k - stride_y - 1] + src[k - stride_y + 1] + src[k + stride_y - 1] +
    1865     1284960 :                          src[k + stride_y + 1]);
    1866     1284960 :                 sum += abs(v);
    1867     1284960 :                 ++num;
    1868             :             }
    1869             :         }
    1870             :     }
    1871             :     // If very few smooth pels, return -1 since the estimate is unreliable
    1872           8 :     if (num < SMOOTH_THRESHOLD)
    1873           0 :         return -1.0;
    1874             : 
    1875           8 :     const double sigma = (double)sum / (6 * num) * SQRT_PI_BY_2; // mean |Laplacian| divided by 6, scaled by SQRT_PI_BY_2
    1876             : 
    1877           8 :     return sigma;
    1878             : }
    1879             : 
    1880             : // Noise estimation for highbd: mean absolute Laplacian (after a bd - 8 scaling) over non-edge interior pixels of src, times SQRT_PI_BY_2 / 6; returns -1.0 when fewer than SMOOTH_THRESHOLD pixels qualify
    1881           0 : static double estimate_noise_highbd(const uint16_t *src, // 16-bit samples; indexed directly as src[i * stride + j]
    1882             :                                     int width,
    1883             :                                     int height,
    1884             :                                     int stride, // row pitch in uint16_t elements, since k = i * stride + j indexes src directly
    1885             :                                     int bd) { // bit depth; only used as the scaling amount (bd - 8)
    1886           0 :     int64_t sum = 0; // accumulated scaled |Laplacian| over the smooth pixels
    1887           0 :     int64_t num = 0; // number of smooth (non-edge) pixels counted
    1888             : 
    1889           0 :     for (int i = 1; i < height - 1; ++i) { // rows 1 .. height - 2: the 3x3 neighbourhood needs one row above and below
    1890           0 :         for (int j = 1; j < width - 1; ++j) { // columns 1 .. width - 2, for the same reason
    1891           0 :             const int k = i * stride + j;
    1892             :             // Sobel gradients
    1893           0 :             const int Gx = (src[k - stride - 1] - src[k - stride + 1]) +
    1894           0 :                            (src[k + stride - 1] - src[k + stride + 1]) +
    1895           0 :                            2 * (src[k - 1] - src[k + 1]);
    1896           0 :             const int Gy = (src[k - stride - 1] - src[k + stride - 1]) +
    1897           0 :                            (src[k - stride + 1] - src[k + stride + 1]) +
    1898           0 :                            2 * (src[k - stride] - src[k + stride]);
    1899           0 :             const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bd - 8); // scaling amount is bd - 8, so this is a divide by 2^2 only when bd == 10 (assuming ROUND_POWER_OF_TWO is a rounding right shift - TODO confirm)
    1900           0 :             if (Ga < EDGE_THRESHOLD) {  // Do not consider edge pixels to estimate the noise
    1901             :                 // Find Laplacian
    1902           0 :                 const int v =
    1903           0 :                         4 * src[k] -
    1904           0 :                         2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) +
    1905           0 :                         (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] +
    1906           0 :                          src[k + stride + 1]);
    1907           0 :                 sum += ROUND_POWER_OF_TWO(abs(v), bd - 8); // same bd - 8 scaling as applied to Ga above
    1908           0 :                 ++num;
    1909             :             }
    1910             :         }
    1911             :     }
    1912             :     // If very few smooth pels, return -1 since the estimate is unreliable
    1913           0 :     if (num < SMOOTH_THRESHOLD) return -1.0;
    1914             : 
    1915           0 :     const double sigma = (double)sum / (6 * num) * SQRT_PI_BY_2; // average response divided by 6, then multiplied by SQRT_PI_BY_2
    1916           0 :     return sigma;
    1917             : }
    1918             : 
    1919             : // Adjust the filtering strength stored in *altref_strength from the estimated noise level (nframes is not modified in this function)
    1920           8 : static void adjust_filter_strength(
    1921             : #if TWO_PASS
    1922             :                                    PictureParentControlSet *picture_control_set_ptr_central,
    1923             : #endif
    1924             :                                    double noise_level, // values <= 0 leave the strength unadjusted (see the noise_level > 0 test below)
    1925             :                                    uint8_t *altref_strength, // in/out: read as the starting strength, overwritten with the result
    1926             :                                    EbBool is_highbd,
    1927             :                                    uint32_t encoder_bit_depth) {
    1928             : 
    1929           8 :     int strength = *altref_strength, adj_strength=strength;
    1930             : 
    1931             :     // Adjust the strength of the temporal filtering
    1932             :     // based on the amount of noise present in the frame
    1933             :     // adjustment in the integer range [-2, 1]
    1934             :     // if noiselevel <= 0, it means that the estimation was
    1935             :     // unsuccessful and therefore keep the strength as it was set
    1936           8 :     if (noise_level > 0) {
    1937             :         int noiselevel_adj;
    1938           8 :         if (noise_level < 0.75)
    1939           8 :             noiselevel_adj = -2;
    1940           0 :         else if (noise_level < 1.75)
    1941           0 :             noiselevel_adj = -1;
    1942           0 :         else if (noise_level < 4.0)
    1943           0 :             noiselevel_adj = 0;
    1944             :         else
    1945           0 :             noiselevel_adj = 1;
    1946             : #if TWO_PASS
    1947           8 :         if (picture_control_set_ptr_central->sequence_control_set_ptr->use_input_stat_file && // only when an input stat file is in use ...
    1948           0 :             picture_control_set_ptr_central->temporal_layer_index == 0 && picture_control_set_ptr_central->sc_content_detected == 0) { // ... for temporal layer 0 pictures with sc_content_detected == 0
    1949           0 :             if (noiselevel_adj < 0) { // only negative adjustments are revisited
    1950           0 :                 if ((picture_control_set_ptr_central->referenced_area_avg < 20 && picture_control_set_ptr_central->slice_type == 2) || // NOTE(review): 2 is a raw slice_type value - confirm which slice type it denotes
    1951           0 :                     (picture_control_set_ptr_central->referenced_area_avg < 30 && picture_control_set_ptr_central->slice_type != 2)) {
    1952           0 :                     noiselevel_adj = CLIP3(-2, 0, noiselevel_adj - 1); // one step lower; presumably CLIP3(low, high, value) bounds it to [-2, 0] - confirm argument order
    1953             :                 }
    1954             :                 else
    1955           0 :                     noiselevel_adj = 0; // otherwise the negative adjustment is cancelled
    1956             :             }
    1957             :         }
    1958             : #endif
    1959           8 :         adj_strength += noiselevel_adj;
    1960             :     }
    1961             : 
    1962           8 :     if(adj_strength > 0) // results that are zero or negative end up as strength 0
    1963           8 :         strength = adj_strength;
    1964             :     else
    1965           0 :         strength = 0;
    1966             : 
    1967             :     // if highbd, adjust filter strength strength = strength + 2*(bit depth - 8)
    1968           8 :     if(is_highbd)
    1969           0 :         strength = strength + 2 * (encoder_bit_depth - 8);
    1970             : 
    1971             : #if DEBUG_TF
    1972             :     printf("[DEBUG] noise level: %g, strength = %d, adj_strength = %d\n", noise_level, *altref_strength, strength); // the value labelled "strength" is the incoming *altref_strength; the one labelled "adj_strength" is the final strength
    1973             : #endif
    1974             : 
    1975           8 :     *altref_strength = (uint8_t)strength; // NOTE(review): narrowing cast; no upper-range check on strength is visible in this function
    1976             : 
    1977             :     // TODO: apply further refinements to the filter parameters according to 1st pass statistics
    1978             : 
    1979           8 : }
    1980             : 
    1981           8 : static void pad_and_decimate_filtered_pic(PictureParentControlSet *picture_control_set_ptr_central){ // regenerates padding on the PA reference padded picture, then rebuilds its 1/4 and 1/16 versions
    1982             :     // reference structures (padded pictures + downsampled versions)
    1983           8 :     EbPaReferenceObject *src_object = (EbPaReferenceObject*)picture_control_set_ptr_central->pa_reference_picture_wrapper_ptr->object_ptr;
    1984           8 :     EbPictureBufferDesc *padded_pic_ptr = src_object->input_padded_picture_ptr;
    1985           8 :     generate_padding( // only the Y buffer is passed here
    1986             :         &(padded_pic_ptr->buffer_y[C_Y]), // NOTE(review): buffer_y is indexed with the plane constant C_Y; equals buffer_y itself only if C_Y is 0 - confirm
    1987           8 :         padded_pic_ptr->stride_y,
    1988           8 :         padded_pic_ptr->width,
    1989           8 :         padded_pic_ptr->height,
    1990           8 :         padded_pic_ptr->origin_x,
    1991           8 :         padded_pic_ptr->origin_y);
    1992             : 
    1993             :     // 1/4 & 1/16 input picture decimation
    1994           8 :     DownsampleDecimationInputPicture(
    1995             :         picture_control_set_ptr_central,
    1996             :         padded_pic_ptr,
    1997             :         src_object->quarter_decimated_picture_ptr,
    1998             :         src_object->sixteenth_decimated_picture_ptr);
    1999             : 
    2000             :     // 1/4 & 1/16 input picture downsampling through filtering (only when down_sampling_method_me_search is ME_FILTERED_DOWNSAMPLED)
    2001           8 :     SequenceControlSet *sequence_control_set_ptr = (SequenceControlSet*)picture_control_set_ptr_central->sequence_control_set_wrapper_ptr->object_ptr;
    2002           8 :     if (sequence_control_set_ptr->down_sampling_method_me_search == ME_FILTERED_DOWNSAMPLED)
    2003           4 :         DownsampleFilteringInputPicture(
    2004             :             picture_control_set_ptr_central,
    2005             :             padded_pic_ptr,
    2006             :             src_object->quarter_filtered_picture_ptr,
    2007             :             src_object->sixteenth_filtered_picture_ptr);
    2008           8 : }
    2009             : 
    2010             : // Save the original enhanced_picture_ptr planes into separately allocated buffers (the picture itself is to be replaced by the temporally filtered pic); returns EB_ErrorNone at the end
    2011           0 : static EbErrorType save_src_pic_buffers(PictureParentControlSet *picture_control_set_ptr_central,
    2012             :                                         uint32_t ss_y, // vertical chroma subsampling shift, applied to the chroma height below
    2013             :                                         EbBool is_highbd){ // when set, the *_bit_inc_* buffers are saved as well
    2014             : 
    2015             :     // allocate memory for the copy of the original enhanced buffer
    2016           0 :     EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_ptr[C_Y], // NOTE(review): EB_MALLOC_ARRAY's behaviour on allocation failure is not visible here - confirm
    2017             :               picture_control_set_ptr_central->enhanced_picture_ptr->luma_size);
    2018           0 :     EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_ptr[C_U],
    2019             :               picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
    2020           0 :     EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_ptr[C_V],
    2021             :               picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
    2022             : 
    2023             :     // if highbd, allocate memory for the copy of the original enhanced buffer - bit inc
    2024           0 :     if(is_highbd){
    2025           0 :         EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_Y],
    2026             :                         picture_control_set_ptr_central->enhanced_picture_ptr->luma_size);
    2027           0 :         EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_U],
    2028             :                         picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
    2029           0 :         EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_V],
    2030             :                         picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
    2031             :     }
    2032             : 
    2033             :     // copy buffers
    2034             :     // row counts: luma is height + origin_y + origin_bot_y; chroma is that same sum shifted right by ss_y
    2035           0 :     uint32_t height_y = (uint32_t)(picture_control_set_ptr_central->enhanced_picture_ptr->height +
    2036           0 :                                   picture_control_set_ptr_central->enhanced_picture_ptr->origin_y + picture_control_set_ptr_central->enhanced_picture_ptr->origin_bot_y);
    2037           0 :     uint32_t height_uv = (uint32_t)((picture_control_set_ptr_central->enhanced_picture_ptr->height +
    2038           0 :                                    picture_control_set_ptr_central->enhanced_picture_ptr->origin_y + picture_control_set_ptr_central->enhanced_picture_ptr->origin_bot_y) >> ss_y);
    2039             : 
    2040           0 :     assert(height_y * picture_control_set_ptr_central->enhanced_picture_ptr->stride_y == picture_control_set_ptr_central->enhanced_picture_ptr->luma_size); // rows * stride must equal the size used for the allocations above
    2041           0 :     assert(height_uv * picture_control_set_ptr_central->enhanced_picture_ptr->stride_cb == picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
    2042           0 :     assert(height_uv * picture_control_set_ptr_central->enhanced_picture_ptr->stride_cr == picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
    2043             : 
    2044           0 :     pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_y, // Y; the copy starts at the buffer base pointer
    2045           0 :                          picture_control_set_ptr_central->enhanced_picture_ptr->stride_y,
    2046             :                          picture_control_set_ptr_central->save_enhanced_picture_ptr[C_Y],
    2047           0 :                          picture_control_set_ptr_central->enhanced_picture_ptr->stride_y,
    2048           0 :                          picture_control_set_ptr_central->enhanced_picture_ptr->stride_y, // stride passed again, presumably as the copy width so that whole rows are copied - confirm against pic_copy_kernel_8bit
    2049             :                          height_y);
    2050             : 
    2051           0 :     pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_cb, // U
    2052           0 :                          picture_control_set_ptr_central->enhanced_picture_ptr->stride_cb,
    2053             :                          picture_control_set_ptr_central->save_enhanced_picture_ptr[C_U],
    2054           0 :                          picture_control_set_ptr_central->enhanced_picture_ptr->stride_cb,
    2055           0 :                          picture_control_set_ptr_central->enhanced_picture_ptr->stride_cb,
    2056             :                          height_uv);
    2057             : 
    2058           0 :     pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_cr, // V
    2059           0 :                          picture_control_set_ptr_central->enhanced_picture_ptr->stride_cr,
    2060             :                          picture_control_set_ptr_central->save_enhanced_picture_ptr[C_V],
    2061           0 :                          picture_control_set_ptr_central->enhanced_picture_ptr->stride_cr,
    2062           0 :                          picture_control_set_ptr_central->enhanced_picture_ptr->stride_cr,
    2063             :                          height_uv);
    2064             : 
    2065           0 :     if(is_highbd){
    2066             :         // if highbd, copy bit inc buffers (same 8-bit copy kernel, using the stride_bit_inc_* strides)
    2067             :         // Y
    2068           0 :         pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_bit_inc_y,
    2069           0 :                              picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_y,
    2070             :                              picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_Y],
    2071           0 :                              picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_y,
    2072           0 :                              picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_y,
    2073             :                              height_y);
    2074             :         // U
    2075           0 :         pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_bit_inc_cb,
    2076           0 :                              picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cb,
    2077             :                              picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_U],
    2078           0 :                              picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cb,
    2079           0 :                              picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cb,
    2080             :                              height_uv);
    2081             :         // V
    2082           0 :         pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_bit_inc_cr,
    2083           0 :                              picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cr,
    2084             :                              picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_V],
    2085           0 :                              picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cr,
    2086           0 :                              picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cr,
    2087             :                              height_uv);
    2088             :     }
    2089             : 
    2090           0 :     return EB_ErrorNone;
    2091             : 
    2092             : }
    2093             : 
    2094         477 : EbErrorType svt_av1_init_temporal_filtering(PictureParentControlSet **list_picture_control_set_ptr, // called once per segment: one-time picture prep under temp_filt_mutex, filtering of this segment, then finalization by the call that completes the segment count
    2095             :                                             PictureParentControlSet *picture_control_set_ptr_central,
    2096             :                                             MotionEstimationContext_t *me_context_ptr,
    2097             :                                             int32_t segment_index) {
    2098             :     uint8_t *altref_strength_ptr, index_center;
    2099             :     EbPictureBufferDesc *central_picture_ptr;
    2100             : 
    2101         477 :     altref_strength_ptr = &(picture_control_set_ptr_central->altref_strength);
    2102             : 
    2103             :     // index of the central source frame
    2104         477 :     index_center = picture_control_set_ptr_central->past_altref_nframes;
    2105             : 
    2106             :     // if this assertion does not fail (as I think it should not), then remove picture_control_set_ptr_central from the input parameters of init_temporal_filtering()
    2107         477 :     assert(list_picture_control_set_ptr[index_center] == picture_control_set_ptr_central);
    2108             : 
    2109             :     // source central frame picture buffer
    2110         477 :     central_picture_ptr = picture_control_set_ptr_central->enhanced_picture_ptr;
    2111             : 
    2112         477 :     uint32_t encoder_bit_depth = picture_control_set_ptr_central->sequence_control_set_ptr->static_config.encoder_bit_depth;
    2113         477 :     EbBool is_highbd = (encoder_bit_depth == 8) ? (uint8_t)EB_FALSE : (uint8_t)EB_TRUE; // any bit depth other than 8 takes the highbd path
    2114             : 
    2115             :     // chroma subsampling
    2116         477 :     uint32_t ss_x = picture_control_set_ptr_central->sequence_control_set_ptr->subsampling_x;
    2117         477 :     uint32_t ss_y = picture_control_set_ptr_central->sequence_control_set_ptr->subsampling_y;
    2118             : 
    2119             :     // only the first caller to find temp_filt_prep_done == 0 (checked while holding temp_filt_mutex) performs the picture based prep
    2120         477 :     eb_block_on_mutex(picture_control_set_ptr_central->temp_filt_mutex);
    2121         480 :     if (picture_control_set_ptr_central->temp_filt_prep_done == 0){
    2122             : 
    2123           8 :         picture_control_set_ptr_central->temp_filt_prep_done = 1;
    2124             : 
    2125             :         // allocate 16 bit buffer
    2126           8 :         if (is_highbd) {
    2127           0 :             EB_MALLOC_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_Y], central_picture_ptr->luma_size); // NOTE(review): failure behaviour of EB_MALLOC_ARRAY is not visible here; an early return at this point would leave temp_filt_mutex held - confirm
    2128           0 :             EB_MALLOC_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_U], central_picture_ptr->chroma_size);
    2129           0 :             EB_MALLOC_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_V], central_picture_ptr->chroma_size);
    2130             : 
    2131             :             // pack byte buffers to 16 bit buffer
    2132           0 :             pack_highbd_pic(central_picture_ptr, picture_control_set_ptr_central->altref_buffer_highbd, ss_x, ss_y, EB_TRUE);
    2133             :         }
    2134             : 
    2135             :         // Estimate source noise level
    2136             :         double noise_level;
    2137           8 :         if(is_highbd){
    2138           0 :             noise_level = estimate_noise_highbd(picture_control_set_ptr_central->altref_buffer_highbd[C_Y], // Y only; NOTE(review): the buffer base is passed without the origin_y * stride_y + origin_x offset that the 8-bit path applies - confirm the layout produced by pack_highbd_pic
    2139           0 :                                                 central_picture_ptr->width,
    2140           0 :                                                 central_picture_ptr->height,
    2141           0 :                                                 central_picture_ptr->stride_y,
    2142             :                                                 encoder_bit_depth);
    2143             :         }
    2144             :         else{
    2145           8 :             EbByte buffer_y = central_picture_ptr->buffer_y + central_picture_ptr->origin_y*central_picture_ptr->stride_y + central_picture_ptr->origin_x; // advance from the buffer base by origin_y rows and origin_x columns
    2146           8 :             noise_level = estimate_noise(buffer_y, // Y only
    2147           8 :                                          central_picture_ptr->width,
    2148           8 :                                          central_picture_ptr->height,
    2149           8 :                                          central_picture_ptr->stride_y);
    2150             :         }
    2151             : 
    2152             :         // adjust filter parameter based on the estimated noise of the picture (updates altref_strength of the central picture in place)
    2153             : #if TWO_PASS
    2154           8 :         adjust_filter_strength( picture_control_set_ptr_central,
    2155             :                                 noise_level,
    2156             :                                 altref_strength_ptr,
    2157             :                                 is_highbd,
    2158             :                                 encoder_bit_depth);
    2159             : #else
    2160             :         adjust_filter_strength(noise_level, altref_strength_ptr, is_highbd, encoder_bit_depth);
    2161             : #endif
    2162             : 
    2163             :         // Pad chroma reference samples - once only per picture
    2164          64 :         for (int i = 0; i < (picture_control_set_ptr_central->past_altref_nframes + picture_control_set_ptr_central->future_altref_nframes + 1); i++) {
    2165          56 :             EbPictureBufferDesc *pic_ptr_ref = list_picture_control_set_ptr[i]->enhanced_picture_ptr;
    2166             : #if FIX_ALTREF
    2167          56 :             if (i != picture_control_set_ptr_central->past_altref_nframes) // with FIX_ALTREF the central picture (index past_altref_nframes) is skipped
    2168             : #endif
    2169          48 :                 generate_padding_pic(pic_ptr_ref,
    2170             :                     ss_x,
    2171             :                     ss_y,
    2172             :                     is_highbd);
    2173             :         }
    2174             : 
    2175           8 :         picture_control_set_ptr_central->temporal_filtering_on = EB_TRUE; // set temporal filtering flag ON for current picture
    2176             : 
    2177             :         // save original source picture (to be replaced by the temporally filtered pic)
    2178             :         // if stat_report is enabled for PSNR computation
    2179           8 :         if(picture_control_set_ptr_central->sequence_control_set_ptr->static_config.stat_report){
    2180           0 :             save_src_pic_buffers(picture_control_set_ptr_central, // NOTE(review): the EbErrorType return value is discarded here
    2181             :                                  ss_y,
    2182             :                                  is_highbd);
    2183             :         }
    2184             : 
    2185             :     }
    2186         480 :     eb_release_mutex(picture_control_set_ptr_central->temp_filt_mutex);
    2187             : 
    2188             :     // populate source frames picture buffer list
    2189         480 :     EbPictureBufferDesc *list_input_picture_ptr[ALTREF_MAX_NFRAMES] = { NULL };
    2190        3840 :     for (int i = 0; i < (picture_control_set_ptr_central->past_altref_nframes + picture_control_set_ptr_central->future_altref_nframes + 1); i++) // assumes past + future + 1 does not exceed ALTREF_MAX_NFRAMES - TODO confirm against where the nframes values are set
    2191        3360 :         list_input_picture_ptr[i] = list_picture_control_set_ptr[i]->enhanced_picture_ptr;
    2192             : 
    2193             :     uint64_t filtered_sse, filtered_sse_uv; // not initialized here; passed by address to produce_temporally_filtered_pic and read afterwards
    2194             : 
    2195         480 :     produce_temporally_filtered_pic(list_picture_control_set_ptr,
    2196             :                                     list_input_picture_ptr,
    2197         480 :                                     *altref_strength_ptr,
    2198             :                                     index_center,
    2199             :                                     &filtered_sse,
    2200             :                                     &filtered_sse_uv,
    2201             :                                     me_context_ptr,
    2202             :                                     segment_index,
    2203             :                                     is_highbd);
    2204             : 
    2205         480 :     eb_block_on_mutex(picture_control_set_ptr_central->temp_filt_mutex); // the segment counter and SSE accumulators below are updated while holding the mutex
    2206         480 :     picture_control_set_ptr_central->temp_filt_seg_acc++;
    2207             : 
    2208         480 :     if(!is_highbd){
    2209         480 :         picture_control_set_ptr_central->filtered_sse += filtered_sse;
    2210         480 :         picture_control_set_ptr_central->filtered_sse_uv += filtered_sse_uv;
    2211             :     }else{
    2212           0 :         picture_control_set_ptr_central->filtered_sse += filtered_sse >> 4; // highbd SSE is divided by 2^4 before accumulation (presumably to bring 10-bit squared error to 8-bit scale - confirm for other bit depths)
    2213           0 :         picture_control_set_ptr_central->filtered_sse_uv += filtered_sse_uv >> 4;
    2214             :     }
    2215             : 
    2216         480 :     if (picture_control_set_ptr_central->temp_filt_seg_acc == picture_control_set_ptr_central->tf_segments_total_count){ // true only for the call that brings the counter up to the total segment count
    2217             : 
    2218             : #if DEBUG_TF
    2219             :         if(!is_highbd)
    2220             :             save_YUV_to_file("filtered_picture.yuv",
    2221             :                              central_picture_ptr->buffer_y,
    2222             :                              central_picture_ptr->buffer_cb,
    2223             :                              central_picture_ptr->buffer_cr,
    2224             :                              central_picture_ptr->width,
    2225             :                              central_picture_ptr->height,
    2226             :                              central_picture_ptr->stride_y,
    2227             :                              central_picture_ptr->stride_cb,
    2228             :                              central_picture_ptr->stride_cr,
    2229             :                              central_picture_ptr->origin_y,
    2230             :                              central_picture_ptr->origin_x,
    2231             :                              ss_x,
    2232             :                              ss_y);
    2233             :         else
    2234             :             save_YUV_to_file_highbd("filtered_picture.yuv",
    2235             :                                     picture_control_set_ptr_central->altref_buffer_highbd[C_Y],
    2236             :                                     picture_control_set_ptr_central->altref_buffer_highbd[C_U],
    2237             :                                     picture_control_set_ptr_central->altref_buffer_highbd[C_V],
    2238             :                                     central_picture_ptr->width,
    2239             :                                     central_picture_ptr->height,
    2240             :                                     central_picture_ptr->stride_y,
    2241             :                                     central_picture_ptr->stride_cb,
    2242             :                                     central_picture_ptr->stride_cb, // NOTE(review): stride_cb is passed a second time where the 8-bit call above passes stride_cr - confirm
    2243             :                                     central_picture_ptr->origin_y,
    2244             :                                     central_picture_ptr->origin_x,
    2245             :                                     ss_x,
    2246             :                                     ss_y);
    2247             : #endif
    2248             : 
    2249           8 :         if(is_highbd) {
    2250           0 :             unpack_highbd_pic(picture_control_set_ptr_central->altref_buffer_highbd, // counterpart of the pack_highbd_pic call made during prep; the 16 bit buffers are freed right after
    2251             :                               central_picture_ptr,
    2252             :                               ss_x,
    2253             :                               ss_y,
    2254             :                               EB_TRUE);
    2255             : 
    2256           0 :             EB_FREE_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_Y]);
    2257           0 :             EB_FREE_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_U]);
    2258           0 :             EB_FREE_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_V]);
    2259             :         }
    2260             : 
    2261             :         // padding + decimation: even if highbd src, this is only performed on the 8 bit buffer (excluding the LSBs)
    2262           8 :         pad_and_decimate_filtered_pic(picture_control_set_ptr_central);
    2263             : 
    2264             :         // Normalize the filtered SSE. Add 8 bit precision. (sse << 8, divided by width and then by height)
    2265           8 :         picture_control_set_ptr_central->filtered_sse = (picture_control_set_ptr_central->filtered_sse << 8) / central_picture_ptr->width / central_picture_ptr->height;
    2266           8 :         picture_control_set_ptr_central->filtered_sse_uv = ((picture_control_set_ptr_central->filtered_sse_uv << 8) / (central_picture_ptr->width >> ss_x) / (central_picture_ptr->height >> ss_y)) / 2; // subsampled dimensions; the final / 2 presumably averages the two chroma planes - confirm
    2267             : 
    2268             :         // signal that temp filt is done
    2269           8 :         eb_post_semaphore(picture_control_set_ptr_central->temp_filt_done_semaphore);
    2270             :     }
    2271             : 
    2272         480 :     eb_release_mutex(picture_control_set_ptr_central->temp_filt_mutex);
    2273             : 
    2274         480 :     return EB_ErrorNone;
    2275             : 
    2276             : }

Generated by: LCOV version 1.14