LCOV - code coverage report
Current view: top level - Codec - EbRestoration.c (source / functions)
Test: coverage.info
Date: 2019-11-25 17:38:06
Lines: 478 hit / 759 total (63.0 % coverage)
Functions: 29 hit / 40 total (72.5 % coverage)

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  *
      11             :  */
      12             : #include "EbDefinitions.h"
      13             : #include "EbSequenceControlSet.h"
      14             : #include "EbPictureBufferDesc.h"
      15             : #include "EbPictureControlSet.h"
      16             : #include "aom_dsp_rtcd.h"
      17             : #include "EbRestoration.h"
      18             : 
      19             : void av1_upscale_normative_rows(const Av1Common *cm, const uint8_t *src,
      20             :     int src_stride, uint8_t *dst, int dst_stride, int rows, int sub_x, int bd);
      21             : 
      22             : void av1_foreach_rest_unit_in_frame(Av1Common *cm, int32_t plane,
      23             :     RestTileStartVisitor on_tile,
      24             :     RestUnitVisitor on_rest_unit,
      25             :     void *priv);
      26             : 
      27             : void eb_aom_yv12_copy_y_c(const Yv12BufferConfig *src_ybc, Yv12BufferConfig *dst_ybc);
      28             : void eb_aom_yv12_copy_u_c(const Yv12BufferConfig *src_bc, Yv12BufferConfig *dst_bc);
      29             : void eb_aom_yv12_copy_v_c(const Yv12BufferConfig *src_bc, Yv12BufferConfig *dst_bc);
      30             : 
      31             : int32_t eb_aom_realloc_frame_buffer(Yv12BufferConfig *ybf, int32_t width, int32_t height,
      32             :     int32_t ss_x, int32_t ss_y, int32_t use_highbitdepth,
      33             :     int32_t border, int32_t byte_alignment,
      34             :     AomCodecFrameBuffer *fb,
      35             :     aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
      36             : 
      37             : ///---filter.h
      38             : #define MAX_FILTER_TAP 8
      39             : 
      40             : // With CONFIG_DUAL_FILTER, pack two InterpFilter's into a uint32_t: since
      41             : // there are at most 10 filters, we can use 16 bits for each and have more than
      42             : // enough space. This reduces argument passing and unifies the operation of
      43             : // setting a (pair of) filters.
      44             : //
      45             : // Without CONFIG_DUAL_FILTER,
      46             : typedef uint32_t InterpFilters;
      47             : 
      48             : #define LOG_SWITCHABLE_FILTERS \
      49             :   2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
      50             : 
      51             : #define MAX_SUBPEL_TAPS 12
      52             : #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
      53             : #define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
      54             : #define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
      55             : 
      56             : //typedef struct InterpFilterParams {
      57             : //    const int16_t *filter_ptr;
      58             : //    uint16_t taps;
      59             : //    uint16_t subpel_shifts;
      60             : //    InterpFilter interp_filter;
      61             : //} InterpFilterParams;
      62             : 
      63             : InterpFilterParams av1_get_interp_filter_params_with_block_size(
      64             :     const InterpFilter interp_filter, const int32_t w);
      65             : 
      66             : void *eb_aom_memset16(void *dest, int32_t val, size_t length);
      67             : 
      68             : ///---convolve.h
      69             : #define FILTER_BITS 7
      70             : 
      71             : //typedef uint16_t ConvBufType;
      72             : //typedef struct ConvolveParams {
      73             : //    int32_t ref;
      74             : //    int32_t do_average;
      75             : //    ConvBufType *dst;
      76             : //    int32_t dst_stride;
      77             : //    int32_t round_0;
      78             : //    int32_t round_1;
      79             : //    int32_t plane;
      80             : //    int32_t is_compound;
      81             : //    int32_t use_jnt_comp_avg;
      82             : //    int32_t fwd_offset;
      83             : //    int32_t bck_offset;
      84             : //} ConvolveParams;
      85             : 
      86             : #define ROUND0_BITS 3
      87             : #define COMPOUND_ROUND1_BITS 7
      88             : #define WIENER_ROUND0_BITS 3
      89             : 
      90             : #define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0))
      91             : 
      92             : typedef void(*aom_convolve_fn_t)(const uint8_t *src, int32_t src_stride,
      93             :     uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h,
      94             :     InterpFilterParams *filter_params_x,
      95             :     InterpFilterParams *filter_params_y,
      96             :     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
      97             :     ConvolveParams *conv_params);
      98             : 
      99             : typedef void(*aom_highbd_convolve_fn_t)(
     100             :     const uint16_t *src, int32_t src_stride, uint16_t *dst, int32_t dst_stride, int32_t w,
     101             :     int32_t h, InterpFilterParams *filter_params_x,
     102             :     InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
     103             :     const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd);
     104             : 
     105             : struct AV1Common;
     106             : struct scale_factors;
     107             : 
     108       55050 : static INLINE ConvolveParams get_conv_params_wiener(int32_t bd) {
     109             :     ConvolveParams conv_params;
     110             :     (void)bd;
     111       55050 :     conv_params.ref = 0;
     112       55050 :     conv_params.do_average = 0;
     113       55050 :     conv_params.is_compound = 0;
     114       55050 :     conv_params.round_0 = WIENER_ROUND0_BITS;
     115       55050 :     conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
     116       55050 :     const int32_t intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
     117             :     ASSERT(IMPLIES(bd < 12, intbufrange <= 16));
     118       55050 :     if (intbufrange > 16) {
     119           0 :         conv_params.round_0 += intbufrange - 16;
     120           0 :         conv_params.round_1 -= intbufrange - 16;
     121             :     }
     122       55050 :     conv_params.dst = NULL;
     123       55050 :     conv_params.dst_stride = 0;
     124       55050 :     conv_params.plane = 0;
     125             : 
     126             :     // Initialization
     127       55050 :     conv_params.fwd_offset = 0;
     128       55050 :     conv_params.bck_offset = 0;
     129       55050 :     conv_params.use_jnt_comp_avg = 0;
     130             : 
     131       55050 :     return conv_params;
     132             : }
     133             : 
     134             : void eb_av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
     135             :     uint8_t *dst, ptrdiff_t dst_stride,
     136             :     const int16_t *filter_x, int32_t x_step_q4,
     137             :     const int16_t *filter_y, int32_t y_step_q4,
     138             :     int32_t w, int32_t h,
     139             :     const ConvolveParams *conv_params);
     140             : 
     141             : void eb_av1_highbd_wiener_convolve_add_src_c(
     142             :     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     143             :     ptrdiff_t dst_stride, const int16_t *filter_x, int32_t x_step_q4,
     144             :     const int16_t *filter_y, int32_t y_step_q4, int32_t w, int32_t h,
     145             :     const ConvolveParams *conv_params, int32_t bd);
     146             : 
     147             : void *eb_aom_memalign(size_t align, size_t size);
     148             : void eb_aom_free(void *memblk);
     149             : 
     150             : // The 's' values are calculated based on original 'r' and 'e' values in the
     151             : // spec using GenSgrprojVtable().
     152             : // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
     153             : const SgrParamsType eb_sgr_params[SGRPROJ_PARAMS] = {
     154             :   { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
     155             :   { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
     156             :   { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
     157             :   { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
     158             :   { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
     159             :   { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
     160             :   { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
     161             :   { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
     162             : };
     163             : 
     164        6846 : AV1PixelRect whole_frame_rect(FrameSize *frm_size, int32_t sub_x,
     165             :     int32_t sub_y, int32_t is_uv)
     166             : {
     167             :     AV1PixelRect rect;
     168             : 
     169        6846 :     int32_t ss_x = is_uv && sub_x;
     170        6846 :     int32_t ss_y = is_uv && sub_y;
     171             : 
     172        6846 :     rect.top = 0;
     173        6846 :     rect.bottom = ROUND_POWER_OF_TWO(frm_size->frame_height, ss_y);
     174        6846 :     rect.left = 0;
     175        6846 :     rect.right = ROUND_POWER_OF_TWO(frm_size->superres_upscaled_width, ss_x);
     176        6846 :     return rect;
     177             : }
     178             : 
     179             : // Count horizontal or vertical units per tile (use a width or height for
     180             : // tile_size, respectively). We basically want to divide the tile size by the
     181             : // size of a restoration unit. Rather than rounding up unconditionally as you
     182             : // might expect, we round to nearest, which models the way a right or bottom
     183             : // restoration unit can extend to up to 150% its normal width or height. The
     184             : // max with 1 is to deal with tiles that are smaller than half of a restoration
     185             : // unit.
     186        9504 : static int32_t count_units_in_tile(int32_t unit_size, int32_t tile_size) {
     187        9504 :     return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1);
     188             : }
     189             : 
     190         432 : EbErrorType eb_av1_alloc_restoration_struct(struct Av1Common *cm, RestorationInfo *rsi,
     191             :     int32_t is_uv) {
     192             :     // We need to allocate enough space for restoration units to cover the
     193             :     // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the
     194             :     // top-left and we can use av1_get_tile_rect(). With CONFIG_MAX_TILE, we have
     195             :     // to do the computation ourselves, iterating over the tiles and keeping
     196             :     // track of the largest width and height, then upscaling.
     197         432 :     const AV1PixelRect tile_rect = whole_frame_rect(&cm->frm_size,
     198             :         cm->subsampling_x, cm->subsampling_y, is_uv);
     199         432 :     const int32_t max_tile_w = tile_rect.right - tile_rect.left;
     200         432 :     const int32_t max_tile_h = tile_rect.bottom - tile_rect.top;
     201             : 
     202             :     // To calculate hpertile and vpertile (horizontal and vertical units per
     203             :     // tile), we basically want to divide the largest tile width or height by the
     204             :     // size of a restoration unit. Rather than rounding up unconditionally as you
     205             :     // might expect, we round to nearest, which models the way a right or bottom
     206             :     // restoration unit can extend to up to 150% its normal width or height. The
     207             :     // max with 1 is to deal with tiles that are smaller than half of a
     208             :     // restoration unit.
     209         432 :     const int32_t unit_size = rsi->restoration_unit_size;
     210         432 :     const int32_t hpertile = count_units_in_tile(unit_size, max_tile_w);   //FB of size < 1/2 unit_size are included in neigh FB making them bigger!!
     211         432 :     const int32_t vpertile = count_units_in_tile(unit_size, max_tile_h);
     212             : 
     213         432 :     rsi->units_per_tile = hpertile * vpertile;//pic_tot_FB
     214         432 :     rsi->horz_units_per_tile = hpertile;      //pic_width_in_FB
     215         432 :     rsi->vert_units_per_tile = vpertile;      //pic_height_in_FB
     216             : 
     217         432 :     const int32_t ntiles = 1;
     218         432 :     const int32_t nunits = ntiles * rsi->units_per_tile;
     219             : 
     220         432 :     EB_MALLOC_ARRAY(rsi->unit_info, nunits);
     221             : 
     222         432 :         return EB_ErrorNone;
     223             : }
     224             : 
     225         432 : static void extend_frame_lowbd(uint8_t *data, int32_t width, int32_t height, int32_t stride,
     226             :     int32_t border_horz, int32_t border_vert) {
     227             :     uint8_t *data_p;
     228             :     int32_t i;
     229      110592 :     for (i = 0; i < height; ++i) {
     230      110160 :         data_p = data + i * stride;
     231      110160 :         memset(data_p - border_horz, data_p[0], border_horz);
     232      110160 :         memset(data_p + width, data_p[width - 1], border_horz);
     233             :     }
     234         432 :     data_p = data - border_horz;
     235        1728 :     for (i = -border_vert; i < 0; ++i)
     236        1296 :         memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
     237        1728 :     for (i = height; i < height + border_vert; ++i) {
     238        1296 :         memcpy(data_p + i * stride, data_p + (height - 1) * stride,
     239        1296 :             width + 2 * border_horz);
     240             :     }
     241         432 : }
     242             : 
     243           0 : static void extend_frame_highbd(uint16_t *data, int32_t width, int32_t height,
     244             :     int32_t stride, int32_t border_horz, int32_t border_vert) {
     245             :     uint16_t *data_p;
     246             :     int32_t i, j;
     247           0 :     for (i = 0; i < height; ++i) {
     248           0 :         data_p = data + i * stride;
     249           0 :         for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
     250           0 :         for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
     251             :     }
     252           0 :     data_p = data - border_horz;
     253           0 :     for (i = -border_vert; i < 0; ++i) {
     254           0 :         memcpy(data_p + i * stride, data_p,
     255           0 :             (width + 2 * border_horz) * sizeof(uint16_t));
     256             :     }
     257           0 :     for (i = height; i < height + border_vert; ++i) {
     258           0 :         memcpy(data_p + i * stride, data_p + (height - 1) * stride,
     259           0 :             (width + 2 * border_horz) * sizeof(uint16_t));
     260             :     }
     261           0 : }
     262             : 
     263         432 : void eb_extend_frame(uint8_t *data, int32_t width, int32_t height, int32_t stride,
     264             :     int32_t border_horz, int32_t border_vert, int32_t highbd) {
     265         432 :     if (highbd)
     266           0 :         extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
     267             :             border_horz, border_vert);
     268             :     else
     269         432 :         extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
     270         432 : }
     271             : 
     272         116 : static void copy_tile_lowbd(int32_t width, int32_t height, const uint8_t *src,
     273             :     int32_t src_stride, uint8_t *dst, int32_t dst_stride) {
     274       41876 :     for (int32_t i = 0; i < height; ++i)
     275       41760 :         memcpy(dst + i * dst_stride, src + i * src_stride, width);
     276         116 : }
     277             : 
     278           0 : static void copy_tile_highbd(int32_t width, int32_t height, const uint16_t *src,
     279             :     int32_t src_stride, uint16_t *dst, int32_t dst_stride) {
     280           0 :     for (int32_t i = 0; i < height; ++i)
     281           0 :         memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
     282           0 : }
     283             : 
     284         116 : static void copy_tile(int32_t width, int32_t height, const uint8_t *src, int32_t src_stride,
     285             :     uint8_t *dst, int32_t dst_stride, int32_t highbd) {
     286         116 :     if (highbd)
     287           0 :         copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
     288           0 :             CONVERT_TO_SHORTPTR(dst), dst_stride);
     289             :     else
     290         116 :         copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
     291         116 : }
     292             : 
     293             : // With striped loop restoration, the filtering for each 64-pixel stripe gets
     294             : // most of its input from the output of CDEF (stored in data8), but we need to
     295             : // fill out a border of 3 pixels above/below the stripe according to the
     296             : // following
     297             : // rules:
     298             : //
     299             : // * At a frame boundary, we copy the outermost row of CDEF pixels three times.
     300             : //   This extension is done by a call to eb_extend_frame() at the start of the loop
     301             : //   restoration process, so the value of copy_above/copy_below doesn't strictly
     302             : //   matter.
     303             : //   However, by setting *copy_above = *copy_below = 1 whenever loop filtering
     304             : //   across tiles is disabled, we can allow
     305             : //   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
     306             : //   data has always been copied, simplifying the behaviour at the left and
     307             : //   right edges of tiles.
     308             : //
     309             : // * If we're at a tile boundary and loop filtering across tiles is enabled,
     310             : //   then there is a logical stripe which is 64 pixels high, but which is split
     311             : //   into an 8px high and a 56px high stripe so that the processing (and
     312             : //   coefficient set usage) can be aligned to tiles.
     313             : //   In this case, we use the 3 rows of CDEF output across the boundary for
     314             : //   context; this corresponds to leaving the frame buffer as-is.
     315             : //
     316             : // * If we're at a tile boundary and loop filtering across tiles is disabled,
     317             : //   then we take the outermost row of CDEF pixels *within the current tile*
     318             : //   and copy it three times. Thus we behave exactly as if the tile were a full
     319             : //   frame.
     320             : //
     321             : // * Otherwise, we're at a stripe boundary within a tile. In that case, we
     322             : //   take 2 rows of deblocked pixels and extend them to 3 rows of context.
     323             : //
     324             : // The distinction between the latter two cases is handled by the
     325             : // eb_av1_loop_restoration_save_boundary_lines() function, so here we just need
     326             : // to decide if we're overwriting the above/below boundary pixels or not.
     327       57678 : static void get_stripe_boundary_info(const RestorationTileLimits *limits,
     328             :     const AV1PixelRect *tile_rect, int32_t ss_y,
     329             :     int32_t *copy_above, int32_t *copy_below) {
     330       57678 :     *copy_above = 1;
     331       57678 :     *copy_below = 1;
     332             : 
     333       57678 :     const int32_t full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
     334       57678 :     const int32_t runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
     335             : 
     336       57678 :     const int32_t first_stripe_in_tile = (limits->v_start == tile_rect->top);
     337       57678 :     const int32_t this_stripe_height =
     338       57678 :         full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
     339       57678 :     const int32_t last_stripe_in_tile =
     340       57678 :         (limits->v_start + this_stripe_height >= tile_rect->bottom);
     341             : 
     342       57678 :     if (first_stripe_in_tile) *copy_above = 0;
     343       57678 :     if (last_stripe_in_tile) *copy_below = 0;
     344       57678 : }
     345             : 
     346             : // Overwrite the border pixels around a processing stripe so that the conditions
     347             : // listed above get_stripe_boundary_info() are preserved.
     348             : // We save the pixels which get overwritten into a temporary buffer, so that
     349             : // they can be restored by restore_processing_stripe_boundary() after we've
     350             : // processed the stripe.
     351             : //
     352             : // limits gives the rectangular limits of the remaining stripes for the current
     353             : // restoration unit. rsb is the stored stripe boundaries (taken from either
     354             : // deblock or CDEF output as necessary).
     355             : //
     356             : // tile_rect is the limits of the current tile and tile_stripe0 is the index of
     357             : // the first stripe in this tile (needed to convert the tile-relative stripe
     358             : // index we get from limits into something we can look up in rsb).
     359       57678 : static void setup_processing_stripe_boundary(
     360             :     const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
     361             :     int32_t rsb_row, int32_t use_highbd, int32_t h, uint8_t *data8, int32_t data_stride,
     362             :     RestorationLineBuffers *rlbs, int32_t copy_above, int32_t copy_below, int32_t opt) {
     363             :     // Offsets within the line buffers. The buffer logically starts at column
     364             :     // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
     365             :     // has column x0 in the buffer.
     366       57678 :     const int32_t buf_stride = rsb->stripe_boundary_stride;
     367       57678 :     const int32_t buf_x0_off = limits->h_start;
     368       57678 :     const int32_t line_width =
     369       57678 :         (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
     370       57678 :     const int32_t line_size = line_width << use_highbd;
     371             : 
     372       57678 :     const int32_t data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
     373             : 
     374             :     // Replace RESTORATION_BORDER pixels above the top of the stripe
     375             :     // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
     376             :     // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
     377             :     // duplicating the topmost of the 2 lines (see the AOMMAX call when
     378             :     // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
     379             :     //
     380             :     // Special case: If we're at the top of a tile, which isn't on the topmost
     381             :     // tile row, and we're allowed to loop filter across tiles, then we have a
     382             :     // logical 64-pixel-high stripe which has been split into an 8-pixel high
     383             :     // stripe and a 56-pixel high stripe (the current one). So, in this case,
     384             :     // we want to leave the boundary alone!
     385       57678 :     if (!opt) {
     386       57642 :         if (copy_above) {
     387       48035 :             uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
     388             : 
     389      192138 :             for (int32_t i = -RESTORATION_BORDER; i < 0; ++i) {
     390      144103 :                 const int32_t buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
     391      144103 :                 const int32_t buf_off = buf_x0_off + buf_row * buf_stride;
     392      144103 :                 const uint8_t *buf =
     393      144103 :                     rsb->stripe_boundary_above + (buf_off << use_highbd);
     394      144103 :                 uint8_t *dst8 = data8_tl + i * data_stride;
     395             :                 // Save old pixels, then replace with data from stripe_boundary_above
     396      144103 :                 memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
     397           0 :                     REAL_PTR(use_highbd, dst8), line_size);
     398      144103 :                 memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
     399             :             }
     400             :         }
     401             : 
     402             :         // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
     403             :         // The second buffer row is repeated, so src_row gets the values 0, 1, 1
     404             :         // for i = 0, 1, 2.
     405       57642 :         if (copy_below) {
     406       48035 :             const int32_t stripe_end = limits->v_start + h;
     407       48035 :             uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
     408             : 
     409      192139 :             for (int32_t i = 0; i < RESTORATION_BORDER; ++i) {
     410      144104 :                 const int32_t buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
     411      144104 :                 const int32_t buf_off = buf_x0_off + buf_row * buf_stride;
     412      144104 :                 const uint8_t *src =
     413      144104 :                     rsb->stripe_boundary_below + (buf_off << use_highbd);
     414             : 
     415      144104 :                 uint8_t *dst8 = data8_bl + i * data_stride;
     416             :                 // Save old pixels, then replace with data from stripe_boundary_below
     417      144104 :                 memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
     418      144104 :                 memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
     419             :             }
     420             :         }
     421             :     }
     422             :     else {
     423          36 :         if (copy_above) {
     424          30 :             uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
     425             : 
     426             :             // Only save and overwrite i=-RESTORATION_BORDER line.
     427          30 :             uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
     428             :             // Save old pixels, then replace with data from stripe_boundary_above
     429          30 :             memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
     430          60 :             memcpy(REAL_PTR(use_highbd, dst8),
     431          30 :                 REAL_PTR(use_highbd,
     432             :                     data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
     433             :                 line_size);
     434             :         }
     435             : 
     436          36 :         if (copy_below) {
     437          30 :             const int32_t stripe_end = limits->v_start + h;
     438          30 :             uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
     439             : 
     440             :             // Only save and overwrite i=2 line.
     441          30 :             uint8_t *dst8 = data8_bl + 2 * data_stride;
     442             :             // Save old pixels, then replace with data from stripe_boundary_below
     443          30 :             memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
     444          60 :             memcpy(REAL_PTR(use_highbd, dst8),
     445          30 :                 REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
     446             :         }
     447             :     }
     448       57678 : }
     449             : 
     450             : // This function restores the boundary lines modified by
     451             : // setup_processing_stripe_boundary.
     452             : //
     453             : // Note: We need to be careful when handling the corners of the processing
     454             : // unit, because (e.g.) the top-left corner is considered to be part of
     455             : // both the left and top borders. This means that, depending on the
     456             : // loop_filter_across_tiles_enabled flag, the corner pixels might get
     457             : // overwritten twice, once as part of the "top" border and once as part
     458             : // of the "left" border (or similar for other corners).
     459             : //
     460             : // Everything works out fine as long as we make sure to reverse the order
     461             : // when restoring, ie. we need to restore the left/right borders followed
     462             : // by the top/bottom borders.
     463       57677 : static void restore_processing_stripe_boundary(
     464             :     const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
     465             :     int32_t use_highbd, int32_t h, uint8_t *data8, int32_t data_stride, int32_t copy_above,
     466             :     int32_t copy_below, int32_t opt) {
     467       57677 :     const int32_t line_width =
     468       57677 :         (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
     469       57677 :     const int32_t line_size = line_width << use_highbd;
     470             : 
     471       57677 :     const int32_t data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
     472             : 
     473       57677 :     if (!opt) {
     474       57641 :         if (copy_above) {
     475       48034 :             uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
     476      192136 :             for (int32_t i = -RESTORATION_BORDER; i < 0; ++i) {
     477      144102 :                 uint8_t *dst8 = data8_tl + i * data_stride;
     478      144102 :                 memcpy(REAL_PTR(use_highbd, dst8),
     479      144102 :                     rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
     480             :             }
     481             :         }
     482             : 
     483       57641 :         if (copy_below) {
     484       48035 :             const int32_t stripe_bottom = limits->v_start + h;
     485       48035 :             uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
     486             : 
     487      192139 :             for (int32_t i = 0; i < RESTORATION_BORDER; ++i) {
     488      144104 :                 if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
     489             : 
     490      144104 :                 uint8_t *dst8 = data8_bl + i * data_stride;
     491      144104 :                 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
     492             :             }
     493             :         }
     494             :     }
     495             :     else {
     496          36 :         if (copy_above) {
     497          30 :             uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
     498             : 
     499             :             // Only restore i=-RESTORATION_BORDER line.
     500          30 :             uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
     501          30 :             memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
     502             :         }
     503             : 
     504          36 :         if (copy_below) {
     505          30 :             const int32_t stripe_bottom = limits->v_start + h;
     506          30 :             uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
     507             : 
     508             :             // Only restore i=2 line.
     509          30 :             if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
     510          30 :                 uint8_t *dst8 = data8_bl + 2 * data_stride;
     511          30 :                 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
     512             :             }
     513             :         }
     514             :     }
     515       57677 : }
     516             : 
     517       55050 : static void wiener_filter_stripe(const RestorationUnitInfo *rui,
     518             :     int32_t stripe_width, int32_t stripe_height,
     519             :     int32_t procunit_width, const uint8_t *src,
     520             :     int32_t src_stride, uint8_t *dst, int32_t dst_stride,
     521             :     int32_t *tmpbuf, int32_t bit_depth) {
     522             :     (void)tmpbuf;
     523             :     (void)bit_depth;
     524             :     assert(bit_depth == 8);
     525       55050 :     const ConvolveParams conv_params = get_conv_params_wiener(8);
     526             : 
     527      353090 :     for (int32_t j = 0; j < stripe_width; j += procunit_width) {
     528      298044 :         int32_t w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
     529      298044 :         const uint8_t *src_p = src + j;
     530      298044 :         uint8_t *dst_p = dst + j;//CHKN  SSE
     531      298044 :         eb_av1_wiener_convolve_add_src(
     532      298044 :             src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
     533      298044 :             rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
     534             :     }
     535       55046 : }
     536             : 
     537             : /* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
     538             :    over the input. The window is of size (2r + 1)x(2r + 1), and we
     539             :    specialize to r = 1, 2 (boxsum1, boxsum2). boxsum() asserts on any other r.
     540             : 
     541             :    Each loop follows the same format: We keep a window's worth of input
     542             :    in individual variables and select data out of that as appropriate.
     543             : */
     544           0 : static void boxsum1(int32_t *src, int32_t width, int32_t height, int32_t src_stride,
     545             :     int32_t sqr, int32_t *dst, int32_t dst_stride) {
     546             :     int32_t i, j, a, b, c;
     547             :     assert(width > 2 * SGRPROJ_BORDER_HORZ);
     548             :     assert(height > 2 * SGRPROJ_BORDER_VERT);
     549             : 
     550             :     // Vertical sum over 3-pixel regions, from src into dst.
     551           0 :     if (!sqr) {
     552           0 :         for (j = 0; j < width; ++j) {
     553           0 :             a = src[j];
     554           0 :             b = src[src_stride + j];
     555           0 :             c = src[2 * src_stride + j];
     556             : 
     557           0 :             dst[j] = a + b;
     558           0 :             for (i = 1; i < height - 2; ++i) {
     559             :                 // Loop invariant: At the start of each iteration,
     560             :                 // a = src[(i - 1) * src_stride + j]
     561             :                 // b = src[(i    ) * src_stride + j]
     562             :                 // c = src[(i + 1) * src_stride + j]
     563           0 :                 dst[i * dst_stride + j] = a + b + c;
     564           0 :                 a = b;
     565           0 :                 b = c;
     566           0 :                 c = src[(i + 2) * src_stride + j];
     567             :             }
     568           0 :             dst[i * dst_stride + j] = a + b + c;
     569           0 :             dst[(i + 1) * dst_stride + j] = b + c;
     570             :         }
     571             :     }
     572             :     else {
     573           0 :         for (j = 0; j < width; ++j) {
     574           0 :             a = src[j] * src[j];
     575           0 :             b = src[src_stride + j] * src[src_stride + j];
     576           0 :             c = src[2 * src_stride + j] * src[2 * src_stride + j];
     577             : 
     578           0 :             dst[j] = a + b;
     579           0 :             for (i = 1; i < height - 2; ++i) {
     580           0 :                 dst[i * dst_stride + j] = a + b + c;
     581           0 :                 a = b;
     582           0 :                 b = c;
     583           0 :                 c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
     584             :             }
     585           0 :             dst[i * dst_stride + j] = a + b + c;
     586           0 :             dst[(i + 1) * dst_stride + j] = b + c;
     587             :         }
     588             :     }
     589             : 
     590             :     // Horizontal sum over 3-pixel regions of dst
     591           0 :     for (i = 0; i < height; ++i) {
     592           0 :         a = dst[i * dst_stride];
     593           0 :         b = dst[i * dst_stride + 1];
     594           0 :         c = dst[i * dst_stride + 2];
     595             : 
     596           0 :         dst[i * dst_stride] = a + b;
     597           0 :         for (j = 1; j < width - 2; ++j) {
     598             :             // Loop invariant: At the start of each iteration (pre-overwrite dst),
     599             :             // a = dst[i * dst_stride + (j - 1)]
     600             :             // b = dst[i * dst_stride + (j    )]
     601             :             // c = dst[i * dst_stride + (j + 1)]
     602           0 :             dst[i * dst_stride + j] = a + b + c;
     603           0 :             a = b;
     604           0 :             b = c;
     605           0 :             c = dst[i * dst_stride + (j + 2)];
     606             :         }
     607           0 :         dst[i * dst_stride + j] = a + b + c;
     608           0 :         dst[i * dst_stride + (j + 1)] = b + c;
     609             :     }
     610           0 : }
     611             : 
     612           0 : static void boxsum2(int32_t *src, int32_t width, int32_t height, int32_t src_stride,
     613             :     int32_t sqr, int32_t *dst, int32_t dst_stride) {
     614             :     int32_t i, j, a, b, c, d, e;
     615             :     assert(width > 2 * SGRPROJ_BORDER_HORZ);
     616             :     assert(height > 2 * SGRPROJ_BORDER_VERT);
     617             : 
     618             :     // Vertical sum over 5-pixel regions, from src into dst.
     619           0 :     if (!sqr) {
     620           0 :         for (j = 0; j < width; ++j) {
     621           0 :             a = src[j];
     622           0 :             b = src[src_stride + j];
     623           0 :             c = src[2 * src_stride + j];
     624           0 :             d = src[3 * src_stride + j];
     625           0 :             e = src[4 * src_stride + j];
     626             : 
     627           0 :             dst[j] = a + b + c;
     628           0 :             dst[dst_stride + j] = a + b + c + d;
     629           0 :             for (i = 2; i < height - 3; ++i) {
     630             :                 // Loop invariant: At the start of each iteration,
     631             :                 // a = src[(i - 2) * src_stride + j]
     632             :                 // b = src[(i - 1) * src_stride + j]
     633             :                 // c = src[(i    ) * src_stride + j]
     634             :                 // d = src[(i + 1) * src_stride + j]
     635             :                 // e = src[(i + 2) * src_stride + j]
     636           0 :                 dst[i * dst_stride + j] = a + b + c + d + e;
     637           0 :                 a = b;
     638           0 :                 b = c;
     639           0 :                 c = d;
     640           0 :                 d = e;
     641           0 :                 e = src[(i + 3) * src_stride + j];
     642             :             }
     643           0 :             dst[i * dst_stride + j] = a + b + c + d + e;
     644           0 :             dst[(i + 1) * dst_stride + j] = b + c + d + e;
     645           0 :             dst[(i + 2) * dst_stride + j] = c + d + e;
     646             :         }
     647             :     }
     648             :     else {
     649           0 :         for (j = 0; j < width; ++j) {
     650           0 :             a = src[j] * src[j];
     651           0 :             b = src[src_stride + j] * src[src_stride + j];
     652           0 :             c = src[2 * src_stride + j] * src[2 * src_stride + j];
     653           0 :             d = src[3 * src_stride + j] * src[3 * src_stride + j];
     654           0 :             e = src[4 * src_stride + j] * src[4 * src_stride + j];
     655             : 
     656           0 :             dst[j] = a + b + c;
     657           0 :             dst[dst_stride + j] = a + b + c + d;
     658           0 :             for (i = 2; i < height - 3; ++i) {
     659           0 :                 dst[i * dst_stride + j] = a + b + c + d + e;
     660           0 :                 a = b;
     661           0 :                 b = c;
     662           0 :                 c = d;
     663           0 :                 d = e;
     664           0 :                 e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
     665             :             }
     666           0 :             dst[i * dst_stride + j] = a + b + c + d + e;
     667           0 :             dst[(i + 1) * dst_stride + j] = b + c + d + e;
     668           0 :             dst[(i + 2) * dst_stride + j] = c + d + e;
     669             :         }
     670             :     }
     671             : 
     672             :     // Horizontal sum over 5-pixel regions of dst
     673           0 :     for (i = 0; i < height; ++i) {
     674           0 :         a = dst[i * dst_stride];
     675           0 :         b = dst[i * dst_stride + 1];
     676           0 :         c = dst[i * dst_stride + 2];
     677           0 :         d = dst[i * dst_stride + 3];
     678           0 :         e = dst[i * dst_stride + 4];
     679             : 
     680           0 :         dst[i * dst_stride] = a + b + c;
     681           0 :         dst[i * dst_stride + 1] = a + b + c + d;
     682           0 :         for (j = 2; j < width - 3; ++j) {
     683             :             // Loop invariant: At the start of each iteration (pre-overwrite dst),
     684             :             // a = dst[i * dst_stride + (j - 2)]
     685             :             // b = dst[i * dst_stride + (j - 1)]
     686             :             // c = dst[i * dst_stride + (j    )]
     687             :             // d = dst[i * dst_stride + (j + 1)]
     688             :             // e = dst[i * dst_stride + (j + 2)]
     689           0 :             dst[i * dst_stride + j] = a + b + c + d + e;
     690           0 :             a = b;
     691           0 :             b = c;
     692           0 :             c = d;
     693           0 :             d = e;
     694           0 :             e = dst[i * dst_stride + (j + 3)];
     695             :         }
     696           0 :         dst[i * dst_stride + j] = a + b + c + d + e;
     697           0 :         dst[i * dst_stride + (j + 1)] = b + c + d + e;
     698           0 :         dst[i * dst_stride + (j + 2)] = c + d + e;
     699             :     }
     700           0 : }
     701             : 
     702           0 : static void boxsum(int32_t *src, int32_t width, int32_t height, int32_t src_stride, int32_t r,
     703             :     int32_t sqr, int32_t *dst, int32_t dst_stride) {
     704           0 :     if (r == 1)
     705           0 :         boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
     706           0 :     else if (r == 2)
     707           0 :         boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
     708             :     else
     709             :         assert(0 && "Invalid value of r in self-guided filter");
     710           0 : }
     711             : 
     712       50749 : void eb_decode_xq(const int32_t *xqd, int32_t *xq, const SgrParamsType *params) {
     713       50749 :     if (params->r[0] == 0) {
     714        5528 :         xq[0] = 0;
     715        5528 :         xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
     716             :     }
     717       45221 :     else if (params->r[1] == 0) {
     718       10061 :         xq[0] = xqd[0];
     719       10061 :         xq[1] = 0;
     720             :     }
     721             :     else {
     722       35160 :         xq[0] = xqd[0];
     723       35160 :         xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
     724             :     }
     725       50749 : }
     726             : 
     727             : const int32_t eb_x_by_xplus1[256] = {
     728             :     // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
     729             :     // instead of 0. See comments in selfguided_restoration_internal() for why
     730             :     1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
     731             :     240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
     732             :     248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
     733             :     250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
     734             :     252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
     735             :     253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
     736             :     253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
     737             :     254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
     738             :     254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
     739             :     254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
     740             :     254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
     741             :     254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     742             :     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     743             :     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     744             :     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     745             :     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     746             :     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     747             :     256,
     748             : };
     749             : 
     750             : const int32_t eb_one_by_x[MAX_NELEM] = {
     751             :   4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
     752             :   293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
     753             : };
     754             : 
     755           0 : static void selfguided_restoration_fast_internal(
     756             :     int32_t *dgd, int32_t width, int32_t height, int32_t dgd_stride, int32_t *dst,
     757             :     int32_t dst_stride, int32_t bit_depth, int32_t sgr_params_idx, int32_t radius_idx)
     758             : {
     759           0 :     const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
     760           0 :     const int32_t r = params->r[radius_idx];
     761           0 :     const int32_t width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
     762           0 :     const int32_t height_ext = height + 2 * SGRPROJ_BORDER_VERT;
     763             :     // Adjusting the stride of A and B here appears to avoid bad cache effects,
     764             :     // leading to a significant speed improvement.
     765             :     // We also align the stride to a multiple of 16 bytes, for consistency
     766             :     // with the SIMD version of this function.
     767           0 :     int32_t buf_stride = ((width_ext + 3) & ~3) + 16;
     768             :     int32_t A_[RESTORATION_PROC_UNIT_PELS];
     769             :     int32_t B_[RESTORATION_PROC_UNIT_PELS];
     770           0 :     int32_t *A = A_;
     771           0 :     int32_t *B = B_;
     772             :     int32_t i, j;
     773             : 
     774             :     assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
     775             :     assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
     776             :         "Need SGRPROJ_BORDER_* >= r+1");
     777             : 
     778           0 :     boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
     779             :         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
     780           0 :     boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
     781             :         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
     782           0 :     A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
     783           0 :     B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
     784             :     // Calculate the eventual A[] and B[] arrays on every other row only (i = -1,
     785             :     // 1, 3, ...), covering columns -1 .. width (a 1-pixel horizontal border).
     786           0 :     for (i = -1; i < height + 1; i += 2) {
     787           0 :         for (j = -1; j < width + 1; ++j) {
     788           0 :             const int32_t k = i * buf_stride + j;
     789           0 :             const int32_t n = (2 * r + 1) * (2 * r + 1);
     790             : 
     791             :             // a < 2^16 * n < 2^22 regardless of bit depth
     792           0 :             uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
     793             :             // b < 2^8 * n < 2^14 regardless of bit depth
     794           0 :             uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
     795             : 
     796             :             // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
     797             :             // and p itself satisfies p < 2^14 * n^2 < 2^26.
     798             :             // This bound on p is due to:
     799             :             // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
     800             :             //
     801             :             // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
     802             :             // This is an artefact of rounding, and can only happen if all pixels
     803             :             // are (almost) identical, so in this case we saturate to p=0.
     804           0 :             uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
     805             : 
     806           0 :             const uint32_t s = params->s[radius_idx];
     807             : 
     808             :             // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
     809             :             // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
     810             :             // (this holds even after accounting for the rounding in s)
     811           0 :             const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
     812             : 
     813             :             // Note: We have to be quite careful about the value of A[k].
     814             :             // This is used as a blend factor between individual pixel values and the
     815             :             // local mean. So it logically has a range of [0, 256], including both
     816             :             // endpoints.
     817             :             //
     818             :             // This is a pain for hardware, as we'd like something which can be stored
     819             :             // in exactly 8 bits.
     820             :             // Further, in the calculation of B[k] below, if z == 0 and r == 2,
     821             :             // then A[k] "should be" 0. But then we can end up setting B[k] to a value
     822             :             // slightly above 2^(8 + bit depth), due to rounding in the value of
     823             :             // eb_one_by_x[25-1].
     824             :             //
     825             :             // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
     826             :             // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
     827             :             // overflow), without significantly affecting the final result: z == 0
     828             :             // implies that the image is essentially "flat", so the local mean and
     829             :             // individual pixel values are very similar.
     830             :             //
     831             :             // Note that saturating on the other side, i.e. requiring A[k] <= 255,
     832             :             // would be a bad idea, as that corresponds to the case where the image
     833             :             // is very variable, when we want to preserve the local pixel value as
     834             :             // much as possible.
     835           0 :             A[k] = eb_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
     836             : 
     837             :             // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
     838             :             // eb_one_by_x[n - 1] = round(2^12 / n)
     839             :             // => the product here is < 2^(20 + bit_depth) <= 2^32,
     840             :             // and B[k] is set to a value < 2^(8 + bit depth)
     841             :             // This holds even with the rounding in eb_one_by_x and in the overall
     842             :             // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
     843           0 :             B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
     844             :                 (uint32_t)B[k] *
     845             :                 (uint32_t)eb_one_by_x[n - 1],
     846             :                 SGRPROJ_RECIP_BITS);
     847             :         }
     848             :     }
     849             :     // Use the A[] and B[] arrays to calculate the filtered image
     850             :     assert(r == 2);
     851           0 :     for (i = 0; i < height; ++i) {
     852           0 :         if (!(i & 1)) {  // even row
     853           0 :             for (j = 0; j < width; ++j) {
     854           0 :                 const int32_t k = i * buf_stride + j;
     855           0 :                 const int32_t l = i * dgd_stride + j;
     856           0 :                 const int32_t m = i * dst_stride + j;
     857           0 :                 const int32_t nb = 5;
     858           0 :                 const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
     859           0 :                     (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
     860           0 :                         A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
     861             :                     5;
     862           0 :                 const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
     863           0 :                     (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
     864           0 :                         B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
     865             :                     5;
     866           0 :                 const int32_t v = a * dgd[l] + b;
     867           0 :                 dst[m] =
     868           0 :                     ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
     869             :             }
     870             :         }
     871             :         else {  // odd row
     872           0 :             for (j = 0; j < width; ++j) {
     873           0 :                 const int32_t k = i * buf_stride + j;
     874           0 :                 const int32_t l = i * dgd_stride + j;
     875           0 :                 const int32_t m = i * dst_stride + j;
     876           0 :                 const int32_t nb = 4;
     877           0 :                 const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
     878           0 :                 const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
     879           0 :                 const int32_t v = a * dgd[l] + b;
     880           0 :                 dst[m] =
     881           0 :                     ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
     882             :             }
     883             :         }
     884             :     }
     885           0 : }
     886             : 
     887           0 : static void selfguided_restoration_internal(int32_t *dgd, int32_t width, int32_t height,
     888             :     int32_t dgd_stride, int32_t *dst,
     889             :     int32_t dst_stride, int32_t bit_depth,
     890             :     int32_t sgr_params_idx,
     891             :     int32_t radius_idx) {
     892           0 :     const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
     893           0 :     const int32_t r = params->r[radius_idx];
     894           0 :     const int32_t width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
     895           0 :     const int32_t height_ext = height + 2 * SGRPROJ_BORDER_VERT;
     896             :     // Adjusting the stride of A and B here appears to avoid bad cache effects,
     897             :     // leading to a significant speed improvement.
     898             :     // We also align the stride to a multiple of 16 bytes, for consistency
     899             :     // with the SIMD version of this function.
     900           0 :     int32_t buf_stride = ((width_ext + 3) & ~3) + 16;
     901             :     int32_t A_[RESTORATION_PROC_UNIT_PELS];
     902             :     int32_t B_[RESTORATION_PROC_UNIT_PELS];
     903           0 :     int32_t *A = A_;
     904           0 :     int32_t *B = B_;
     905             :     int32_t i, j;
     906             : 
     907             :     assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
     908             :     assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
     909             :         "Need SGRPROJ_BORDER_* >= r+1");
     910             : 
     911           0 :     boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
     912             :         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
     913           0 :     boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
     914             :         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
     915           0 :     A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
     916           0 :     B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
     917             :     // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
     918             :     // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
     919           0 :     for (i = -1; i < height + 1; ++i) {
     920           0 :         for (j = -1; j < width + 1; ++j) {
     921           0 :             const int32_t k = i * buf_stride + j;
     922           0 :             const int32_t n = (2 * r + 1) * (2 * r + 1);
     923             : 
     924             :             // a < 2^16 * n < 2^22 regardless of bit depth
     925           0 :             uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
     926             :             // b < 2^8 * n < 2^14 regardless of bit depth
     927           0 :             uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
     928             : 
     929             :             // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
     930             :             // and p itself satisfies p < 2^14 * n^2 < 2^26.
     931             :             // This bound on p is due to:
     932             :             // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
     933             :             //
     934             :             // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
     935             :             // This is an artefact of rounding, and can only happen if all pixels
     936             :             // are (almost) identical, so in this case we saturate to p=0.
     937           0 :             uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
     938             : 
     939           0 :             const uint32_t s = params->s[radius_idx];
     940             : 
     941             :             // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
     942             :             // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
     943             :             // (this holds even after accounting for the rounding in s)
     944           0 :             const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
     945             : 
     946             :             // Note: We have to be quite careful about the value of A[k].
     947             :             // This is used as a blend factor between individual pixel values and the
     948             :             // local mean. So it logically has a range of [0, 256], including both
     949             :             // endpoints.
     950             :             //
     951             :             // This is a pain for hardware, as we'd like something which can be stored
     952             :             // in exactly 8 bits.
     953             :             // Further, in the calculation of B[k] below, if z == 0 and r == 2,
     954             :             // then A[k] "should be" 0. But then we can end up setting B[k] to a value
     955             :             // slightly above 2^(8 + bit depth), due to rounding in the value of
     956             :             // eb_one_by_x[25-1].
     957             :             //
     958             :             // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
     959             :             // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
     960             :             // overflow), without significantly affecting the final result: z == 0
     961             :             // implies that the image is essentially "flat", so the local mean and
     962             :             // individual pixel values are very similar.
     963             :             //
     964             :             // Note that saturating on the other side, i.e. requiring A[k] <= 255,
     965             :             // would be a bad idea, as that corresponds to the case where the image
     966             :             // is very variable, when we want to preserve the local pixel value as
     967             :             // much as possible.
     968           0 :             A[k] = eb_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
     969             : 
     970             :             // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
     971             :             // eb_one_by_x[n - 1] = round(2^12 / n)
     972             :             // => the product here is < 2^(20 + bit_depth) <= 2^32,
     973             :             // and B[k] is set to a value < 2^(8 + bit depth)
     974             :             // This holds even with the rounding in eb_one_by_x and in the overall
     975             :             // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
     976           0 :             B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
     977             :                 (uint32_t)B[k] *
     978             :                 (uint32_t)eb_one_by_x[n - 1],
     979             :                 SGRPROJ_RECIP_BITS);
     980             :         }
     981             :     }
     982             :     // Use the A[] and B[] arrays to calculate the filtered image
     983           0 :     for (i = 0; i < height; ++i) {
     984           0 :         for (j = 0; j < width; ++j) {
     985           0 :             const int32_t k = i * buf_stride + j;
     986           0 :             const int32_t l = i * dgd_stride + j;
     987           0 :             const int32_t m = i * dst_stride + j;
     988           0 :             const int32_t nb = 5;
     989           0 :             const int32_t a =
     990           0 :                 (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
     991             :                 4 +
     992           0 :                 (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
     993           0 :                     A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
     994             :                 3;
     995           0 :             const int32_t b =
     996           0 :                 (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
     997             :                 4 +
     998           0 :                 (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
     999           0 :                     B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
    1000             :                 3;
    1001           0 :             const int32_t v = a * dgd[l] + b;
    1002           0 :             dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
    1003             :         }
    1004             :     }
    1005           0 : }
    1006             : 
    1007           0 : void eb_av1_selfguided_restoration_c(const uint8_t *dgd8, int32_t width, int32_t height,
    1008             :     int32_t dgd_stride, int32_t *flt0, int32_t *flt1,
    1009             :     int32_t flt_stride, int32_t sgr_params_idx,
    1010             :     int32_t bit_depth, int32_t highbd) {
    1011             :     int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
    1012           0 :     const int32_t dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
    1013           0 :     int32_t *dgd32 =
    1014           0 :         dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
    1015             : 
    1016           0 :     if (highbd) {
    1017           0 :         const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
    1018           0 :         for (int32_t i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
    1019           0 :             for (int32_t j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j)
    1020           0 :                 dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
    1021             :         }
    1022             :     }
    1023             :     else {
    1024           0 :         for (int32_t i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
    1025           0 :             for (int32_t j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j)
    1026           0 :                 dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
    1027             :         }
    1028             :     }
    1029             : 
    1030           0 :     const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
    1031             :     // If params->r == 0 we skip the corresponding filter. We only allow one of
    1032             :     // the radii to be 0, as having both equal to 0 would be equivalent to
    1033             :     // skipping SGR entirely.
    1034             :     assert(!(params->r[0] == 0 && params->r[1] == 0));
    1035             : 
    1036           0 :     if (params->r[0] > 0)
    1037           0 :         selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
    1038             :             flt0, flt_stride, bit_depth,
    1039             :             sgr_params_idx, 0);
    1040           0 :     if (params->r[1] > 0)
    1041           0 :         selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
    1042             :             flt_stride, bit_depth, sgr_params_idx, 1);
    1043           0 : }
    1044             : 
    1045           0 : void eb_apply_selfguided_restoration_c(const uint8_t *dat8, int32_t width, int32_t height,
    1046             :     int32_t stride, int32_t eps, const int32_t *xqd,
    1047             :     uint8_t *dst8, int32_t dst_stride,
    1048             :     int32_t *tmpbuf, int32_t bit_depth,
    1049             :     int32_t highbd) {
    1050           0 :     int32_t *flt0 = tmpbuf;
    1051           0 :     int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
    1052             :     assert(width * height <= RESTORATION_UNITPELS_MAX);
    1053             : 
    1054           0 :     eb_av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width,
    1055             :         eps, bit_depth, highbd);
    1056           0 :     const SgrParamsType *const params = &eb_sgr_params[eps];
    1057             :     int32_t xq[2];
    1058           0 :     eb_decode_xq(xqd, xq, params);
    1059           0 :     for (int32_t i = 0; i < height; ++i) {
    1060           0 :         for (int32_t j = 0; j < width; ++j) {
    1061           0 :             const int32_t k = i * width + j;
    1062           0 :             uint8_t *dst8ij = dst8 + i * dst_stride + j;
    1063           0 :             const uint8_t *dat8ij = dat8 + i * stride + j;
    1064             : 
    1065           0 :             const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
    1066           0 :             const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
    1067           0 :             int32_t v = u << SGRPROJ_PRJ_BITS;
    1068             :             // If params->r == 0 then we skipped the filtering in
    1069             :             // eb_av1_selfguided_restoration_c, i.e. flt[k] == u
    1070           0 :             if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
    1071           0 :             if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
    1072           0 :             const int16_t w =
    1073           0 :                 (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
    1074             : 
    1075           0 :             const uint16_t out = clip_pixel_highbd(w, bit_depth);
    1076           0 :             if (highbd)
    1077           0 :                 *CONVERT_TO_SHORTPTR(dst8ij) = out;
    1078             :             else
    1079           0 :                 *dst8ij = (uint8_t)out;
    1080             :         }
    1081             :     }
    1082           0 : }
    1083             : 
    1084        2628 : static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
    1085             :     int32_t stripe_width, int32_t stripe_height,
    1086             :     int32_t procunit_width, const uint8_t *src,
    1087             :     int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    1088             :     int32_t *tmpbuf, int32_t bit_depth) {
    1089             :     (void)bit_depth;
    1090             :     assert(bit_depth == 8);
    1091             : 
    1092       16788 :     for (int32_t j = 0; j < stripe_width; j += procunit_width) {
    1093       14160 :         int32_t w = AOMMIN(procunit_width, stripe_width - j);
    1094             :         //CHKN SSE
    1095       14160 :         eb_apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
    1096       14160 :             rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
    1097             :             dst + j, dst_stride, tmpbuf, bit_depth, 0);
    1098             :     }
    1099        2628 : }
    1100             : 
    1101           0 : static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
    1102             :     int32_t stripe_width, int32_t stripe_height,
    1103             :     int32_t procunit_width, const uint8_t *src8,
    1104             :     int32_t src_stride, uint8_t *dst8,
    1105             :     int32_t dst_stride, int32_t *tmpbuf,
    1106             :     int32_t bit_depth) {
    1107             :     (void)tmpbuf;
    1108           0 :     const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
    1109             : 
    1110           0 :     for (int32_t j = 0; j < stripe_width; j += procunit_width) {
    1111           0 :         int32_t w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
    1112           0 :         const uint8_t *src8_p = src8 + j;
    1113           0 :         uint8_t *dst8_p = dst8 + j;
    1114           0 :         eb_av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,  //CHKN  SSE
    1115           0 :             rui->wiener_info.hfilter, 16,
    1116           0 :             rui->wiener_info.vfilter, 16, w,
    1117             :             stripe_height, &conv_params, bit_depth);
    1118             :     }
    1119           0 : }
    1120             : 
    1121           0 : static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
    1122             :     int32_t stripe_width, int32_t stripe_height,
    1123             :     int32_t procunit_width,
    1124             :     const uint8_t *src8, int32_t src_stride,
    1125             :     uint8_t *dst8, int32_t dst_stride,
    1126             :     int32_t *tmpbuf, int32_t bit_depth) {
    1127           0 :     for (int32_t j = 0; j < stripe_width; j += procunit_width) {
    1128           0 :         int32_t w = AOMMIN(procunit_width, stripe_width - j);
    1129             : 
    1130             :         //CHKN SSE
    1131           0 :         eb_apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
    1132           0 :             rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
    1133             :             dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
    1134             :     }
    1135           0 : }
    1136             : 
    1137             : typedef void(*stripe_filter_fun)(const RestorationUnitInfo *rui,
    1138             :     int32_t stripe_width, int32_t stripe_height,
    1139             :     int32_t procunit_width, const uint8_t *src,
    1140             :     int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    1141             :     int32_t *tmpbuf, int32_t bit_depth);
    1142             : 
    1143             : #define NUM_STRIPE_FILTERS 4
    1144             : 
    1145             : static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
    1146             :   wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
    1147             :   sgrproj_filter_stripe_highbd
    1148             : };
    1149             : 
    1150             : // Filter one restoration unit
    1151        9729 : void eb_av1_loop_restoration_filter_unit(
    1152             :     uint8_t need_bounadaries,
    1153             :     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
    1154             :     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
    1155             :     const AV1PixelRect *tile_rect, int32_t tile_stripe0, int32_t ss_x, int32_t ss_y,
    1156             :     int32_t highbd, int32_t bit_depth, uint8_t *data8, int32_t stride, uint8_t *dst8,
    1157             :     int32_t dst_stride, int32_t *tmpbuf, int32_t optimized_lr) {
    1158        9729 :     RestorationType unit_rtype = rui->restoration_type;
    1159             : 
    1160        9729 :     int32_t unit_h = limits->v_end - limits->v_start;
    1161        9729 :     int32_t unit_w = limits->h_end - limits->h_start;
    1162        9729 :     uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
    1163        9729 :     uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
    1164             : 
    1165        9729 :     if (unit_rtype == RESTORE_NONE) {
    1166         116 :         copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
    1167         116 :         return;
    1168             :     }
    1169             : 
    1170        9613 :     const int32_t filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
    1171             :     assert(filter_idx < NUM_STRIPE_FILTERS);
    1172        9613 :     const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
    1173             : 
    1174        9613 :     const int32_t procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
    1175             : 
    1176             :     // Convolve the whole tile one stripe at a time
    1177        9613 :     RestorationTileLimits remaining_stripes = *limits;
    1178        9613 :     int32_t i = 0;
    1179       67291 :     while (i < unit_h) {
    1180             :         int32_t copy_above, copy_below;
    1181       57678 :         remaining_stripes.v_start = limits->v_start + i;
    1182             : 
    1183       57678 :         get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
    1184             :             &copy_below);
    1185             : 
    1186       57678 :         const int32_t full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
    1187       57678 :         const int32_t runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
    1188             : 
    1189             :         // Work out where this stripe's boundaries are within
    1190             :         // rsb->stripe_boundary_{above,below}
    1191       57678 :         const int32_t tile_stripe =
    1192       57678 :             (remaining_stripes.v_start - tile_rect->top + runit_offset) /
    1193             :             full_stripe_height;
    1194       57678 :         const int32_t frame_stripe = tile_stripe0 + tile_stripe;
    1195       57678 :         const int32_t rsb_row = RESTORATION_CTX_VERT * frame_stripe;
    1196             : 
    1197             :         // Calculate this stripe's height, based on two rules:
    1198             :         // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
    1199             :         // * We can't extend past the end of the current restoration unit
    1200       57678 :         const int32_t nominal_stripe_height =
    1201       57678 :             full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
    1202       57678 :         const int32_t h = AOMMIN(nominal_stripe_height,
    1203             :             remaining_stripes.v_end - remaining_stripes.v_start);
    1204             : 
    1205       57678 :         if(need_bounadaries)
    1206       57678 :         setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
    1207             :             h, data8, stride, rlbs, copy_above,
    1208             :             copy_below, optimized_lr);
    1209             : 
    1210       57678 :         stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
    1211       57678 :             dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
    1212       57677 :         if (need_bounadaries)
    1213       57677 :         restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
    1214             :             data8, stride, copy_above, copy_below,
    1215             :             optimized_lr);
    1216             : 
    1217       57678 :         i += h;
    1218             :     }
    1219             : }
    1220             : 
    1221             : typedef struct {
    1222             :     const RestorationInfo *rsi;
    1223             :     RestorationLineBuffers *rlbs;
    1224             :     const Av1Common *cm;
    1225             :     int32_t tile_stripe0;
    1226             :     int32_t ss_x, ss_y;
    1227             :     int32_t highbd, bit_depth;
    1228             :     uint8_t *data8, *dst8;
    1229             :     int32_t data_stride, dst_stride;
    1230             :     int32_t *tmpbuf;
    1231             : } FilterFrameCtxt;
    1232             : 
    1233          72 : static void filter_frame_on_tile(int32_t tile_row, int32_t tile_col, void *priv) {
    1234             :     (void)tile_col;
    1235          72 :     FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
    1236          72 :     ctxt->tile_stripe0 =
    1237          72 :         (tile_row == 0) ? 0 : ctxt->cm->rst_end_stripe[tile_row - 1];
    1238          72 : }
    1239             : 
    1240         192 : static void filter_frame_on_unit(const RestorationTileLimits *limits,
    1241             :     const AV1PixelRect *tile_rect,
    1242             :     int32_t rest_unit_idx, void *priv) {
    1243         192 :     FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
    1244         192 :     const RestorationInfo *rsi = ctxt->rsi;
    1245             : 
    1246         192 :     eb_av1_loop_restoration_filter_unit(
    1247             :         1,
    1248         192 :         limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, ctxt->rlbs,
    1249             :         tile_rect, ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
    1250             :         ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
    1251             :         ctxt->dst_stride, ctxt->tmpbuf, rsi->optimized_lr);
    1252         192 : }
    1253             : 
    1254          60 : void eb_av1_loop_restoration_filter_frame(Yv12BufferConfig *frame,
    1255             :     Av1Common *cm, int32_t optimized_lr) {
    1256             :     // assert(!cm->all_lossless);
    1257          60 :     const int32_t num_planes = 3;// av1_num_planes(cm);
    1258             :     typedef void(*copy_fun)(const Yv12BufferConfig *src,
    1259             :         Yv12BufferConfig *dst);
    1260             :     static const copy_fun copy_funs[3] = { eb_aom_yv12_copy_y_c, eb_aom_yv12_copy_u_c, eb_aom_yv12_copy_v_c };//CHKN SSE
    1261             : 
    1262          60 :     Yv12BufferConfig *dst = &cm->rst_frame;
    1263             : 
    1264          60 :     const int32_t frame_width = frame->crop_widths[0];
    1265          60 :     const int32_t frame_height = frame->crop_heights[0];
    1266          60 :     if (eb_aom_realloc_frame_buffer(dst, frame_width, frame_height,
    1267             :         cm->subsampling_x, cm->subsampling_y,
    1268             :         cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
    1269             :         cm->byte_alignment, NULL, NULL, NULL) < 0)
    1270           0 :         printf("Failed to allocate restoration dst buffer\n");
    1271             : 
    1272             :     RestorationLineBuffers rlbs;
    1273          60 :     const int32_t bit_depth = cm->bit_depth;
    1274          60 :     const int32_t highbd = cm->use_highbitdepth;
    1275             : 
    1276         240 :     for (int32_t plane = 0; plane < num_planes; ++plane) {
    1277         180 :         RestorationInfo *rsi = &cm->rst_info[plane];
    1278         180 :         RestorationType rtype = rsi->frame_restoration_type;
    1279         180 :         rsi->optimized_lr = optimized_lr;
    1280             : 
    1281         180 :         if (rtype == RESTORE_NONE)
    1282         108 :             continue;
    1283          72 :         const int32_t is_uv = plane > 0;
    1284          72 :         const int32_t plane_width = frame->crop_widths[is_uv];
    1285          72 :         const int32_t plane_height = frame->crop_heights[is_uv];
    1286             : 
    1287          72 :         eb_extend_frame(frame->buffers[plane], plane_width, plane_height,
    1288             :             frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER,
    1289             :             highbd);
    1290             : 
    1291             :         FilterFrameCtxt ctxt;
    1292          72 :         ctxt.rsi = rsi;
    1293          72 :         ctxt.rlbs = &rlbs;
    1294          72 :         ctxt.cm = cm;
    1295          72 :         ctxt.ss_x = is_uv && cm->subsampling_x;
    1296          72 :         ctxt.ss_y = is_uv && cm->subsampling_y;
    1297          72 :         ctxt.highbd = highbd;
    1298          72 :         ctxt.bit_depth = bit_depth;
    1299          72 :         ctxt.data8 = frame->buffers[plane];
    1300          72 :         ctxt.dst8 = dst->buffers[plane];
    1301          72 :         ctxt.data_stride = frame->strides[is_uv];
    1302          72 :         ctxt.dst_stride = dst->strides[is_uv];
    1303          72 :         ctxt.tmpbuf = cm->rst_tmpbuf;
    1304             : 
    1305          72 :         av1_foreach_rest_unit_in_frame(cm, plane, filter_frame_on_tile, filter_frame_on_unit, &ctxt);
    1306             : 
    1307          72 :         copy_funs[plane](dst, frame);
    1308             :     }
    1309          60 : }
    1310             : 
    1311         672 : static void foreach_rest_unit_in_tile(const AV1PixelRect *tile_rect,
    1312             :     int32_t tile_row, int32_t tile_col, int32_t tile_cols,
    1313             :     int32_t hunits_per_tile, int32_t units_per_tile,
    1314             :     int32_t unit_size, int32_t ss_y,
    1315             :     RestUnitVisitor on_rest_unit,
    1316             :     void *priv) {
    1317         672 :     const int32_t tile_w = tile_rect->right - tile_rect->left;
    1318         672 :     const int32_t tile_h = tile_rect->bottom - tile_rect->top;
    1319         672 :     const int32_t ext_size = unit_size * 3 / 2;
    1320             : 
    1321         672 :     const int32_t tile_idx = tile_col + tile_row * tile_cols;
    1322         672 :     const int32_t unit_idx0 = tile_idx * units_per_tile;
    1323             : 
    1324         672 :     int32_t y0 = 0, i = 0;
    1325        1344 :     while (y0 < tile_h) {
    1326         672 :         int32_t remaining_h = tile_h - y0;
    1327         672 :         int32_t h = (remaining_h < ext_size) ? remaining_h : unit_size;
    1328             : 
    1329             :         RestorationTileLimits limits;
    1330         672 :         limits.v_start = tile_rect->top + y0;
    1331         672 :         limits.v_end = tile_rect->top + y0 + h;
    1332             :         assert(limits.v_end <= tile_rect->bottom);
    1333             :         // Offset the tile upwards to align with the restoration processing stripe
    1334         672 :         const int32_t voffset = RESTORATION_UNIT_OFFSET >> ss_y;
    1335         672 :         limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
    1336         672 :         if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
    1337             : 
    1338         672 :         int32_t x0 = 0, j = 0;
    1339        1944 :         while (x0 < tile_w) {
    1340        1272 :             int32_t remaining_w = tile_w - x0;
    1341        1272 :             int32_t w = (remaining_w < ext_size) ? remaining_w : unit_size;
    1342             : 
    1343        1272 :             limits.h_start = tile_rect->left + x0;
    1344        1272 :             limits.h_end = tile_rect->left + x0 + w;
    1345             :             assert(limits.h_end <= tile_rect->right);
    1346             : 
    1347        1272 :             const int32_t unit_idx = unit_idx0 + i * hunits_per_tile + j;
    1348        1272 :             on_rest_unit(&limits, tile_rect, unit_idx, priv);
    1349             : 
    1350        1272 :             x0 += w;
    1351        1272 :             ++j;
    1352             :         }
    1353             : 
    1354         672 :         y0 += h;
    1355         672 :         ++i;
    1356             :     }
    1357         672 : }
    1358             : 
    1359         672 : void av1_foreach_rest_unit_in_frame(Av1Common *cm, int32_t plane,
    1360             :     RestTileStartVisitor on_tile,
    1361             :     RestUnitVisitor on_rest_unit,
    1362             :     void *priv) {
    1363         672 :     const int32_t is_uv = plane > 0;
    1364         672 :     const int32_t ss_y = is_uv && cm->subsampling_y;
    1365             : 
    1366         672 :     const RestorationInfo *rsi = &cm->rst_info[plane];
    1367             : 
    1368         672 :     const AV1PixelRect tile_rect = whole_frame_rect(&cm->frm_size,
    1369             :         cm->subsampling_x, cm->subsampling_y, is_uv);
    1370             : 
    1371         672 :     if (on_tile) on_tile(0, 0, priv);
    1372             : 
    1373         672 :     foreach_rest_unit_in_tile(&tile_rect, 0, 0, 1, rsi->horz_units_per_tile,
    1374             :         rsi->units_per_tile, rsi->restoration_unit_size,
    1375             :         ss_y, on_rest_unit, priv);
    1376         672 : }
    1377         540 : static void foreach_rest_unit_in_tile_seg(const AV1PixelRect *tile_rect,
    1378             :     int32_t tile_row, int32_t tile_col, int32_t tile_cols,
    1379             :     int32_t hunits_per_tile, int32_t units_per_tile,
    1380             :     int32_t unit_size, int32_t ss_y,
    1381             :     RestUnitVisitor on_rest_unit,
    1382             :     void *priv ,
    1383             :     int32_t vunits_per_tile,
    1384             :     PictureControlSet   *picture_control_set_ptr,
    1385             :     uint32_t segment_index  )
    1386             : {
    1387             :     //tile_row=0
    1388             :     //tile_col=0
    1389             :     //tile_cols=1
    1390         540 :     const int32_t tile_w = tile_rect->right - tile_rect->left; // eq to pic_width
    1391         540 :     const int32_t tile_h = tile_rect->bottom - tile_rect->top; // eq to pic_height
    1392         540 :     const int32_t ext_size = unit_size * 3 / 2;
    1393             : 
    1394         540 :     const int32_t tile_idx = tile_col + tile_row * tile_cols;  //eq to 0
    1395         540 :     const int32_t unit_idx0 = tile_idx * units_per_tile;       //eq to 0
    1396             : 
    1397             :     uint32_t  x_seg_idx;
    1398             :     uint32_t  y_seg_idx;
    1399         540 :     uint32_t picture_width_in_units = hunits_per_tile;
    1400         540 :     uint32_t picture_height_in_units = vunits_per_tile;
    1401         540 :     SEGMENT_CONVERT_IDX_TO_XY(segment_index, x_seg_idx, y_seg_idx, picture_control_set_ptr->rest_segments_column_count);
    1402         540 :     uint32_t x_unit_start_idx = SEGMENT_START_IDX(x_seg_idx, picture_width_in_units,  picture_control_set_ptr->rest_segments_column_count);
    1403         540 :     uint32_t x_unit_end_idx   = SEGMENT_END_IDX  (x_seg_idx, picture_width_in_units,  picture_control_set_ptr->rest_segments_column_count);
    1404         540 :     uint32_t y_unit_start_idx = SEGMENT_START_IDX(y_seg_idx, picture_height_in_units, picture_control_set_ptr->rest_segments_row_count);
    1405         540 :     uint32_t y_unit_end_idx   = SEGMENT_END_IDX  (y_seg_idx, picture_height_in_units, picture_control_set_ptr->rest_segments_row_count);
    1406             : 
    1407         540 :     int32_t y0 = y_unit_start_idx * unit_size;
    1408         540 :     int32_t yend = ((int32_t)y_unit_end_idx == (int32_t)picture_height_in_units) ? tile_h : (int32_t)y_unit_end_idx * (int32_t)unit_size; //MIN(y_unit_end_idx * unit_size , tile_h);
    1409         540 :     int32_t i = y_unit_start_idx;
    1410             : 
    1411        1080 :     while (y0 < yend) {
    1412         540 :         int32_t remaining_h = tile_h - y0;
    1413         540 :         int32_t h = (remaining_h < ext_size) ? remaining_h : unit_size; //the area at the pic boundary should have size>= half unit_size to be an independent unit.
    1414             :                                                                         //if not, it will be added to the last complete unit, increasing its size to up to  3/2 unit_size.
    1415             : 
    1416             :         RestorationTileLimits limits;
    1417         540 :         limits.v_start = tile_rect->top + y0;
    1418         540 :         limits.v_end = tile_rect->top + y0 + h;
    1419             :         assert(limits.v_end <= tile_rect->bottom);
    1420             :         // Offset the tile upwards to align with the restoration processing stripe
    1421         540 :         const int32_t voffset = RESTORATION_UNIT_OFFSET >> ss_y;
    1422         540 :         limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
    1423         540 :         if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
    1424             : 
    1425         540 :         int32_t x0 = x_unit_start_idx * unit_size;
    1426         540 :         int32_t xend = ((int32_t)x_unit_end_idx == (int32_t)picture_width_in_units) ? tile_w : (int32_t)x_unit_end_idx * (int32_t)unit_size; //MIN(x_unit_end_idx * unit_size,tile_w);
    1427         540 :         int32_t j = x_unit_start_idx;
    1428             : 
    1429        1440 :         while (x0 < xend) {
    1430         900 :             int32_t remaining_w = tile_w - x0;
    1431         900 :             int32_t w = (remaining_w < ext_size) ? remaining_w : unit_size;
    1432             : 
    1433         900 :             limits.h_start = tile_rect->left + x0;
    1434         900 :             limits.h_end = tile_rect->left + x0 + w;
    1435             :             assert(limits.h_end <= tile_rect->right);
    1436             : 
    1437         900 :             const int32_t unit_idx = unit_idx0 + i * hunits_per_tile + j;
    1438         900 :             on_rest_unit(&limits, tile_rect, unit_idx, priv);
    1439             : 
    1440         900 :             x0 += w;
    1441         900 :             ++j;
    1442             :         }
    1443             : 
    1444         540 :         y0 += h;
    1445         540 :         ++i;
    1446             :     }
    1447         540 : }
    1448         540 : void av1_foreach_rest_unit_in_frame_seg(Av1Common *cm, int32_t plane,
    1449             :     RestTileStartVisitor on_tile,
    1450             :     RestUnitVisitor on_rest_unit,
    1451             :     void *priv,
    1452             :     PictureControlSet   *picture_control_set_ptr,
    1453             :     uint32_t segment_index)
    1454             : {
    1455         540 :     const int32_t is_uv = plane > 0;
    1456         540 :     const int32_t ss_y = is_uv && cm->subsampling_y;
    1457             : 
    1458         540 :     const RestorationInfo *rsi = &cm->rst_info[plane];
    1459             : 
    1460         540 :     const AV1PixelRect tile_rect = whole_frame_rect(&cm->frm_size,
    1461             :         cm->subsampling_x, cm->subsampling_y, is_uv);
    1462             : 
    1463         540 :     if (on_tile) on_tile(0, 0, priv);  //will set rsc->tile_strip0=0;
    1464             : 
    1465         540 :     foreach_rest_unit_in_tile_seg(&tile_rect, 0, 0, 1, rsi->horz_units_per_tile,
    1466             :         rsi->units_per_tile, rsi->restoration_unit_size,
    1467             :         ss_y, on_rest_unit, priv,
    1468             :         rsi->vert_units_per_tile,
    1469             :         picture_control_set_ptr,
    1470             :         segment_index);
    1471         540 : }
    1472             : 
    1473      117030 : int32_t eb_av1_loop_restoration_corners_in_sb(Av1Common *cm, int32_t plane,
    1474             :     int32_t mi_row, int32_t mi_col, BlockSize bsize,
    1475             :     int32_t *rcol0, int32_t *rcol1, int32_t *rrow0,
    1476             :     int32_t *rrow1, int32_t *tile_tl_idx) {
    1477             :     assert(rcol0 && rcol1 && rrow0 && rrow1);
    1478      117030 :     if (bsize != cm->p_pcs_ptr->sequence_control_set_ptr->seq_header.sb_size) return 0;
    1479       21583 :     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
    1480             : 
    1481             :     // assert(!cm->all_lossless);
    1482             : 
    1483        4303 :     const int32_t is_uv = plane > 0;
    1484             : 
    1485        4303 :     const AV1PixelRect tile_rect = whole_frame_rect(&cm->frm_size,
    1486             :         cm->subsampling_x, cm->subsampling_y, is_uv);
    1487        4320 :     const int32_t tile_w = tile_rect.right - tile_rect.left;
    1488        4320 :     const int32_t tile_h = tile_rect.bottom - tile_rect.top;
    1489             : 
    1490        4320 :     const int32_t mi_top = 0;
    1491        4320 :     const int32_t mi_left = 0;
    1492             : 
    1493             :     // Compute the mi-unit corners of the superblock relative to the top-left of
    1494             :     // the tile
    1495        4320 :     const int32_t mi_rel_row0 = mi_row - mi_top;
    1496        4320 :     const int32_t mi_rel_col0 = mi_col - mi_left;
    1497        4320 :     const int32_t mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
    1498        4320 :     const int32_t mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];
    1499             : 
    1500        4320 :     const RestorationInfo *rsi = &cm->rst_info[plane];
    1501        4320 :     const int32_t size = rsi->restoration_unit_size;
    1502             : 
    1503             :     // Calculate the number of restoration units in this tile (which might be
    1504             :     // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
    1505        4320 :     const int32_t horz_units = count_units_in_tile(size, tile_w);
    1506        4320 :     const int32_t vert_units = count_units_in_tile(size, tile_h);
    1507             : 
    1508             :     // The size of an MI-unit on this plane of the image
    1509        4320 :     const int32_t ss_x = is_uv && cm->subsampling_x;
    1510        4320 :     const int32_t ss_y = is_uv && cm->subsampling_y;
    1511        4320 :     const int32_t mi_size_x = MI_SIZE >> ss_x;
    1512        4320 :     const int32_t mi_size_y = MI_SIZE >> ss_y;
    1513             : 
    1514             :     // Write m for the relative mi column or row, D for the superres denominator
    1515             :     // and N for the superres numerator. If u is the upscaled (called "unscaled"
    1516             :     // elsewhere) pixel offset then we can write the downscaled pixel offset in
    1517             :     // two ways as:
    1518             :     //
    1519             :     //   MI_SIZE * m = N / D u
    1520             :     //
    1521             :     // from which we get u = D * MI_SIZE * m / N
    1522             : 
    1523        4320 :     const int32_t mi_to_num_x = mi_size_x;//CHKN  av1_superres_unscaled(cm) ? mi_size_x : mi_size_x * cm->superres_scale_denominator;
    1524        4320 :     const int32_t mi_to_num_y = mi_size_y;
    1525        4320 :     const int32_t denom_x = size;//CHKN  av1_superres_unscaled(cm) ? size : size * SCALE_NUMERATOR;
    1526        4320 :     const int32_t denom_y = size;
    1527             : 
    1528        4320 :     const int32_t rnd_x = denom_x - 1;
    1529        4320 :     const int32_t rnd_y = denom_y - 1;
    1530             : 
    1531             :     // rcol0/rrow0 should be the first column/row of restoration units (relative
    1532             :     // to the top-left of the tile) that doesn't start left/below of
    1533             :     // mi_col/mi_row. For this calculation, we need to round up the division (if
    1534             :     // the sb starts at runit column 10.1, the first matching runit has column
    1535             :     // index 11)
    1536        4320 :     *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
    1537        4320 :     *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;
    1538             : 
    1539             :     // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
    1540             :     // below-right. If we're at the bottom or right of the tile, this restoration
    1541             :     // unit might not exist, in which case we'll clamp accordingly.
    1542        4320 :     *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
    1543        4320 :     *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
    1544             : 
    1545        4320 :     const int32_t tile_idx = 0;
    1546        4320 :     *tile_tl_idx = tile_idx * rsi->units_per_tile;
    1547             : 
    1548        4320 :     return *rcol0 < *rcol1 && *rrow0 < *rrow1;
    1549             : }
    1550             : 
    // Pads each row of a pixel block to the left and right by replicating the
    // row's edge pixels.
    //
    // For each of `height` rows, the `extend` samples immediately before the row
    // start are set to the row's first sample, and the `extend` samples starting
    // `width` samples in are set to the row's last sample. With use_highbitdepth
    // set, the row is accessed as uint16_t samples (width/extend count 16-bit
    // samples); in both modes `stride` is added to the byte pointer per row.
    // The caller must provide writable room for `extend` samples on both sides.
    void extend_lines(uint8_t *buf, int32_t width, int32_t height, int32_t stride,
                      int32_t extend, int32_t use_highbitdepth) {
        uint8_t *row = buf;
        int32_t done = 0;

        if (use_highbitdepth) {
            while (done < height) {
                uint16_t *samples = (uint16_t *)row;
                eb_aom_memset16(samples - extend, samples[0], extend);
                eb_aom_memset16(samples + width, samples[width - 1], extend);
                row += stride;
                ++done;
            }
            return;
        }

        while (done < height) {
            memset(row - extend, row[0], extend);
            memset(row + width, row[width - 1], extend);
            row += stride;
            ++done;
        }
    }
    1567             : 
    1568        3510 : static void save_deblock_boundary_lines(
    1569             :     uint8_t *src_buf, int32_t src_stride, int32_t src_width, int32_t src_height,
    1570             :     const Av1Common *cm, int32_t plane, int32_t row,
    1571             :     int32_t stripe, int32_t use_highbd, int32_t is_above,
    1572             :     RestorationStripeBoundaries *boundaries)
    1573             : {
    1574        3510 :     const int32_t is_uv = plane > 0;
    1575        3510 :     src_stride = src_stride << use_highbd;
    1576        3510 :     const uint8_t *src_rows = src_buf + row * src_stride;
    1577             : 
    1578        3510 :     uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
    1579        3510 :         : boundaries->stripe_boundary_below;
    1580        3510 :     uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
    1581        3510 :     const int32_t bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
    1582        3510 :     uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
    1583             : 
    1584             :     // There is a rare case in which a processing stripe can end 1px above the
    1585             :     // crop border. In this case, we do want to use deblocked pixels from below
    1586             :     // the stripe (hence why we ended up in this function), but instead of
    1587             :     // fetching 2 "below" rows we need to fetch one and duplicate it.
    1588             :     // This is equivalent to clamping the sample locations against the crop border
    1589        3510 :     const int32_t lines_to_save =
    1590        3510 :         AOMMIN(RESTORATION_CTX_VERT, src_height - row);
    1591             : 
    1592             :     assert(lines_to_save == 1 || lines_to_save == 2);
    1593             : 
    1594             :     int32_t upscaled_width;
    1595             :     int32_t line_bytes;
    1596             : 
    1597        3510 :     if (!av1_superres_unscaled(&cm->frm_size)) {
    1598           0 :         int32_t sx = is_uv && cm->subsampling_x;
    1599           0 :         upscaled_width = (cm->frm_size.superres_upscaled_width + sx) >> sx;
    1600           0 :         line_bytes = upscaled_width << use_highbd;
    1601             : 
    1602           0 :         av1_upscale_normative_rows(cm, (src_rows),
    1603             :             src_stride >> use_highbd, (bdry_rows),
    1604             :             boundaries->stripe_boundary_stride,
    1605             :             lines_to_save, sx, cm->bit_depth);
    1606             :     }
    1607             :     else {
    1608        3510 :         upscaled_width = src_width;
    1609        3510 :         line_bytes = upscaled_width << use_highbd;
    1610       10530 :         for (int32_t i = 0; i < lines_to_save; i++) {
    1611        7020 :             memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
    1612             :                 line_bytes);
    1613             :         }
    1614             :     }
    1615             :     // If we only saved one line, then copy it into the second line buffer
    1616        3510 :     if (lines_to_save == 1)
    1617           0 :         memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
    1618             : 
    1619        3510 :     extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
    1620             :         RESTORATION_EXTRA_HORZ, use_highbd);
    1621        3510 : }
    1622             : 
    1623         702 : static void save_cdef_boundary_lines(uint8_t *src_buf, int32_t src_stride,
    1624             :     int32_t src_width, const Av1Common *cm, int32_t plane, int32_t row,
    1625             :     int32_t stripe, int32_t use_highbd, int32_t is_above,
    1626             :     RestorationStripeBoundaries *boundaries)
    1627             : {
    1628         702 :     const int32_t is_uv = plane > 0;
    1629         702 :     src_stride = src_stride << use_highbd;
    1630         702 :     const uint8_t *src_rows = src_buf + row * src_stride;
    1631             : 
    1632         702 :     uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
    1633         702 :         : boundaries->stripe_boundary_below;
    1634         702 :     uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
    1635         702 :     const int32_t bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
    1636         702 :     uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
    1637             : 
    1638             :     // At the point where this function is called, we've already applied
    1639             :     // superres. So we don't need to extend the lines here, we can just
    1640             :     // pull directly from the topmost row of the upscaled frame.
    1641         702 :     const int32_t ss_x = is_uv && cm->subsampling_x;
    1642         702 :     const int32_t upscaled_width = av1_superres_unscaled(&cm->frm_size)
    1643             :         ? src_width
    1644         702 :         : (cm->frm_size.superres_upscaled_width + ss_x) >> ss_x;
    1645         702 :     const int32_t line_bytes = upscaled_width << use_highbd;
    1646        2106 :     for (int32_t i = 0; i < RESTORATION_CTX_VERT; i++) {
    1647             :         // Copy the line at 'row' into both context lines. This is because
    1648             :         // we want to (effectively) extend the outermost row of CDEF data
    1649             :         // from this tile to produce a border, rather than using deblocked
    1650             :         // pixels from the tile above/below.
    1651        1404 :         memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
    1652             :     }
    1653         702 :     extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
    1654             :         RESTORATION_EXTRA_HORZ, use_highbd);
    1655         702 : }
    1656             : 
    1657         702 : void save_tile_row_boundary_lines(uint8_t *src, int32_t src_stride,
    1658             :     int32_t src_width, int32_t src_height, int32_t use_highbd, int32_t plane,
    1659             :     Av1Common *cm, int32_t after_cdef, RestorationStripeBoundaries *boundaries)
    1660             : {
    1661         702 :     const int32_t is_uv = plane > 0;
    1662         702 :     const int32_t ss_y = is_uv && cm->subsampling_y;
    1663         702 :     const int32_t stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
    1664         702 :     const int32_t stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
    1665             : 
    1666             :     // Get the tile rectangle, with height rounded up to the next multiple of 8
    1667             :     // luma pixels (only relevant for the bottom tile of the frame)
    1668         702 :     const AV1PixelRect tile_rect = whole_frame_rect(&cm->frm_size, cm->subsampling_x,
    1669             :         cm->subsampling_y, is_uv);
    1670         702 :     const int32_t stripe0 = 0;
    1671             : 
    1672         702 :     int32_t plane_height = ROUND_POWER_OF_TWO(cm->frm_size.frame_height, ss_y);
    1673             : 
    1674             :     int32_t tile_stripe;
    1675        4914 :     for (tile_stripe = 0;; ++tile_stripe) {
    1676        4914 :         const int32_t rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
    1677        4914 :         const int32_t y0 = tile_rect.top + rel_y0;
    1678        4914 :         if (y0 >= tile_rect.bottom) break;
    1679             : 
    1680        4212 :         const int32_t rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
    1681        4212 :         const int32_t y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
    1682             : 
    1683        4212 :         const int32_t frame_stripe = stripe0 + tile_stripe;
    1684             : 
    1685             :         int32_t use_deblock_above, use_deblock_below;
    1686             :         // In this case, we should only use CDEF pixels at the top
    1687             :         // and bottom of the frame as a whole; internal tile boundaries
    1688             :         // can use deblocked pixels from adjacent tiles for context.
    1689        4212 :         use_deblock_above = (frame_stripe > 0);
    1690        4212 :         use_deblock_below = (y1 < plane_height);
    1691             : 
    1692        4212 :         if (!after_cdef) {
    1693             :             // Save deblocked context where needed.
    1694        2106 :             if (use_deblock_above) {
    1695        1755 :                 save_deblock_boundary_lines(src, src_stride, src_width, src_height,
    1696             :                     cm, plane, y0 - RESTORATION_CTX_VERT,
    1697             :                     frame_stripe, use_highbd, 1, boundaries);
    1698             :             }
    1699        2106 :             if (use_deblock_below) {
    1700        1755 :                 save_deblock_boundary_lines(src, src_stride, src_width, src_height,
    1701             :                     cm, plane, y1, frame_stripe, use_highbd, 0, boundaries);
    1702             :             }
    1703             :         }
    1704             :         else {
    1705             :             // Save CDEF context where needed. Note that we need to save the CDEF
    1706             :             // context for a particular boundary iff we *didn't* save deblocked
    1707             :             // context for that boundary.
    1708             :             //
    1709             :             // In addition, we need to save copies of the outermost line within
    1710             :             // the tile, rather than using data from outside the tile.
    1711        2106 :             if (!use_deblock_above) {
    1712         351 :                 save_cdef_boundary_lines(src, src_stride, src_width,
    1713             :                     cm, plane, y0, frame_stripe, use_highbd, 1, boundaries);
    1714             :             }
    1715        2106 :             if (!use_deblock_below) {
    1716         351 :                 save_cdef_boundary_lines(src, src_stride, src_width,
    1717             :                     cm, plane, y1 - 1, frame_stripe, use_highbd, 0, boundaries);
    1718             :             }
    1719             :         }
    1720             :     }
    1721         702 : }
    1722             : 
    1723             : // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
    1724             : // lines to be used as boundary in the loop restoration process. The
    1725             : // lines are saved in rst_internal.stripe_boundary_lines
    1726         120 : void eb_av1_loop_restoration_save_boundary_lines(const Yv12BufferConfig *frame,
    1727             :     Av1Common *cm, int32_t after_cdef) {
    1728         120 :     const int32_t num_planes = 3;// av1_num_planes(cm);
    1729         120 :     const int32_t use_highbd = cm->use_highbitdepth;
    1730             : 
    1731         480 :     for (int32_t p = 0; p < num_planes; ++p) {
    1732         360 :         const int32_t is_uv = p > 0;
    1733         360 :         int32_t crop_width = frame->crop_widths[is_uv];
    1734         360 :         int32_t crop_height = frame->crop_heights[is_uv];
    1735         360 :         uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[p]);
    1736         360 :         int32_t src_stride = frame->strides[is_uv];
    1737         360 :         RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
    1738             : 
    1739         360 :         save_tile_row_boundary_lines(src_buf, src_stride, crop_width, crop_height,
    1740             :             use_highbd, p, cm, after_cdef, boundaries);
    1741             :     }
    1742         120 : }
    1743             : 
    1744             : // Assumes cm->rst_info[p].restoration_unit_size is already initialized
    1745             : 
    1746         144 : EbErrorType eb_av1_alloc_restoration_buffers(Av1Common *cm) {
    1747         144 :     EbErrorType return_error = EB_ErrorNone;
    1748         144 :     const int32_t num_planes = 3;// av1_num_planes(cm);
    1749         576 :     for (int32_t p = 0; p < num_planes; ++p)
    1750         432 :         return_error = eb_av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
    1751             : 
    1752             :     //CHKNif (cm->rst_tmpbuf == NULL)
    1753             :     {
    1754             :         //CHKN CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
    1755             :        //cm->rst_tmpbuf = (int32_t *)eb_aom_memalign(16, RESTORATION_TMPBUF_SIZE);
    1756             : 
    1757         144 :         EB_MALLOC_ALIGNED(cm->rst_tmpbuf, RESTORATION_TMPBUF_SIZE);
    1758             :     }
    1759             : 
    1760             :     // For striped loop restoration, we divide each row of tiles into "stripes",
    1761             :     // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
    1762             :     // luma pixels to match the output from CDEF. We will need to store 2 *
    1763             :     // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be
    1764             :     // able to quickly answer the question "Where is the <n>'th stripe for tile
    1765             :     // row <m>?" To make that efficient, we generate the rst_last_stripe array.
    1766         144 :     int32_t num_stripes = 0;
    1767         288 :     for (int32_t i = 0; i < 1/*cm->tile_rows*/; ++i) {
    1768             :         //TileInfo tile_info;
    1769             :         //eb_av1_tile_set_row(&tile_info, cm, i);
    1770             : 
    1771         144 :         const int32_t mi_h = cm->mi_rows;// tile_info.mi_row_end - tile_info.mi_row_start;
    1772         144 :         const int32_t ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
    1773         144 :         const int32_t tile_stripes = (ext_h + 63) / 64;
    1774         144 :         num_stripes += tile_stripes;
    1775         144 :         cm->rst_end_stripe[i] = num_stripes;
    1776             :     }
    1777             : 
    1778             :     // Now we need to allocate enough space to store the line buffers for the
    1779             :     // stripes
    1780         144 :     const int32_t frame_w = cm->frm_size.superres_upscaled_width;
    1781         144 :     const int32_t use_highbd = cm->use_highbitdepth ? 1 : 0;
    1782             : 
    1783         576 :     for (int32_t p = 0; p < num_planes; ++p) {
    1784         432 :         const int32_t is_uv = p > 0;
    1785         432 :         const int32_t ss_x = is_uv && cm->subsampling_x;
    1786         432 :         const int32_t plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
    1787         432 :         const int32_t stride = ALIGN_POWER_OF_TWO(plane_w, 5);
    1788         432 :         const int32_t buf_size = num_stripes * stride * RESTORATION_CTX_VERT
    1789             :             << use_highbd;
    1790         432 :         RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
    1791             : 
    1792             :         {
    1793         432 :             EB_MALLOC(boundaries->stripe_boundary_above, buf_size);
    1794         432 :             EB_MALLOC(boundaries->stripe_boundary_below, buf_size);
    1795             : 
    1796         432 :             boundaries->stripe_boundary_size = buf_size;
    1797             :         }
    1798         432 :         boundaries->stripe_boundary_stride = stride;
    1799             :     }
    1800             : 
    1801         144 :     return return_error;
    1802             : }

Generated by: LCOV version 1.14