Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : *
11 : */
12 : #include "EbDefinitions.h"
13 : #include "EbSequenceControlSet.h"
14 : #include "EbPictureBufferDesc.h"
15 : #include "EbPictureControlSet.h"
16 : #include "aom_dsp_rtcd.h"
17 : #include "EbRestoration.h"
18 :
19 : void av1_upscale_normative_rows(const Av1Common *cm, const uint8_t *src,
20 : int src_stride, uint8_t *dst, int dst_stride, int rows, int sub_x, int bd);
21 :
22 : void av1_foreach_rest_unit_in_frame(Av1Common *cm, int32_t plane,
23 : RestTileStartVisitor on_tile,
24 : RestUnitVisitor on_rest_unit,
25 : void *priv);
26 :
27 : void eb_aom_yv12_copy_y_c(const Yv12BufferConfig *src_ybc, Yv12BufferConfig *dst_ybc);
28 : void eb_aom_yv12_copy_u_c(const Yv12BufferConfig *src_bc, Yv12BufferConfig *dst_bc);
29 : void eb_aom_yv12_copy_v_c(const Yv12BufferConfig *src_bc, Yv12BufferConfig *dst_bc);
30 :
31 : int32_t eb_aom_realloc_frame_buffer(Yv12BufferConfig *ybf, int32_t width, int32_t height,
32 : int32_t ss_x, int32_t ss_y, int32_t use_highbitdepth,
33 : int32_t border, int32_t byte_alignment,
34 : AomCodecFrameBuffer *fb,
35 : aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
36 :
37 : ///---filter.h
38 : #define MAX_FILTER_TAP 8
39 :
40 : // With CONFIG_DUAL_FILTER, pack two InterpFilter's into a uint32_t: since
41 : // there are at most 10 filters, we can use 16 bits for each and have more than
42 : // enough space. This reduces argument passing and unifies the operation of
43 : // setting a (pair of) filters.
44 : //
45 : // Without CONFIG_DUAL_FILTER,
46 : typedef uint32_t InterpFilters;
47 :
48 : #define LOG_SWITCHABLE_FILTERS \
49 : 2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
50 :
51 : #define MAX_SUBPEL_TAPS 12
52 : #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
53 : #define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
54 : #define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
55 :
56 : //typedef struct InterpFilterParams {
57 : // const int16_t *filter_ptr;
58 : // uint16_t taps;
59 : // uint16_t subpel_shifts;
60 : // InterpFilter interp_filter;
61 : //} InterpFilterParams;
62 :
63 : InterpFilterParams av1_get_interp_filter_params_with_block_size(
64 : const InterpFilter interp_filter, const int32_t w);
65 :
66 : void *eb_aom_memset16(void *dest, int32_t val, size_t length);
67 :
68 : ///---convolve.h
// Number of fractional bits used by the filter coefficients.
#define FILTER_BITS 7

//typedef uint16_t ConvBufType;
//typedef struct ConvolveParams {
//    int32_t ref;
//    int32_t do_average;
//    ConvBufType *dst;
//    int32_t dst_stride;
//    int32_t round_0;
//    int32_t round_1;
//    int32_t plane;
//    int32_t is_compound;
//    int32_t use_jnt_comp_avg;
//    int32_t fwd_offset;
//    int32_t bck_offset;
//} ConvolveParams;

#define ROUND0_BITS 3
#define COMPOUND_ROUND1_BITS 7
#define WIENER_ROUND0_BITS 3

// Evaluates to 1 << (bd + 1 + FILTER_BITS - r0).
// Both arguments are parenthesized in the expansion so that expression
// arguments (e.g. `a + b` passed as r0) keep their intended precedence.
#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - (r0)))
91 :
92 : typedef void(*aom_convolve_fn_t)(const uint8_t *src, int32_t src_stride,
93 : uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h,
94 : InterpFilterParams *filter_params_x,
95 : InterpFilterParams *filter_params_y,
96 : const int32_t subpel_x_q4, const int32_t subpel_y_q4,
97 : ConvolveParams *conv_params);
98 :
99 : typedef void(*aom_highbd_convolve_fn_t)(
100 : const uint16_t *src, int32_t src_stride, uint16_t *dst, int32_t dst_stride, int32_t w,
101 : int32_t h, InterpFilterParams *filter_params_x,
102 : InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
103 : const int32_t subpel_y_q4, ConvolveParams *conv_params, int32_t bd);
104 :
105 : struct AV1Common;
106 : struct scale_factors;
107 :
108 55050 : static INLINE ConvolveParams get_conv_params_wiener(int32_t bd) {
109 : ConvolveParams conv_params;
110 : (void)bd;
111 55050 : conv_params.ref = 0;
112 55050 : conv_params.do_average = 0;
113 55050 : conv_params.is_compound = 0;
114 55050 : conv_params.round_0 = WIENER_ROUND0_BITS;
115 55050 : conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
116 55050 : const int32_t intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
117 : ASSERT(IMPLIES(bd < 12, intbufrange <= 16));
118 55050 : if (intbufrange > 16) {
119 0 : conv_params.round_0 += intbufrange - 16;
120 0 : conv_params.round_1 -= intbufrange - 16;
121 : }
122 55050 : conv_params.dst = NULL;
123 55050 : conv_params.dst_stride = 0;
124 55050 : conv_params.plane = 0;
125 :
126 : // Initialization
127 55050 : conv_params.fwd_offset = 0;
128 55050 : conv_params.bck_offset = 0;
129 55050 : conv_params.use_jnt_comp_avg = 0;
130 :
131 55050 : return conv_params;
132 : }
133 :
134 : void eb_av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
135 : uint8_t *dst, ptrdiff_t dst_stride,
136 : const int16_t *filter_x, int32_t x_step_q4,
137 : const int16_t *filter_y, int32_t y_step_q4,
138 : int32_t w, int32_t h,
139 : const ConvolveParams *conv_params);
140 :
141 : void eb_av1_highbd_wiener_convolve_add_src_c(
142 : const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
143 : ptrdiff_t dst_stride, const int16_t *filter_x, int32_t x_step_q4,
144 : const int16_t *filter_y, int32_t y_step_q4, int32_t w, int32_t h,
145 : const ConvolveParams *conv_params, int32_t bd);
146 :
147 : void *eb_aom_memalign(size_t align, size_t size);
148 : void eb_aom_free(void *memblk);
149 :
150 : // The 's' values are calculated based on original 'r' and 'e' values in the
151 : // spec using GenSgrprojVtable().
152 : // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
153 : const SgrParamsType eb_sgr_params[SGRPROJ_PARAMS] = {
154 : { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
155 : { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } },
156 : { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } },
157 : { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } },
158 : { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } },
159 : { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } },
160 : { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } },
161 : { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } },
162 : };
163 :
164 6846 : AV1PixelRect whole_frame_rect(FrameSize *frm_size, int32_t sub_x,
165 : int32_t sub_y, int32_t is_uv)
166 : {
167 : AV1PixelRect rect;
168 :
169 6846 : int32_t ss_x = is_uv && sub_x;
170 6846 : int32_t ss_y = is_uv && sub_y;
171 :
172 6846 : rect.top = 0;
173 6846 : rect.bottom = ROUND_POWER_OF_TWO(frm_size->frame_height, ss_y);
174 6846 : rect.left = 0;
175 6846 : rect.right = ROUND_POWER_OF_TWO(frm_size->superres_upscaled_width, ss_x);
176 6846 : return rect;
177 : }
178 :
179 : // Count horizontal or vertical units per tile (use a width or height for
180 : // tile_size, respectively). We basically want to divide the tile size by the
181 : // size of a restoration unit. Rather than rounding up unconditionally as you
182 : // might expect, we round to nearest, which models the way a right or bottom
183 : // restoration unit can extend to up to 150% its normal width or height. The
184 : // max with 1 is to deal with tiles that are smaller than half of a restoration
185 : // unit.
// Number of restoration units along one dimension of a tile: tile_size
// divided by unit_size, rounded to nearest, and never less than 1.
static int32_t count_units_in_tile(int32_t unit_size, int32_t tile_size) {
    const int32_t half_unit     = unit_size >> 1;
    const int32_t nearest_count = (tile_size + half_unit) / unit_size;
    return AOMMAX(nearest_count, 1);
}
189 :
190 432 : EbErrorType eb_av1_alloc_restoration_struct(struct Av1Common *cm, RestorationInfo *rsi,
191 : int32_t is_uv) {
192 : // We need to allocate enough space for restoration units to cover the
193 : // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the
194 : // top-left and we can use av1_get_tile_rect(). With CONFIG_MAX_TILE, we have
195 : // to do the computation ourselves, iterating over the tiles and keeping
196 : // track of the largest width and height, then upscaling.
197 432 : const AV1PixelRect tile_rect = whole_frame_rect(&cm->frm_size,
198 : cm->subsampling_x, cm->subsampling_y, is_uv);
199 432 : const int32_t max_tile_w = tile_rect.right - tile_rect.left;
200 432 : const int32_t max_tile_h = tile_rect.bottom - tile_rect.top;
201 :
202 : // To calculate hpertile and vpertile (horizontal and vertical units per
203 : // tile), we basically want to divide the largest tile width or height by the
204 : // size of a restoration unit. Rather than rounding up unconditionally as you
205 : // might expect, we round to nearest, which models the way a right or bottom
206 : // restoration unit can extend to up to 150% its normal width or height. The
207 : // max with 1 is to deal with tiles that are smaller than half of a
208 : // restoration unit.
209 432 : const int32_t unit_size = rsi->restoration_unit_size;
210 432 : const int32_t hpertile = count_units_in_tile(unit_size, max_tile_w); //FB of size < 1/2 unit_size are included in neigh FB making them bigger!!
211 432 : const int32_t vpertile = count_units_in_tile(unit_size, max_tile_h);
212 :
213 432 : rsi->units_per_tile = hpertile * vpertile;//pic_tot_FB
214 432 : rsi->horz_units_per_tile = hpertile; //pic_width_in_FB
215 432 : rsi->vert_units_per_tile = vpertile; //pic_height_in_FB
216 :
217 432 : const int32_t ntiles = 1;
218 432 : const int32_t nunits = ntiles * rsi->units_per_tile;
219 :
220 432 : EB_MALLOC_ARRAY(rsi->unit_info, nunits);
221 :
222 432 : return EB_ErrorNone;
223 : }
224 :
// Pads an 8-bit plane in place by replicating its edge pixels.
//
// data        : pointer to the top-left pixel of the width x height area
// stride      : distance in pixels between consecutive rows
// border_horz : number of columns written to the left and right of each row
// border_vert : number of rows written above and below the area
//
// Rows are extended horizontally first, so the rows replicated vertically
// already include the left/right padding (corners are filled as well).
static void extend_frame_lowbd(uint8_t *data, int32_t width, int32_t height, int32_t stride,
                               int32_t border_horz, int32_t border_vert) {
    const int32_t padded_width = width + 2 * border_horz;

    // Horizontal padding: repeat the first and last pixel of every row.
    for (int32_t y = 0; y < height; ++y) {
        uint8_t *row = data + y * stride;
        memset(row - border_horz, row[0], border_horz);
        memset(row + width, row[width - 1], border_horz);
    }

    // Vertical padding works on full padded rows, starting at the left border.
    uint8_t *padded = data - border_horz;

    // Rows above the area are copies of (padded) row 0.
    for (int32_t y = -border_vert; y < 0; ++y)
        memcpy(padded + y * stride, padded, padded_width);

    // Rows below the area are copies of the (padded) last row.
    for (int32_t y = height; y < height + border_vert; ++y)
        memcpy(padded + y * stride, padded + (height - 1) * stride, padded_width);
}
242 :
// Pads a 16-bit plane in place by replicating its edge pixels.
//
// Same contract as the 8-bit variant, with all sizes expressed in uint16_t
// pixels: each row is extended by border_horz pixels on both sides, then
// border_vert padded rows are replicated above and below the area.
static void extend_frame_highbd(uint16_t *data, int32_t width, int32_t height,
                                int32_t stride, int32_t border_horz, int32_t border_vert) {
    const size_t padded_bytes = (width + 2 * border_horz) * sizeof(uint16_t);

    // Horizontal padding: repeat the first and last pixel of every row.
    for (int32_t y = 0; y < height; ++y) {
        uint16_t *row = data + y * stride;
        for (int32_t x = -border_horz; x < 0; ++x) row[x] = row[0];
        for (int32_t x = 0; x < border_horz; ++x) row[width + x] = row[width - 1];
    }

    // Vertical padding works on full padded rows, starting at the left border.
    uint16_t *padded = data - border_horz;

    // Rows above the area are copies of (padded) row 0.
    for (int32_t y = -border_vert; y < 0; ++y)
        memcpy(padded + y * stride, padded, padded_bytes);

    // Rows below the area are copies of the (padded) last row.
    for (int32_t y = height; y < height + border_vert; ++y)
        memcpy(padded + y * stride, padded + (height - 1) * stride, padded_bytes);
}
262 :
// Pads a plane by edge replication, dispatching on pixel size.
//
// When `highbd` is non-zero, `data` is converted with CONVERT_TO_SHORTPTR and
// handled as 16-bit pixels; otherwise it is handled as 8-bit pixels.
void eb_extend_frame(uint8_t *data, int32_t width, int32_t height, int32_t stride,
                     int32_t border_horz, int32_t border_vert, int32_t highbd) {
    if (!highbd) {
        extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
        return;
    }
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                        border_horz, border_vert);
}
271 :
// Copies a width x height block of 8-bit pixels row by row; source and
// destination may use different strides.
static void copy_tile_lowbd(int32_t width, int32_t height, const uint8_t *src,
                            int32_t src_stride, uint8_t *dst, int32_t dst_stride) {
    int32_t row = 0;
    while (row < height) {
        memcpy(&dst[row * dst_stride], &src[row * src_stride], width);
        ++row;
    }
}
277 :
// Copies a width x height block of 16-bit pixels row by row; strides are
// expressed in uint16_t elements.
static void copy_tile_highbd(int32_t width, int32_t height, const uint16_t *src,
                             int32_t src_stride, uint16_t *dst, int32_t dst_stride) {
    int32_t row = 0;
    while (row < height) {
        memcpy(&dst[row * dst_stride], &src[row * src_stride], width * sizeof(*dst));
        ++row;
    }
}
283 :
// Copies a width x height block, dispatching on pixel size. When `highbd` is
// non-zero both pointers are converted with CONVERT_TO_SHORTPTR and copied as
// 16-bit pixels; otherwise they are copied as 8-bit pixels.
static void copy_tile(int32_t width, int32_t height, const uint8_t *src, int32_t src_stride,
                      uint8_t *dst, int32_t dst_stride, int32_t highbd) {
    if (!highbd) {
        copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
        return;
    }
    copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                     CONVERT_TO_SHORTPTR(dst), dst_stride);
}
292 :
293 : // With striped loop restoration, the filtering for each 64-pixel stripe gets
294 : // most of its input from the output of CDEF (stored in data8), but we need to
295 : // fill out a border of 3 pixels above/below the stripe according to the
296 : // following
297 : // rules:
298 : //
299 : // * At a frame boundary, we copy the outermost row of CDEF pixels three times.
300 : // This extension is done by a call to eb_extend_frame() at the start of the loop
301 : // restoration process, so the value of copy_above/copy_below doesn't strictly
302 : // matter.
303 : // However, by setting *copy_above = *copy_below = 1 whenever loop filtering
304 : // across tiles is disabled, we can allow
305 : // {setup,restore}_processing_stripe_boundary to assume that the top/bottom
306 : // data has always been copied, simplifying the behaviour at the left and
307 : // right edges of tiles.
308 : //
309 : // * If we're at a tile boundary and loop filtering across tiles is enabled,
310 : // then there is a logical stripe which is 64 pixels high, but which is split
311 : // into an 8px high and a 56px high stripe so that the processing (and
312 : // coefficient set usage) can be aligned to tiles.
313 : // In this case, we use the 3 rows of CDEF output across the boundary for
314 : // context; this corresponds to leaving the frame buffer as-is.
315 : //
316 : // * If we're at a tile boundary and loop filtering across tiles is disabled,
317 : // then we take the outermost row of CDEF pixels *within the current tile*
318 : // and copy it three times. Thus we behave exactly as if the tile were a full
319 : // frame.
320 : //
321 : // * Otherwise, we're at a stripe boundary within a tile. In that case, we
322 : // take 2 rows of deblocked pixels and extend them to 3 rows of context.
323 : //
324 : // The distinction between the latter two cases is handled by the
325 : // eb_av1_loop_restoration_save_boundary_lines() function, so here we just need
326 : // to decide if we're overwriting the above/below boundary pixels or not.
327 57678 : static void get_stripe_boundary_info(const RestorationTileLimits *limits,
328 : const AV1PixelRect *tile_rect, int32_t ss_y,
329 : int32_t *copy_above, int32_t *copy_below) {
330 57678 : *copy_above = 1;
331 57678 : *copy_below = 1;
332 :
333 57678 : const int32_t full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
334 57678 : const int32_t runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
335 :
336 57678 : const int32_t first_stripe_in_tile = (limits->v_start == tile_rect->top);
337 57678 : const int32_t this_stripe_height =
338 57678 : full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
339 57678 : const int32_t last_stripe_in_tile =
340 57678 : (limits->v_start + this_stripe_height >= tile_rect->bottom);
341 :
342 57678 : if (first_stripe_in_tile) *copy_above = 0;
343 57678 : if (last_stripe_in_tile) *copy_below = 0;
344 57678 : }
345 :
346 : // Overwrite the border pixels around a processing stripe so that the conditions
347 : // listed above get_stripe_boundary_info() are preserved.
348 : // We save the pixels which get overwritten into a temporary buffer, so that
349 : // they can be restored by restore_processing_stripe_boundary() after we've
350 : // processed the stripe.
351 : //
352 : // limits gives the rectangular limits of the remaining stripes for the current
353 : // restoration unit. rsb is the stored stripe boundaries (taken from either
354 : // deblock or CDEF output as necessary).
355 : //
356 : // tile_rect is the limits of the current tile and tile_stripe0 is the index of
357 : // the first stripe in this tile (needed to convert the tile-relative stripe
358 : // index we get from limits into something we can look up in rsb).
359 57678 : static void setup_processing_stripe_boundary(
360 : const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
361 : int32_t rsb_row, int32_t use_highbd, int32_t h, uint8_t *data8, int32_t data_stride,
362 : RestorationLineBuffers *rlbs, int32_t copy_above, int32_t copy_below, int32_t opt) {
363 : // Offsets within the line buffers. The buffer logically starts at column
364 : // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
365 : // has column x0 in the buffer.
366 57678 : const int32_t buf_stride = rsb->stripe_boundary_stride;
367 57678 : const int32_t buf_x0_off = limits->h_start;
368 57678 : const int32_t line_width =
369 57678 : (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
370 57678 : const int32_t line_size = line_width << use_highbd;
371 :
372 57678 : const int32_t data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
373 :
374 : // Replace RESTORATION_BORDER pixels above the top of the stripe
375 : // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
376 : // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
377 : // duplicating the topmost of the 2 lines (see the AOMMAX call when
378 : // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
379 : //
380 : // Special case: If we're at the top of a tile, which isn't on the topmost
381 : // tile row, and we're allowed to loop filter across tiles, then we have a
382 : // logical 64-pixel-high stripe which has been split into an 8-pixel high
383 : // stripe and a 56-pixel high stripe (the current one). So, in this case,
384 : // we want to leave the boundary alone!
385 57678 : if (!opt) {
386 57642 : if (copy_above) {
387 48035 : uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
388 :
389 192138 : for (int32_t i = -RESTORATION_BORDER; i < 0; ++i) {
390 144103 : const int32_t buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
391 144103 : const int32_t buf_off = buf_x0_off + buf_row * buf_stride;
392 144103 : const uint8_t *buf =
393 144103 : rsb->stripe_boundary_above + (buf_off << use_highbd);
394 144103 : uint8_t *dst8 = data8_tl + i * data_stride;
395 : // Save old pixels, then replace with data from stripe_boundary_above
396 144103 : memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
397 0 : REAL_PTR(use_highbd, dst8), line_size);
398 144103 : memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
399 : }
400 : }
401 :
402 : // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
403 : // The second buffer row is repeated, so src_row gets the values 0, 1, 1
404 : // for i = 0, 1, 2.
405 57642 : if (copy_below) {
406 48035 : const int32_t stripe_end = limits->v_start + h;
407 48035 : uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
408 :
409 192139 : for (int32_t i = 0; i < RESTORATION_BORDER; ++i) {
410 144104 : const int32_t buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
411 144104 : const int32_t buf_off = buf_x0_off + buf_row * buf_stride;
412 144104 : const uint8_t *src =
413 144104 : rsb->stripe_boundary_below + (buf_off << use_highbd);
414 :
415 144104 : uint8_t *dst8 = data8_bl + i * data_stride;
416 : // Save old pixels, then replace with data from stripe_boundary_below
417 144104 : memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
418 144104 : memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
419 : }
420 : }
421 : }
422 : else {
423 36 : if (copy_above) {
424 30 : uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
425 :
426 : // Only save and overwrite i=-RESTORATION_BORDER line.
427 30 : uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
428 : // Save old pixels, then replace with data from stripe_boundary_above
429 30 : memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
430 60 : memcpy(REAL_PTR(use_highbd, dst8),
431 30 : REAL_PTR(use_highbd,
432 : data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
433 : line_size);
434 : }
435 :
436 36 : if (copy_below) {
437 30 : const int32_t stripe_end = limits->v_start + h;
438 30 : uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
439 :
440 : // Only save and overwrite i=2 line.
441 30 : uint8_t *dst8 = data8_bl + 2 * data_stride;
442 : // Save old pixels, then replace with data from stripe_boundary_below
443 30 : memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
444 60 : memcpy(REAL_PTR(use_highbd, dst8),
445 30 : REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
446 : }
447 : }
448 57678 : }
449 :
450 : // This function restores the boundary lines modified by
451 : // setup_processing_stripe_boundary.
452 : //
453 : // Note: We need to be careful when handling the corners of the processing
454 : // unit, because (eg.) the top-left corner is considered to be part of
455 : // both the left and top borders. This means that, depending on the
456 : // loop_filter_across_tiles_enabled flag, the corner pixels might get
457 : // overwritten twice, once as part of the "top" border and once as part
458 : // of the "left" border (or similar for other corners).
459 : //
460 : // Everything works out fine as long as we make sure to reverse the order
461 : // when restoring, ie. we need to restore the left/right borders followed
462 : // by the top/bottom borders.
463 57677 : static void restore_processing_stripe_boundary(
464 : const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
465 : int32_t use_highbd, int32_t h, uint8_t *data8, int32_t data_stride, int32_t copy_above,
466 : int32_t copy_below, int32_t opt) {
467 57677 : const int32_t line_width =
468 57677 : (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
469 57677 : const int32_t line_size = line_width << use_highbd;
470 :
471 57677 : const int32_t data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
472 :
473 57677 : if (!opt) {
474 57641 : if (copy_above) {
475 48034 : uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
476 192136 : for (int32_t i = -RESTORATION_BORDER; i < 0; ++i) {
477 144102 : uint8_t *dst8 = data8_tl + i * data_stride;
478 144102 : memcpy(REAL_PTR(use_highbd, dst8),
479 144102 : rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
480 : }
481 : }
482 :
483 57641 : if (copy_below) {
484 48035 : const int32_t stripe_bottom = limits->v_start + h;
485 48035 : uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
486 :
487 192139 : for (int32_t i = 0; i < RESTORATION_BORDER; ++i) {
488 144104 : if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
489 :
490 144104 : uint8_t *dst8 = data8_bl + i * data_stride;
491 144104 : memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
492 : }
493 : }
494 : }
495 : else {
496 36 : if (copy_above) {
497 30 : uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
498 :
499 : // Only restore i=-RESTORATION_BORDER line.
500 30 : uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
501 30 : memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
502 : }
503 :
504 36 : if (copy_below) {
505 30 : const int32_t stripe_bottom = limits->v_start + h;
506 30 : uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
507 :
508 : // Only restore i=2 line.
509 30 : if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
510 30 : uint8_t *dst8 = data8_bl + 2 * data_stride;
511 30 : memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
512 : }
513 : }
514 : }
515 57677 : }
516 :
517 55050 : static void wiener_filter_stripe(const RestorationUnitInfo *rui,
518 : int32_t stripe_width, int32_t stripe_height,
519 : int32_t procunit_width, const uint8_t *src,
520 : int32_t src_stride, uint8_t *dst, int32_t dst_stride,
521 : int32_t *tmpbuf, int32_t bit_depth) {
522 : (void)tmpbuf;
523 : (void)bit_depth;
524 : assert(bit_depth == 8);
525 55050 : const ConvolveParams conv_params = get_conv_params_wiener(8);
526 :
527 353090 : for (int32_t j = 0; j < stripe_width; j += procunit_width) {
528 298044 : int32_t w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
529 298044 : const uint8_t *src_p = src + j;
530 298044 : uint8_t *dst_p = dst + j;//CHKN SSE
531 298044 : eb_av1_wiener_convolve_add_src(
532 298044 : src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
533 298044 : rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
534 : }
535 55046 : }
536 :
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1 (boxsum1) and r = 2 (boxsum2). There is no generic
   fallback: boxsum() asserts on any other value of r.

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
544 0 : static void boxsum1(int32_t *src, int32_t width, int32_t height, int32_t src_stride,
545 : int32_t sqr, int32_t *dst, int32_t dst_stride) {
546 : int32_t i, j, a, b, c;
547 : assert(width > 2 * SGRPROJ_BORDER_HORZ);
548 : assert(height > 2 * SGRPROJ_BORDER_VERT);
549 :
550 : // Vertical sum over 3-pixel regions, from src into dst.
551 0 : if (!sqr) {
552 0 : for (j = 0; j < width; ++j) {
553 0 : a = src[j];
554 0 : b = src[src_stride + j];
555 0 : c = src[2 * src_stride + j];
556 :
557 0 : dst[j] = a + b;
558 0 : for (i = 1; i < height - 2; ++i) {
559 : // Loop invariant: At the start of each iteration,
560 : // a = src[(i - 1) * src_stride + j]
561 : // b = src[(i ) * src_stride + j]
562 : // c = src[(i + 1) * src_stride + j]
563 0 : dst[i * dst_stride + j] = a + b + c;
564 0 : a = b;
565 0 : b = c;
566 0 : c = src[(i + 2) * src_stride + j];
567 : }
568 0 : dst[i * dst_stride + j] = a + b + c;
569 0 : dst[(i + 1) * dst_stride + j] = b + c;
570 : }
571 : }
572 : else {
573 0 : for (j = 0; j < width; ++j) {
574 0 : a = src[j] * src[j];
575 0 : b = src[src_stride + j] * src[src_stride + j];
576 0 : c = src[2 * src_stride + j] * src[2 * src_stride + j];
577 :
578 0 : dst[j] = a + b;
579 0 : for (i = 1; i < height - 2; ++i) {
580 0 : dst[i * dst_stride + j] = a + b + c;
581 0 : a = b;
582 0 : b = c;
583 0 : c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
584 : }
585 0 : dst[i * dst_stride + j] = a + b + c;
586 0 : dst[(i + 1) * dst_stride + j] = b + c;
587 : }
588 : }
589 :
590 : // Horizontal sum over 3-pixel regions of dst
591 0 : for (i = 0; i < height; ++i) {
592 0 : a = dst[i * dst_stride];
593 0 : b = dst[i * dst_stride + 1];
594 0 : c = dst[i * dst_stride + 2];
595 :
596 0 : dst[i * dst_stride] = a + b;
597 0 : for (j = 1; j < width - 2; ++j) {
598 : // Loop invariant: At the start of each iteration,
599 : // a = src[i * src_stride + (j - 1)]
600 : // b = src[i * src_stride + (j )]
601 : // c = src[i * src_stride + (j + 1)]
602 0 : dst[i * dst_stride + j] = a + b + c;
603 0 : a = b;
604 0 : b = c;
605 0 : c = dst[i * dst_stride + (j + 2)];
606 : }
607 0 : dst[i * dst_stride + j] = a + b + c;
608 0 : dst[i * dst_stride + (j + 1)] = b + c;
609 : }
610 0 : }
611 :
612 0 : static void boxsum2(int32_t *src, int32_t width, int32_t height, int32_t src_stride,
613 : int32_t sqr, int32_t *dst, int32_t dst_stride) {
614 : int32_t i, j, a, b, c, d, e;
615 : assert(width > 2 * SGRPROJ_BORDER_HORZ);
616 : assert(height > 2 * SGRPROJ_BORDER_VERT);
617 :
618 : // Vertical sum over 5-pixel regions, from src into dst.
619 0 : if (!sqr) {
620 0 : for (j = 0; j < width; ++j) {
621 0 : a = src[j];
622 0 : b = src[src_stride + j];
623 0 : c = src[2 * src_stride + j];
624 0 : d = src[3 * src_stride + j];
625 0 : e = src[4 * src_stride + j];
626 :
627 0 : dst[j] = a + b + c;
628 0 : dst[dst_stride + j] = a + b + c + d;
629 0 : for (i = 2; i < height - 3; ++i) {
630 : // Loop invariant: At the start of each iteration,
631 : // a = src[(i - 2) * src_stride + j]
632 : // b = src[(i - 1) * src_stride + j]
633 : // c = src[(i ) * src_stride + j]
634 : // d = src[(i + 1) * src_stride + j]
635 : // e = src[(i + 2) * src_stride + j]
636 0 : dst[i * dst_stride + j] = a + b + c + d + e;
637 0 : a = b;
638 0 : b = c;
639 0 : c = d;
640 0 : d = e;
641 0 : e = src[(i + 3) * src_stride + j];
642 : }
643 0 : dst[i * dst_stride + j] = a + b + c + d + e;
644 0 : dst[(i + 1) * dst_stride + j] = b + c + d + e;
645 0 : dst[(i + 2) * dst_stride + j] = c + d + e;
646 : }
647 : }
648 : else {
649 0 : for (j = 0; j < width; ++j) {
650 0 : a = src[j] * src[j];
651 0 : b = src[src_stride + j] * src[src_stride + j];
652 0 : c = src[2 * src_stride + j] * src[2 * src_stride + j];
653 0 : d = src[3 * src_stride + j] * src[3 * src_stride + j];
654 0 : e = src[4 * src_stride + j] * src[4 * src_stride + j];
655 :
656 0 : dst[j] = a + b + c;
657 0 : dst[dst_stride + j] = a + b + c + d;
658 0 : for (i = 2; i < height - 3; ++i) {
659 0 : dst[i * dst_stride + j] = a + b + c + d + e;
660 0 : a = b;
661 0 : b = c;
662 0 : c = d;
663 0 : d = e;
664 0 : e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
665 : }
666 0 : dst[i * dst_stride + j] = a + b + c + d + e;
667 0 : dst[(i + 1) * dst_stride + j] = b + c + d + e;
668 0 : dst[(i + 2) * dst_stride + j] = c + d + e;
669 : }
670 : }
671 :
672 : // Horizontal sum over 5-pixel regions of dst
673 0 : for (i = 0; i < height; ++i) {
674 0 : a = dst[i * dst_stride];
675 0 : b = dst[i * dst_stride + 1];
676 0 : c = dst[i * dst_stride + 2];
677 0 : d = dst[i * dst_stride + 3];
678 0 : e = dst[i * dst_stride + 4];
679 :
680 0 : dst[i * dst_stride] = a + b + c;
681 0 : dst[i * dst_stride + 1] = a + b + c + d;
682 0 : for (j = 2; j < width - 3; ++j) {
683 : // Loop invariant: At the start of each iteration,
684 : // a = src[i * src_stride + (j - 2)]
685 : // b = src[i * src_stride + (j - 1)]
686 : // c = src[i * src_stride + (j )]
687 : // d = src[i * src_stride + (j + 1)]
688 : // e = src[i * src_stride + (j + 2)]
689 0 : dst[i * dst_stride + j] = a + b + c + d + e;
690 0 : a = b;
691 0 : b = c;
692 0 : c = d;
693 0 : d = e;
694 0 : e = dst[i * dst_stride + (j + 3)];
695 : }
696 0 : dst[i * dst_stride + j] = a + b + c + d + e;
697 0 : dst[i * dst_stride + (j + 1)] = b + c + d + e;
698 0 : dst[i * dst_stride + (j + 2)] = c + d + e;
699 : }
700 0 : }
701 :
// Dispatches to the box-sum specialization for radius r. Only r = 1 and
// r = 2 are implemented; any other value trips the assertion.
static void boxsum(int32_t *src, int32_t width, int32_t height, int32_t src_stride, int32_t r,
                   int32_t sqr, int32_t *dst, int32_t dst_stride) {
    switch (r) {
    case 1: boxsum1(src, width, height, src_stride, sqr, dst, dst_stride); break;
    case 2: boxsum2(src, width, height, src_stride, sqr, dst, dst_stride); break;
    default: assert(0 && "Invalid value of r in self-guided filter"); break;
    }
}
711 :
712 50749 : void eb_decode_xq(const int32_t *xqd, int32_t *xq, const SgrParamsType *params) {
713 50749 : if (params->r[0] == 0) {
714 5528 : xq[0] = 0;
715 5528 : xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
716 : }
717 45221 : else if (params->r[1] == 0) {
718 10061 : xq[0] = xqd[0];
719 10061 : xq[1] = 0;
720 : }
721 : else {
722 35160 : xq[0] = xqd[0];
723 35160 : xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
724 : }
725 50749 : }
726 :
const int32_t eb_x_by_xplus1[256] = {
    // Entry x holds 256 * x / (x + 1) rounded to nearest, with two exceptions:
    // entry 0 is 1 (i.e. 1/256) instead of 0 -- see comments in
    // selfguided_restoration_internal() for why -- and entry 255 is 256.
    /*   0 */ 1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240,
    /*  16 */ 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248, 248,
    /*  32 */ 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, 250, 251, 251,
    /*  48 */ 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, 252, 252, 252, 252,
    /*  64 */ 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, 253, 253, 253, 253, 253,
    /*  80 */ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
    /*  96 */ 253, 253, 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    /* 112 */ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    /* 128 */ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    /* 144 */ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    /* 160 */ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255,
    /* 176 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 192 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 208 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 224 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    /* 240 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 256,
};
749 :
750 : const int32_t eb_one_by_x[MAX_NELEM] = {
751 : 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
752 : 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
753 : };
754 :
755 0 : static void selfguided_restoration_fast_internal(
756 : int32_t *dgd, int32_t width, int32_t height, int32_t dgd_stride, int32_t *dst,
757 : int32_t dst_stride, int32_t bit_depth, int32_t sgr_params_idx, int32_t radius_idx)
758 : {
759 0 : const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
760 0 : const int32_t r = params->r[radius_idx];
761 0 : const int32_t width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
762 0 : const int32_t height_ext = height + 2 * SGRPROJ_BORDER_VERT;
763 : // Adjusting the stride of A and B here appears to avoid bad cache effects,
764 : // leading to a significant speed improvement.
765 : // We also align the stride to a multiple of 16 bytes, for consistency
766 : // with the SIMD version of this function.
767 0 : int32_t buf_stride = ((width_ext + 3) & ~3) + 16;
768 : int32_t A_[RESTORATION_PROC_UNIT_PELS];
769 : int32_t B_[RESTORATION_PROC_UNIT_PELS];
770 0 : int32_t *A = A_;
771 0 : int32_t *B = B_;
772 : int32_t i, j;
773 :
774 : assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
775 : assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
776 : "Need SGRPROJ_BORDER_* >= r+1");
777 :
778 0 : boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
779 : width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
780 0 : boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
781 : width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
782 0 : A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
783 0 : B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
784 : // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
785 : // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
786 0 : for (i = -1; i < height + 1; i += 2) {
787 0 : for (j = -1; j < width + 1; ++j) {
788 0 : const int32_t k = i * buf_stride + j;
789 0 : const int32_t n = (2 * r + 1) * (2 * r + 1);
790 :
791 : // a < 2^16 * n < 2^22 regardless of bit depth
792 0 : uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
793 : // b < 2^8 * n < 2^14 regardless of bit depth
794 0 : uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
795 :
796 : // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
797 : // and p itself satisfies p < 2^14 * n^2 < 2^26.
798 : // This bound on p is due to:
799 : // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
800 : //
801 : // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
802 : // This is an artefact of rounding, and can only happen if all pixels
803 : // are (almost) identical, so in this case we saturate to p=0.
804 0 : uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
805 :
806 0 : const uint32_t s = params->s[radius_idx];
807 :
808 : // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
809 : // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
810 : // (this holds even after accounting for the rounding in s)
811 0 : const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
812 :
813 : // Note: We have to be quite careful about the value of A[k].
814 : // This is used as a blend factor between individual pixel values and the
815 : // local mean. So it logically has a range of [0, 256], including both
816 : // endpoints.
817 : //
818 : // This is a pain for hardware, as we'd like something which can be stored
819 : // in exactly 8 bits.
820 : // Further, in the calculation of B[k] below, if z == 0 and r == 2,
821 : // then A[k] "should be" 0. But then we can end up setting B[k] to a value
822 : // slightly above 2^(8 + bit depth), due to rounding in the value of
823 : // eb_one_by_x[25-1].
824 : //
825 : // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
826 : // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
827 : // overflow), without significantly affecting the final result: z == 0
828 : // implies that the image is essentially "flat", so the local mean and
829 : // individual pixel values are very similar.
830 : //
831 : // Note that saturating on the other side, ie. requring A[k] <= 255,
832 : // would be a bad idea, as that corresponds to the case where the image
833 : // is very variable, when we want to preserve the local pixel value as
834 : // much as possible.
835 0 : A[k] = eb_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
836 :
837 : // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
838 : // eb_one_by_x[n - 1] = round(2^12 / n)
839 : // => the product here is < 2^(20 + bit_depth) <= 2^32,
840 : // and B[k] is set to a value < 2^(8 + bit depth)
841 : // This holds even with the rounding in eb_one_by_x and in the overall
842 : // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
843 0 : B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
844 : (uint32_t)B[k] *
845 : (uint32_t)eb_one_by_x[n - 1],
846 : SGRPROJ_RECIP_BITS);
847 : }
848 : }
849 : // Use the A[] and B[] arrays to calculate the filtered image
850 : assert(r == 2);
851 0 : for (i = 0; i < height; ++i) {
852 0 : if (!(i & 1)) { // even row
853 0 : for (j = 0; j < width; ++j) {
854 0 : const int32_t k = i * buf_stride + j;
855 0 : const int32_t l = i * dgd_stride + j;
856 0 : const int32_t m = i * dst_stride + j;
857 0 : const int32_t nb = 5;
858 0 : const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
859 0 : (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
860 0 : A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
861 : 5;
862 0 : const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
863 0 : (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
864 0 : B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
865 : 5;
866 0 : const int32_t v = a * dgd[l] + b;
867 0 : dst[m] =
868 0 : ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
869 : }
870 : }
871 : else { // odd row
872 0 : for (j = 0; j < width; ++j) {
873 0 : const int32_t k = i * buf_stride + j;
874 0 : const int32_t l = i * dgd_stride + j;
875 0 : const int32_t m = i * dst_stride + j;
876 0 : const int32_t nb = 4;
877 0 : const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
878 0 : const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
879 0 : const int32_t v = a * dgd[l] + b;
880 0 : dst[m] =
881 0 : ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
882 : }
883 : }
884 : }
885 0 : }
886 :
887 0 : static void selfguided_restoration_internal(int32_t *dgd, int32_t width, int32_t height,
888 : int32_t dgd_stride, int32_t *dst,
889 : int32_t dst_stride, int32_t bit_depth,
890 : int32_t sgr_params_idx,
891 : int32_t radius_idx) {
892 0 : const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
893 0 : const int32_t r = params->r[radius_idx];
894 0 : const int32_t width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
895 0 : const int32_t height_ext = height + 2 * SGRPROJ_BORDER_VERT;
896 : // Adjusting the stride of A and B here appears to avoid bad cache effects,
897 : // leading to a significant speed improvement.
898 : // We also align the stride to a multiple of 16 bytes, for consistency
899 : // with the SIMD version of this function.
900 0 : int32_t buf_stride = ((width_ext + 3) & ~3) + 16;
901 : int32_t A_[RESTORATION_PROC_UNIT_PELS];
902 : int32_t B_[RESTORATION_PROC_UNIT_PELS];
903 0 : int32_t *A = A_;
904 0 : int32_t *B = B_;
905 : int32_t i, j;
906 :
907 : assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
908 : assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
909 : "Need SGRPROJ_BORDER_* >= r+1");
910 :
911 0 : boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
912 : width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
913 0 : boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
914 : width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
915 0 : A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
916 0 : B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
917 : // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
918 : // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
919 0 : for (i = -1; i < height + 1; ++i) {
920 0 : for (j = -1; j < width + 1; ++j) {
921 0 : const int32_t k = i * buf_stride + j;
922 0 : const int32_t n = (2 * r + 1) * (2 * r + 1);
923 :
924 : // a < 2^16 * n < 2^22 regardless of bit depth
925 0 : uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
926 : // b < 2^8 * n < 2^14 regardless of bit depth
927 0 : uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
928 :
929 : // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
930 : // and p itself satisfies p < 2^14 * n^2 < 2^26.
931 : // This bound on p is due to:
932 : // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
933 : //
934 : // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
935 : // This is an artefact of rounding, and can only happen if all pixels
936 : // are (almost) identical, so in this case we saturate to p=0.
937 0 : uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
938 :
939 0 : const uint32_t s = params->s[radius_idx];
940 :
941 : // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
942 : // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
943 : // (this holds even after accounting for the rounding in s)
944 0 : const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
945 :
946 : // Note: We have to be quite careful about the value of A[k].
947 : // This is used as a blend factor between individual pixel values and the
948 : // local mean. So it logically has a range of [0, 256], including both
949 : // endpoints.
950 : //
951 : // This is a pain for hardware, as we'd like something which can be stored
952 : // in exactly 8 bits.
953 : // Further, in the calculation of B[k] below, if z == 0 and r == 2,
954 : // then A[k] "should be" 0. But then we can end up setting B[k] to a value
955 : // slightly above 2^(8 + bit depth), due to rounding in the value of
956 : // eb_one_by_x[25-1].
957 : //
958 : // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
959 : // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
960 : // overflow), without significantly affecting the final result: z == 0
961 : // implies that the image is essentially "flat", so the local mean and
962 : // individual pixel values are very similar.
963 : //
964 : // Note that saturating on the other side, ie. requring A[k] <= 255,
965 : // would be a bad idea, as that corresponds to the case where the image
966 : // is very variable, when we want to preserve the local pixel value as
967 : // much as possible.
968 0 : A[k] = eb_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
969 :
970 : // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
971 : // eb_one_by_x[n - 1] = round(2^12 / n)
972 : // => the product here is < 2^(20 + bit_depth) <= 2^32,
973 : // and B[k] is set to a value < 2^(8 + bit depth)
974 : // This holds even with the rounding in eb_one_by_x and in the overall
975 : // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
976 0 : B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
977 : (uint32_t)B[k] *
978 : (uint32_t)eb_one_by_x[n - 1],
979 : SGRPROJ_RECIP_BITS);
980 : }
981 : }
982 : // Use the A[] and B[] arrays to calculate the filtered image
983 0 : for (i = 0; i < height; ++i) {
984 0 : for (j = 0; j < width; ++j) {
985 0 : const int32_t k = i * buf_stride + j;
986 0 : const int32_t l = i * dgd_stride + j;
987 0 : const int32_t m = i * dst_stride + j;
988 0 : const int32_t nb = 5;
989 0 : const int32_t a =
990 0 : (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
991 : 4 +
992 0 : (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
993 0 : A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
994 : 3;
995 0 : const int32_t b =
996 0 : (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
997 : 4 +
998 0 : (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
999 0 : B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
1000 : 3;
1001 0 : const int32_t v = a * dgd[l] + b;
1002 0 : dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
1003 : }
1004 : }
1005 0 : }
1006 :
1007 0 : void eb_av1_selfguided_restoration_c(const uint8_t *dgd8, int32_t width, int32_t height,
1008 : int32_t dgd_stride, int32_t *flt0, int32_t *flt1,
1009 : int32_t flt_stride, int32_t sgr_params_idx,
1010 : int32_t bit_depth, int32_t highbd) {
1011 : int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
1012 0 : const int32_t dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
1013 0 : int32_t *dgd32 =
1014 0 : dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
1015 :
1016 0 : if (highbd) {
1017 0 : const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
1018 0 : for (int32_t i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
1019 0 : for (int32_t j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j)
1020 0 : dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
1021 : }
1022 : }
1023 : else {
1024 0 : for (int32_t i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
1025 0 : for (int32_t j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j)
1026 0 : dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
1027 : }
1028 : }
1029 :
1030 0 : const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
1031 : // If params->r == 0 we skip the corresponding filter. We only allow one of
1032 : // the radii to be 0, as having both equal to 0 would be equivalent to
1033 : // skipping SGR entirely.
1034 : assert(!(params->r[0] == 0 && params->r[1] == 0));
1035 :
1036 0 : if (params->r[0] > 0)
1037 0 : selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
1038 : flt0, flt_stride, bit_depth,
1039 : sgr_params_idx, 0);
1040 0 : if (params->r[1] > 0)
1041 0 : selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
1042 : flt_stride, bit_depth, sgr_params_idx, 1);
1043 0 : }
1044 :
1045 0 : void eb_apply_selfguided_restoration_c(const uint8_t *dat8, int32_t width, int32_t height,
1046 : int32_t stride, int32_t eps, const int32_t *xqd,
1047 : uint8_t *dst8, int32_t dst_stride,
1048 : int32_t *tmpbuf, int32_t bit_depth,
1049 : int32_t highbd) {
1050 0 : int32_t *flt0 = tmpbuf;
1051 0 : int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
1052 : assert(width * height <= RESTORATION_UNITPELS_MAX);
1053 :
1054 0 : eb_av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width,
1055 : eps, bit_depth, highbd);
1056 0 : const SgrParamsType *const params = &eb_sgr_params[eps];
1057 : int32_t xq[2];
1058 0 : eb_decode_xq(xqd, xq, params);
1059 0 : for (int32_t i = 0; i < height; ++i) {
1060 0 : for (int32_t j = 0; j < width; ++j) {
1061 0 : const int32_t k = i * width + j;
1062 0 : uint8_t *dst8ij = dst8 + i * dst_stride + j;
1063 0 : const uint8_t *dat8ij = dat8 + i * stride + j;
1064 :
1065 0 : const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
1066 0 : const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
1067 0 : int32_t v = u << SGRPROJ_PRJ_BITS;
1068 : // If params->r == 0 then we skipped the filtering in
1069 : // eb_av1_selfguided_restoration_c, i.e. flt[k] == u
1070 0 : if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
1071 0 : if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
1072 0 : const int16_t w =
1073 0 : (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
1074 :
1075 0 : const uint16_t out = clip_pixel_highbd(w, bit_depth);
1076 0 : if (highbd)
1077 0 : *CONVERT_TO_SHORTPTR(dst8ij) = out;
1078 : else
1079 0 : *dst8ij = (uint8_t)out;
1080 : }
1081 : }
1082 0 : }
1083 :
1084 2628 : static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
1085 : int32_t stripe_width, int32_t stripe_height,
1086 : int32_t procunit_width, const uint8_t *src,
1087 : int32_t src_stride, uint8_t *dst, int32_t dst_stride,
1088 : int32_t *tmpbuf, int32_t bit_depth) {
1089 : (void)bit_depth;
1090 : assert(bit_depth == 8);
1091 :
1092 16788 : for (int32_t j = 0; j < stripe_width; j += procunit_width) {
1093 14160 : int32_t w = AOMMIN(procunit_width, stripe_width - j);
1094 : //CHKN SSE
1095 14160 : eb_apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
1096 14160 : rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
1097 : dst + j, dst_stride, tmpbuf, bit_depth, 0);
1098 : }
1099 2628 : }
1100 :
1101 0 : static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
1102 : int32_t stripe_width, int32_t stripe_height,
1103 : int32_t procunit_width, const uint8_t *src8,
1104 : int32_t src_stride, uint8_t *dst8,
1105 : int32_t dst_stride, int32_t *tmpbuf,
1106 : int32_t bit_depth) {
1107 : (void)tmpbuf;
1108 0 : const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
1109 :
1110 0 : for (int32_t j = 0; j < stripe_width; j += procunit_width) {
1111 0 : int32_t w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
1112 0 : const uint8_t *src8_p = src8 + j;
1113 0 : uint8_t *dst8_p = dst8 + j;
1114 0 : eb_av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride, //CHKN SSE
1115 0 : rui->wiener_info.hfilter, 16,
1116 0 : rui->wiener_info.vfilter, 16, w,
1117 : stripe_height, &conv_params, bit_depth);
1118 : }
1119 0 : }
1120 :
1121 0 : static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
1122 : int32_t stripe_width, int32_t stripe_height,
1123 : int32_t procunit_width,
1124 : const uint8_t *src8, int32_t src_stride,
1125 : uint8_t *dst8, int32_t dst_stride,
1126 : int32_t *tmpbuf, int32_t bit_depth) {
1127 0 : for (int32_t j = 0; j < stripe_width; j += procunit_width) {
1128 0 : int32_t w = AOMMIN(procunit_width, stripe_width - j);
1129 :
1130 : //CHKN SSE
1131 0 : eb_apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
1132 0 : rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
1133 : dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
1134 : }
1135 0 : }
1136 :
1137 : typedef void(*stripe_filter_fun)(const RestorationUnitInfo *rui,
1138 : int32_t stripe_width, int32_t stripe_height,
1139 : int32_t procunit_width, const uint8_t *src,
1140 : int32_t src_stride, uint8_t *dst, int32_t dst_stride,
1141 : int32_t *tmpbuf, int32_t bit_depth);
1142 :
1143 : #define NUM_STRIPE_FILTERS 4
1144 :
1145 : static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
1146 : wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
1147 : sgrproj_filter_stripe_highbd
1148 : };
1149 :
1150 : // Filter one restoration unit
1151 9729 : void eb_av1_loop_restoration_filter_unit(
1152 : uint8_t need_bounadaries,
1153 : const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
1154 : const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
1155 : const AV1PixelRect *tile_rect, int32_t tile_stripe0, int32_t ss_x, int32_t ss_y,
1156 : int32_t highbd, int32_t bit_depth, uint8_t *data8, int32_t stride, uint8_t *dst8,
1157 : int32_t dst_stride, int32_t *tmpbuf, int32_t optimized_lr) {
1158 9729 : RestorationType unit_rtype = rui->restoration_type;
1159 :
1160 9729 : int32_t unit_h = limits->v_end - limits->v_start;
1161 9729 : int32_t unit_w = limits->h_end - limits->h_start;
1162 9729 : uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
1163 9729 : uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
1164 :
1165 9729 : if (unit_rtype == RESTORE_NONE) {
1166 116 : copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
1167 116 : return;
1168 : }
1169 :
1170 9613 : const int32_t filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1171 : assert(filter_idx < NUM_STRIPE_FILTERS);
1172 9613 : const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1173 :
1174 9613 : const int32_t procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1175 :
1176 : // Convolve the whole tile one stripe at a time
1177 9613 : RestorationTileLimits remaining_stripes = *limits;
1178 9613 : int32_t i = 0;
1179 67291 : while (i < unit_h) {
1180 : int32_t copy_above, copy_below;
1181 57678 : remaining_stripes.v_start = limits->v_start + i;
1182 :
1183 57678 : get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, ©_above,
1184 : ©_below);
1185 :
1186 57678 : const int32_t full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1187 57678 : const int32_t runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1188 :
1189 : // Work out where this stripe's boundaries are within
1190 : // rsb->stripe_boundary_{above,below}
1191 57678 : const int32_t tile_stripe =
1192 57678 : (remaining_stripes.v_start - tile_rect->top + runit_offset) /
1193 : full_stripe_height;
1194 57678 : const int32_t frame_stripe = tile_stripe0 + tile_stripe;
1195 57678 : const int32_t rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1196 :
1197 : // Calculate this stripe's height, based on two rules:
1198 : // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
1199 : // * We can't extend past the end of the current restoration unit
1200 57678 : const int32_t nominal_stripe_height =
1201 57678 : full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
1202 57678 : const int32_t h = AOMMIN(nominal_stripe_height,
1203 : remaining_stripes.v_end - remaining_stripes.v_start);
1204 :
1205 57678 : if(need_bounadaries)
1206 57678 : setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1207 : h, data8, stride, rlbs, copy_above,
1208 : copy_below, optimized_lr);
1209 :
1210 57678 : stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1211 57678 : dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
1212 57677 : if (need_bounadaries)
1213 57677 : restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1214 : data8, stride, copy_above, copy_below,
1215 : optimized_lr);
1216 :
1217 57678 : i += h;
1218 : }
1219 : }
1220 :
1221 : typedef struct {
1222 : const RestorationInfo *rsi;
1223 : RestorationLineBuffers *rlbs;
1224 : const Av1Common *cm;
1225 : int32_t tile_stripe0;
1226 : int32_t ss_x, ss_y;
1227 : int32_t highbd, bit_depth;
1228 : uint8_t *data8, *dst8;
1229 : int32_t data_stride, dst_stride;
1230 : int32_t *tmpbuf;
1231 : } FilterFrameCtxt;
1232 :
1233 72 : static void filter_frame_on_tile(int32_t tile_row, int32_t tile_col, void *priv) {
1234 : (void)tile_col;
1235 72 : FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1236 72 : ctxt->tile_stripe0 =
1237 72 : (tile_row == 0) ? 0 : ctxt->cm->rst_end_stripe[tile_row - 1];
1238 72 : }
1239 :
1240 192 : static void filter_frame_on_unit(const RestorationTileLimits *limits,
1241 : const AV1PixelRect *tile_rect,
1242 : int32_t rest_unit_idx, void *priv) {
1243 192 : FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
1244 192 : const RestorationInfo *rsi = ctxt->rsi;
1245 :
1246 192 : eb_av1_loop_restoration_filter_unit(
1247 : 1,
1248 192 : limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, ctxt->rlbs,
1249 : tile_rect, ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
1250 : ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
1251 : ctxt->dst_stride, ctxt->tmpbuf, rsi->optimized_lr);
1252 192 : }
1253 :
1254 60 : void eb_av1_loop_restoration_filter_frame(Yv12BufferConfig *frame,
1255 : Av1Common *cm, int32_t optimized_lr) {
1256 : // assert(!cm->all_lossless);
1257 60 : const int32_t num_planes = 3;// av1_num_planes(cm);
1258 : typedef void(*copy_fun)(const Yv12BufferConfig *src,
1259 : Yv12BufferConfig *dst);
1260 : static const copy_fun copy_funs[3] = { eb_aom_yv12_copy_y_c, eb_aom_yv12_copy_u_c, eb_aom_yv12_copy_v_c };//CHKN SSE
1261 :
1262 60 : Yv12BufferConfig *dst = &cm->rst_frame;
1263 :
1264 60 : const int32_t frame_width = frame->crop_widths[0];
1265 60 : const int32_t frame_height = frame->crop_heights[0];
1266 60 : if (eb_aom_realloc_frame_buffer(dst, frame_width, frame_height,
1267 : cm->subsampling_x, cm->subsampling_y,
1268 : cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
1269 : cm->byte_alignment, NULL, NULL, NULL) < 0)
1270 0 : printf("Failed to allocate restoration dst buffer\n");
1271 :
1272 : RestorationLineBuffers rlbs;
1273 60 : const int32_t bit_depth = cm->bit_depth;
1274 60 : const int32_t highbd = cm->use_highbitdepth;
1275 :
1276 240 : for (int32_t plane = 0; plane < num_planes; ++plane) {
1277 180 : RestorationInfo *rsi = &cm->rst_info[plane];
1278 180 : RestorationType rtype = rsi->frame_restoration_type;
1279 180 : rsi->optimized_lr = optimized_lr;
1280 :
1281 180 : if (rtype == RESTORE_NONE)
1282 108 : continue;
1283 72 : const int32_t is_uv = plane > 0;
1284 72 : const int32_t plane_width = frame->crop_widths[is_uv];
1285 72 : const int32_t plane_height = frame->crop_heights[is_uv];
1286 :
1287 72 : eb_extend_frame(frame->buffers[plane], plane_width, plane_height,
1288 : frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER,
1289 : highbd);
1290 :
1291 : FilterFrameCtxt ctxt;
1292 72 : ctxt.rsi = rsi;
1293 72 : ctxt.rlbs = &rlbs;
1294 72 : ctxt.cm = cm;
1295 72 : ctxt.ss_x = is_uv && cm->subsampling_x;
1296 72 : ctxt.ss_y = is_uv && cm->subsampling_y;
1297 72 : ctxt.highbd = highbd;
1298 72 : ctxt.bit_depth = bit_depth;
1299 72 : ctxt.data8 = frame->buffers[plane];
1300 72 : ctxt.dst8 = dst->buffers[plane];
1301 72 : ctxt.data_stride = frame->strides[is_uv];
1302 72 : ctxt.dst_stride = dst->strides[is_uv];
1303 72 : ctxt.tmpbuf = cm->rst_tmpbuf;
1304 :
1305 72 : av1_foreach_rest_unit_in_frame(cm, plane, filter_frame_on_tile, filter_frame_on_unit, &ctxt);
1306 :
1307 72 : copy_funs[plane](dst, frame);
1308 : }
1309 60 : }
1310 :
1311 672 : static void foreach_rest_unit_in_tile(const AV1PixelRect *tile_rect,
1312 : int32_t tile_row, int32_t tile_col, int32_t tile_cols,
1313 : int32_t hunits_per_tile, int32_t units_per_tile,
1314 : int32_t unit_size, int32_t ss_y,
1315 : RestUnitVisitor on_rest_unit,
1316 : void *priv) {
1317 672 : const int32_t tile_w = tile_rect->right - tile_rect->left;
1318 672 : const int32_t tile_h = tile_rect->bottom - tile_rect->top;
1319 672 : const int32_t ext_size = unit_size * 3 / 2;
1320 :
1321 672 : const int32_t tile_idx = tile_col + tile_row * tile_cols;
1322 672 : const int32_t unit_idx0 = tile_idx * units_per_tile;
1323 :
1324 672 : int32_t y0 = 0, i = 0;
1325 1344 : while (y0 < tile_h) {
1326 672 : int32_t remaining_h = tile_h - y0;
1327 672 : int32_t h = (remaining_h < ext_size) ? remaining_h : unit_size;
1328 :
1329 : RestorationTileLimits limits;
1330 672 : limits.v_start = tile_rect->top + y0;
1331 672 : limits.v_end = tile_rect->top + y0 + h;
1332 : assert(limits.v_end <= tile_rect->bottom);
1333 : // Offset the tile upwards to align with the restoration processing stripe
1334 672 : const int32_t voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1335 672 : limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
1336 672 : if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
1337 :
1338 672 : int32_t x0 = 0, j = 0;
1339 1944 : while (x0 < tile_w) {
1340 1272 : int32_t remaining_w = tile_w - x0;
1341 1272 : int32_t w = (remaining_w < ext_size) ? remaining_w : unit_size;
1342 :
1343 1272 : limits.h_start = tile_rect->left + x0;
1344 1272 : limits.h_end = tile_rect->left + x0 + w;
1345 : assert(limits.h_end <= tile_rect->right);
1346 :
1347 1272 : const int32_t unit_idx = unit_idx0 + i * hunits_per_tile + j;
1348 1272 : on_rest_unit(&limits, tile_rect, unit_idx, priv);
1349 :
1350 1272 : x0 += w;
1351 1272 : ++j;
1352 : }
1353 :
1354 672 : y0 += h;
1355 672 : ++i;
1356 : }
1357 672 : }
1358 :
1359 672 : void av1_foreach_rest_unit_in_frame(Av1Common *cm, int32_t plane,
1360 : RestTileStartVisitor on_tile,
1361 : RestUnitVisitor on_rest_unit,
1362 : void *priv) {
1363 672 : const int32_t is_uv = plane > 0;
1364 672 : const int32_t ss_y = is_uv && cm->subsampling_y;
1365 :
1366 672 : const RestorationInfo *rsi = &cm->rst_info[plane];
1367 :
1368 672 : const AV1PixelRect tile_rect = whole_frame_rect(&cm->frm_size,
1369 : cm->subsampling_x, cm->subsampling_y, is_uv);
1370 :
1371 672 : if (on_tile) on_tile(0, 0, priv);
1372 :
1373 672 : foreach_rest_unit_in_tile(&tile_rect, 0, 0, 1, rsi->horz_units_per_tile,
1374 : rsi->units_per_tile, rsi->restoration_unit_size,
1375 : ss_y, on_rest_unit, priv);
1376 672 : }
1377 540 : static void foreach_rest_unit_in_tile_seg(const AV1PixelRect *tile_rect,
1378 : int32_t tile_row, int32_t tile_col, int32_t tile_cols,
1379 : int32_t hunits_per_tile, int32_t units_per_tile,
1380 : int32_t unit_size, int32_t ss_y,
1381 : RestUnitVisitor on_rest_unit,
1382 : void *priv ,
1383 : int32_t vunits_per_tile,
1384 : PictureControlSet *picture_control_set_ptr,
1385 : uint32_t segment_index )
1386 : {
1387 : //tile_row=0
1388 : //tile_col=0
1389 : //tile_cols=1
1390 540 : const int32_t tile_w = tile_rect->right - tile_rect->left; // eq to pic_width
1391 540 : const int32_t tile_h = tile_rect->bottom - tile_rect->top; // eq to pic_height
1392 540 : const int32_t ext_size = unit_size * 3 / 2;
1393 :
1394 540 : const int32_t tile_idx = tile_col + tile_row * tile_cols; //eq to 0
1395 540 : const int32_t unit_idx0 = tile_idx * units_per_tile; //eq to 0
1396 :
1397 : uint32_t x_seg_idx;
1398 : uint32_t y_seg_idx;
1399 540 : uint32_t picture_width_in_units = hunits_per_tile;
1400 540 : uint32_t picture_height_in_units = vunits_per_tile;
1401 540 : SEGMENT_CONVERT_IDX_TO_XY(segment_index, x_seg_idx, y_seg_idx, picture_control_set_ptr->rest_segments_column_count);
1402 540 : uint32_t x_unit_start_idx = SEGMENT_START_IDX(x_seg_idx, picture_width_in_units, picture_control_set_ptr->rest_segments_column_count);
1403 540 : uint32_t x_unit_end_idx = SEGMENT_END_IDX (x_seg_idx, picture_width_in_units, picture_control_set_ptr->rest_segments_column_count);
1404 540 : uint32_t y_unit_start_idx = SEGMENT_START_IDX(y_seg_idx, picture_height_in_units, picture_control_set_ptr->rest_segments_row_count);
1405 540 : uint32_t y_unit_end_idx = SEGMENT_END_IDX (y_seg_idx, picture_height_in_units, picture_control_set_ptr->rest_segments_row_count);
1406 :
1407 540 : int32_t y0 = y_unit_start_idx * unit_size;
1408 540 : int32_t yend = ((int32_t)y_unit_end_idx == (int32_t)picture_height_in_units) ? tile_h : (int32_t)y_unit_end_idx * (int32_t)unit_size; //MIN(y_unit_end_idx * unit_size , tile_h);
1409 540 : int32_t i = y_unit_start_idx;
1410 :
1411 1080 : while (y0 < yend) {
1412 540 : int32_t remaining_h = tile_h - y0;
1413 540 : int32_t h = (remaining_h < ext_size) ? remaining_h : unit_size; //the area at the pic boundary should have size>= half unit_size to be an independent unit.
1414 : //if not, it will be added to the last complete unit, increasing its size to up to 3/2 unit_size.
1415 :
1416 : RestorationTileLimits limits;
1417 540 : limits.v_start = tile_rect->top + y0;
1418 540 : limits.v_end = tile_rect->top + y0 + h;
1419 : assert(limits.v_end <= tile_rect->bottom);
1420 : // Offset the tile upwards to align with the restoration processing stripe
1421 540 : const int32_t voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1422 540 : limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
1423 540 : if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
1424 :
1425 540 : int32_t x0 = x_unit_start_idx * unit_size;
1426 540 : int32_t xend = ((int32_t)x_unit_end_idx == (int32_t)picture_width_in_units) ? tile_w : (int32_t)x_unit_end_idx * (int32_t)unit_size; //MIN(x_unit_end_idx * unit_size,tile_w);
1427 540 : int32_t j = x_unit_start_idx;
1428 :
1429 1440 : while (x0 < xend) {
1430 900 : int32_t remaining_w = tile_w - x0;
1431 900 : int32_t w = (remaining_w < ext_size) ? remaining_w : unit_size;
1432 :
1433 900 : limits.h_start = tile_rect->left + x0;
1434 900 : limits.h_end = tile_rect->left + x0 + w;
1435 : assert(limits.h_end <= tile_rect->right);
1436 :
1437 900 : const int32_t unit_idx = unit_idx0 + i * hunits_per_tile + j;
1438 900 : on_rest_unit(&limits, tile_rect, unit_idx, priv);
1439 :
1440 900 : x0 += w;
1441 900 : ++j;
1442 : }
1443 :
1444 540 : y0 += h;
1445 540 : ++i;
1446 : }
1447 540 : }
1448 540 : void av1_foreach_rest_unit_in_frame_seg(Av1Common *cm, int32_t plane,
1449 : RestTileStartVisitor on_tile,
1450 : RestUnitVisitor on_rest_unit,
1451 : void *priv,
1452 : PictureControlSet *picture_control_set_ptr,
1453 : uint32_t segment_index)
1454 : {
1455 540 : const int32_t is_uv = plane > 0;
1456 540 : const int32_t ss_y = is_uv && cm->subsampling_y;
1457 :
1458 540 : const RestorationInfo *rsi = &cm->rst_info[plane];
1459 :
1460 540 : const AV1PixelRect tile_rect = whole_frame_rect(&cm->frm_size,
1461 : cm->subsampling_x, cm->subsampling_y, is_uv);
1462 :
1463 540 : if (on_tile) on_tile(0, 0, priv); //will set rsc->tile_strip0=0;
1464 :
1465 540 : foreach_rest_unit_in_tile_seg(&tile_rect, 0, 0, 1, rsi->horz_units_per_tile,
1466 : rsi->units_per_tile, rsi->restoration_unit_size,
1467 : ss_y, on_rest_unit, priv,
1468 : rsi->vert_units_per_tile,
1469 : picture_control_set_ptr,
1470 : segment_index);
1471 540 : }
1472 :
1473 117030 : int32_t eb_av1_loop_restoration_corners_in_sb(Av1Common *cm, int32_t plane,
1474 : int32_t mi_row, int32_t mi_col, BlockSize bsize,
1475 : int32_t *rcol0, int32_t *rcol1, int32_t *rrow0,
1476 : int32_t *rrow1, int32_t *tile_tl_idx) {
1477 : assert(rcol0 && rcol1 && rrow0 && rrow1);
1478 117030 : if (bsize != cm->p_pcs_ptr->sequence_control_set_ptr->seq_header.sb_size) return 0;
1479 21583 : if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
1480 :
1481 : // assert(!cm->all_lossless);
1482 :
1483 4303 : const int32_t is_uv = plane > 0;
1484 :
1485 4303 : const AV1PixelRect tile_rect = whole_frame_rect(&cm->frm_size,
1486 : cm->subsampling_x, cm->subsampling_y, is_uv);
1487 4320 : const int32_t tile_w = tile_rect.right - tile_rect.left;
1488 4320 : const int32_t tile_h = tile_rect.bottom - tile_rect.top;
1489 :
1490 4320 : const int32_t mi_top = 0;
1491 4320 : const int32_t mi_left = 0;
1492 :
1493 : // Compute the mi-unit corners of the superblock relative to the top-left of
1494 : // the tile
1495 4320 : const int32_t mi_rel_row0 = mi_row - mi_top;
1496 4320 : const int32_t mi_rel_col0 = mi_col - mi_left;
1497 4320 : const int32_t mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
1498 4320 : const int32_t mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];
1499 :
1500 4320 : const RestorationInfo *rsi = &cm->rst_info[plane];
1501 4320 : const int32_t size = rsi->restoration_unit_size;
1502 :
1503 : // Calculate the number of restoration units in this tile (which might be
1504 : // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
1505 4320 : const int32_t horz_units = count_units_in_tile(size, tile_w);
1506 4320 : const int32_t vert_units = count_units_in_tile(size, tile_h);
1507 :
1508 : // The size of an MI-unit on this plane of the image
1509 4320 : const int32_t ss_x = is_uv && cm->subsampling_x;
1510 4320 : const int32_t ss_y = is_uv && cm->subsampling_y;
1511 4320 : const int32_t mi_size_x = MI_SIZE >> ss_x;
1512 4320 : const int32_t mi_size_y = MI_SIZE >> ss_y;
1513 :
1514 : // Write m for the relative mi column or row, D for the superres denominator
1515 : // and N for the superres numerator. If u is the upscaled (called "unscaled"
1516 : // elsewhere) pixel offset then we can write the downscaled pixel offset in
1517 : // two ways as:
1518 : //
1519 : // MI_SIZE * m = N / D u
1520 : //
1521 : // from which we get u = D * MI_SIZE * m / N
1522 :
1523 4320 : const int32_t mi_to_num_x = mi_size_x;//CHKN av1_superres_unscaled(cm) ? mi_size_x : mi_size_x * cm->superres_scale_denominator;
1524 4320 : const int32_t mi_to_num_y = mi_size_y;
1525 4320 : const int32_t denom_x = size;//CHKN av1_superres_unscaled(cm) ? size : size * SCALE_NUMERATOR;
1526 4320 : const int32_t denom_y = size;
1527 :
1528 4320 : const int32_t rnd_x = denom_x - 1;
1529 4320 : const int32_t rnd_y = denom_y - 1;
1530 :
1531 : // rcol0/rrow0 should be the first column/row of restoration units (relative
1532 : // to the top-left of the tile) that doesn't start left/below of
1533 : // mi_col/mi_row. For this calculation, we need to round up the division (if
1534 : // the sb starts at runit column 10.1, the first matching runit has column
1535 : // index 11)
1536 4320 : *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
1537 4320 : *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;
1538 :
1539 : // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
1540 : // below-right. If we're at the bottom or right of the tile, this restoration
1541 : // unit might not exist, in which case we'll clamp accordingly.
1542 4320 : *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
1543 4320 : *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
1544 :
1545 4320 : const int32_t tile_idx = 0;
1546 4320 : *tile_tl_idx = tile_idx * rsi->units_per_tile;
1547 :
1548 4320 : return *rcol0 < *rcol1 && *rrow0 < *rrow1;
1549 : }
1550 :
// Pad every row of a pixel buffer horizontally by replicating its edge
// pixels: the `extend` pixels to the left of each row receive the row's first
// pixel and the `extend` pixels to its right receive the row's last pixel.
//
//   buf               first pixel of the first row (as a byte pointer)
//   width, extend     counts in pixels
//   height            number of rows to process
//   stride            distance between rows, added to the byte pointer
//   use_highbitdepth  non-zero: rows are accessed as uint16_t pixels
void extend_lines(uint8_t *buf, int32_t width, int32_t height, int32_t stride,
                  int32_t extend, int32_t use_highbitdepth) {
    uint8_t *row = buf;
    for (int32_t remaining = height; remaining > 0; --remaining) {
        if (!use_highbitdepth) {
            memset(row - extend, row[0], extend);
            memset(row + width, row[width - 1], extend);
        }
        else {
            uint16_t *row16 = (uint16_t *)row;
            eb_aom_memset16(row16 - extend, row16[0], extend);
            eb_aom_memset16(row16 + width, row16[width - 1], extend);
        }
        row += stride;
    }
}
1567 :
1568 3510 : static void save_deblock_boundary_lines(
1569 : uint8_t *src_buf, int32_t src_stride, int32_t src_width, int32_t src_height,
1570 : const Av1Common *cm, int32_t plane, int32_t row,
1571 : int32_t stripe, int32_t use_highbd, int32_t is_above,
1572 : RestorationStripeBoundaries *boundaries)
1573 : {
1574 3510 : const int32_t is_uv = plane > 0;
1575 3510 : src_stride = src_stride << use_highbd;
1576 3510 : const uint8_t *src_rows = src_buf + row * src_stride;
1577 :
1578 3510 : uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1579 3510 : : boundaries->stripe_boundary_below;
1580 3510 : uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1581 3510 : const int32_t bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1582 3510 : uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1583 :
1584 : // There is a rare case in which a processing stripe can end 1px above the
1585 : // crop border. In this case, we do want to use deblocked pixels from below
1586 : // the stripe (hence why we ended up in this function), but instead of
1587 : // fetching 2 "below" rows we need to fetch one and duplicate it.
1588 : // This is equivalent to clamping the sample locations against the crop border
1589 3510 : const int32_t lines_to_save =
1590 3510 : AOMMIN(RESTORATION_CTX_VERT, src_height - row);
1591 :
1592 : assert(lines_to_save == 1 || lines_to_save == 2);
1593 :
1594 : int32_t upscaled_width;
1595 : int32_t line_bytes;
1596 :
1597 3510 : if (!av1_superres_unscaled(&cm->frm_size)) {
1598 0 : int32_t sx = is_uv && cm->subsampling_x;
1599 0 : upscaled_width = (cm->frm_size.superres_upscaled_width + sx) >> sx;
1600 0 : line_bytes = upscaled_width << use_highbd;
1601 :
1602 0 : av1_upscale_normative_rows(cm, (src_rows),
1603 : src_stride >> use_highbd, (bdry_rows),
1604 : boundaries->stripe_boundary_stride,
1605 : lines_to_save, sx, cm->bit_depth);
1606 : }
1607 : else {
1608 3510 : upscaled_width = src_width;
1609 3510 : line_bytes = upscaled_width << use_highbd;
1610 10530 : for (int32_t i = 0; i < lines_to_save; i++) {
1611 7020 : memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
1612 : line_bytes);
1613 : }
1614 : }
1615 : // If we only saved one line, then copy it into the second line buffer
1616 3510 : if (lines_to_save == 1)
1617 0 : memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
1618 :
1619 3510 : extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1620 : RESTORATION_EXTRA_HORZ, use_highbd);
1621 3510 : }
1622 :
1623 702 : static void save_cdef_boundary_lines(uint8_t *src_buf, int32_t src_stride,
1624 : int32_t src_width, const Av1Common *cm, int32_t plane, int32_t row,
1625 : int32_t stripe, int32_t use_highbd, int32_t is_above,
1626 : RestorationStripeBoundaries *boundaries)
1627 : {
1628 702 : const int32_t is_uv = plane > 0;
1629 702 : src_stride = src_stride << use_highbd;
1630 702 : const uint8_t *src_rows = src_buf + row * src_stride;
1631 :
1632 702 : uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
1633 702 : : boundaries->stripe_boundary_below;
1634 702 : uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
1635 702 : const int32_t bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
1636 702 : uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
1637 :
1638 : // At the point where this function is called, we've already applied
1639 : // superres. So we don't need to extend the lines here, we can just
1640 : // pull directly from the topmost row of the upscaled frame.
1641 702 : const int32_t ss_x = is_uv && cm->subsampling_x;
1642 702 : const int32_t upscaled_width = av1_superres_unscaled(&cm->frm_size)
1643 : ? src_width
1644 702 : : (cm->frm_size.superres_upscaled_width + ss_x) >> ss_x;
1645 702 : const int32_t line_bytes = upscaled_width << use_highbd;
1646 2106 : for (int32_t i = 0; i < RESTORATION_CTX_VERT; i++) {
1647 : // Copy the line at 'row' into both context lines. This is because
1648 : // we want to (effectively) extend the outermost row of CDEF data
1649 : // from this tile to produce a border, rather than using deblocked
1650 : // pixels from the tile above/below.
1651 1404 : memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
1652 : }
1653 702 : extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
1654 : RESTORATION_EXTRA_HORZ, use_highbd);
1655 702 : }
1656 :
1657 702 : void save_tile_row_boundary_lines(uint8_t *src, int32_t src_stride,
1658 : int32_t src_width, int32_t src_height, int32_t use_highbd, int32_t plane,
1659 : Av1Common *cm, int32_t after_cdef, RestorationStripeBoundaries *boundaries)
1660 : {
1661 702 : const int32_t is_uv = plane > 0;
1662 702 : const int32_t ss_y = is_uv && cm->subsampling_y;
1663 702 : const int32_t stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1664 702 : const int32_t stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
1665 :
1666 : // Get the tile rectangle, with height rounded up to the next multiple of 8
1667 : // luma pixels (only relevant for the bottom tile of the frame)
1668 702 : const AV1PixelRect tile_rect = whole_frame_rect(&cm->frm_size, cm->subsampling_x,
1669 : cm->subsampling_y, is_uv);
1670 702 : const int32_t stripe0 = 0;
1671 :
1672 702 : int32_t plane_height = ROUND_POWER_OF_TWO(cm->frm_size.frame_height, ss_y);
1673 :
1674 : int32_t tile_stripe;
1675 4914 : for (tile_stripe = 0;; ++tile_stripe) {
1676 4914 : const int32_t rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
1677 4914 : const int32_t y0 = tile_rect.top + rel_y0;
1678 4914 : if (y0 >= tile_rect.bottom) break;
1679 :
1680 4212 : const int32_t rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
1681 4212 : const int32_t y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
1682 :
1683 4212 : const int32_t frame_stripe = stripe0 + tile_stripe;
1684 :
1685 : int32_t use_deblock_above, use_deblock_below;
1686 : // In this case, we should only use CDEF pixels at the top
1687 : // and bottom of the frame as a whole; internal tile boundaries
1688 : // can use deblocked pixels from adjacent tiles for context.
1689 4212 : use_deblock_above = (frame_stripe > 0);
1690 4212 : use_deblock_below = (y1 < plane_height);
1691 :
1692 4212 : if (!after_cdef) {
1693 : // Save deblocked context where needed.
1694 2106 : if (use_deblock_above) {
1695 1755 : save_deblock_boundary_lines(src, src_stride, src_width, src_height,
1696 : cm, plane, y0 - RESTORATION_CTX_VERT,
1697 : frame_stripe, use_highbd, 1, boundaries);
1698 : }
1699 2106 : if (use_deblock_below) {
1700 1755 : save_deblock_boundary_lines(src, src_stride, src_width, src_height,
1701 : cm, plane, y1, frame_stripe, use_highbd, 0, boundaries);
1702 : }
1703 : }
1704 : else {
1705 : // Save CDEF context where needed. Note that we need to save the CDEF
1706 : // context for a particular boundary iff we *didn't* save deblocked
1707 : // context for that boundary.
1708 : //
1709 : // In addition, we need to save copies of the outermost line within
1710 : // the tile, rather than using data from outside the tile.
1711 2106 : if (!use_deblock_above) {
1712 351 : save_cdef_boundary_lines(src, src_stride, src_width,
1713 : cm, plane, y0, frame_stripe, use_highbd, 1, boundaries);
1714 : }
1715 2106 : if (!use_deblock_below) {
1716 351 : save_cdef_boundary_lines(src, src_stride, src_width,
1717 : cm, plane, y1 - 1, frame_stripe, use_highbd, 0, boundaries);
1718 : }
1719 : }
1720 : }
1721 702 : }
1722 :
1723 : // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1724 : // lines to be used as boundary in the loop restoration process. The
1725 : // lines are saved in rst_internal.stripe_boundary_lines
1726 120 : void eb_av1_loop_restoration_save_boundary_lines(const Yv12BufferConfig *frame,
1727 : Av1Common *cm, int32_t after_cdef) {
1728 120 : const int32_t num_planes = 3;// av1_num_planes(cm);
1729 120 : const int32_t use_highbd = cm->use_highbitdepth;
1730 :
1731 480 : for (int32_t p = 0; p < num_planes; ++p) {
1732 360 : const int32_t is_uv = p > 0;
1733 360 : int32_t crop_width = frame->crop_widths[is_uv];
1734 360 : int32_t crop_height = frame->crop_heights[is_uv];
1735 360 : uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[p]);
1736 360 : int32_t src_stride = frame->strides[is_uv];
1737 360 : RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
1738 :
1739 360 : save_tile_row_boundary_lines(src_buf, src_stride, crop_width, crop_height,
1740 : use_highbd, p, cm, after_cdef, boundaries);
1741 : }
1742 120 : }
1743 :
1744 : // Assumes cm->rst_info[p].restoration_unit_size is already initialized
1745 :
1746 144 : EbErrorType eb_av1_alloc_restoration_buffers(Av1Common *cm) {
1747 144 : EbErrorType return_error = EB_ErrorNone;
1748 144 : const int32_t num_planes = 3;// av1_num_planes(cm);
1749 576 : for (int32_t p = 0; p < num_planes; ++p)
1750 432 : return_error = eb_av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
1751 :
1752 : //CHKNif (cm->rst_tmpbuf == NULL)
1753 : {
1754 : //CHKN CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
1755 : //cm->rst_tmpbuf = (int32_t *)eb_aom_memalign(16, RESTORATION_TMPBUF_SIZE);
1756 :
1757 144 : EB_MALLOC_ALIGNED(cm->rst_tmpbuf, RESTORATION_TMPBUF_SIZE);
1758 : }
1759 :
1760 : // For striped loop restoration, we divide each row of tiles into "stripes",
1761 : // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
1762 : // luma pixels to match the output from CDEF. We will need to store 2 *
1763 : // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be
1764 : // able to quickly answer the question "Where is the <n>'th stripe for tile
1765 : // row <m>?" To make that efficient, we generate the rst_last_stripe array.
1766 144 : int32_t num_stripes = 0;
1767 288 : for (int32_t i = 0; i < 1/*cm->tile_rows*/; ++i) {
1768 : //TileInfo tile_info;
1769 : //eb_av1_tile_set_row(&tile_info, cm, i);
1770 :
1771 144 : const int32_t mi_h = cm->mi_rows;// tile_info.mi_row_end - tile_info.mi_row_start;
1772 144 : const int32_t ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
1773 144 : const int32_t tile_stripes = (ext_h + 63) / 64;
1774 144 : num_stripes += tile_stripes;
1775 144 : cm->rst_end_stripe[i] = num_stripes;
1776 : }
1777 :
1778 : // Now we need to allocate enough space to store the line buffers for the
1779 : // stripes
1780 144 : const int32_t frame_w = cm->frm_size.superres_upscaled_width;
1781 144 : const int32_t use_highbd = cm->use_highbitdepth ? 1 : 0;
1782 :
1783 576 : for (int32_t p = 0; p < num_planes; ++p) {
1784 432 : const int32_t is_uv = p > 0;
1785 432 : const int32_t ss_x = is_uv && cm->subsampling_x;
1786 432 : const int32_t plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
1787 432 : const int32_t stride = ALIGN_POWER_OF_TWO(plane_w, 5);
1788 432 : const int32_t buf_size = num_stripes * stride * RESTORATION_CTX_VERT
1789 : << use_highbd;
1790 432 : RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
1791 :
1792 : {
1793 432 : EB_MALLOC(boundaries->stripe_boundary_above, buf_size);
1794 432 : EB_MALLOC(boundaries->stripe_boundary_below, buf_size);
1795 :
1796 432 : boundaries->stripe_boundary_size = buf_size;
1797 : }
1798 432 : boundaries->stripe_boundary_stride = stride;
1799 : }
1800 :
1801 144 : return return_error;
1802 : }
|