Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 : #include <stdio.h>
12 : #include <stdlib.h>
13 : #include <math.h>
14 : #include <string.h>
15 :
16 : #include "EbCdef.h"
17 : #include "stdint.h"
18 : #include "EbCodingUnit.h"
19 : #include "EbEncDecProcess.h"
20 : #include "aom_dsp_rtcd.h"
21 :
22 : extern int16_t eb_av1_ac_quant_Q3(int32_t qindex, int32_t delta, AomBitDepth bit_depth);
23 :
24 : //-------memory stuff
25 :
26 : #define ADDRESS_STORAGE_SIZE sizeof(size_t)
27 : #define DEFAULT_ALIGNMENT (2 * sizeof(void *))
28 : #define AOM_MAX_ALLOCABLE_MEMORY 8589934592 // 8 GB
29 : /*returns an addr aligned to the byte boundary specified by align*/
30 : #define align_addr(addr, align) \
31 : (void *)(((size_t)(addr) + ((align)-1)) & ~(size_t)((align)-1))
32 :
33 : // Returns 0 in case of overflow of nmemb * size.
34 3562 : static int32_t check_size_argument_overflow(uint64_t nmemb, uint64_t size) {
35 3562 : const uint64_t total_size = nmemb * size;
36 3562 : if (nmemb == 0) return 1;
37 3562 : if (size > AOM_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
38 : if (total_size != (size_t)total_size) return 0;
39 3562 : return 1;
40 : }
41 :
42 3562 : static size_t GetAlignedMallocSize(size_t size, size_t align) {
43 3562 : return size + align - 1 + ADDRESS_STORAGE_SIZE;
44 : }
45 :
46 7124 : static size_t *GetMallocAddressLocation(void *const mem) {
47 7124 : return ((size_t *)mem) - 1;
48 : }
49 :
50 3562 : static void SetActualMallocAddress(void *const mem,
51 : const void *const malloc_addr) {
52 3562 : size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
53 3562 : *malloc_addr_location = (size_t)malloc_addr;
54 3562 : }
55 :
56 3562 : static void *GetActualMallocAddress(void *const mem) {
57 3562 : const size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
58 3562 : return (void *)(*malloc_addr_location);
59 : }
60 :
61 3562 : void *eb_aom_memalign(size_t align, size_t size) {
62 3562 : void *x = NULL;
63 3562 : const size_t aligned_size = GetAlignedMallocSize(size, align);
64 : #if defined(AOM_MAX_ALLOCABLE_MEMORY)
65 3562 : if (!check_size_argument_overflow(1, aligned_size)) return NULL;
66 : #endif
67 3562 : void *const addr = malloc(aligned_size);
68 3562 : if (addr) {
69 3562 : x = align_addr((uint8_t *)addr + ADDRESS_STORAGE_SIZE, align);
70 3562 : SetActualMallocAddress(x, addr);
71 : }
72 3562 : return x;
73 : }
74 :
75 3202 : void *eb_aom_malloc(size_t size) { return eb_aom_memalign(DEFAULT_ALIGNMENT, size); }
76 :
77 3562 : void eb_aom_free(void *memblk) {
78 3562 : if (memblk) {
79 3562 : void *addr = GetActualMallocAddress(memblk);
80 3562 : free(addr);
81 : }
82 3562 : }
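/* Illustrative usage sketch (not part of the original source): eb_aom_memalign()
   over-allocates by align - 1 plus ADDRESS_STORAGE_SIZE, rounds the returned
   pointer up to the requested boundary, and stores the raw malloc() address in
   the size_t slot just before it so eb_aom_free() can recover and release it. */
static void example_aligned_alloc_usage(void) {
    uint8_t *buf = (uint8_t *)eb_aom_memalign(32, 1024); /* 32-byte-aligned, 1024 bytes */
    if (buf) {
        memset(buf, 0, 1024); /* buf is usable like any malloc'd buffer */
        eb_aom_free(buf);     /* frees the underlying malloc() block, not buf itself */
    }
}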
83 :
84 0 : void *eb_aom_memset16(void *dest, int32_t val, size_t length) {
85 : size_t i;
86 0 : uint16_t *dest16 = (uint16_t *)dest;
87 0 : for (i = 0; i < length; i++) *dest16++ = (uint16_t)val;
88 0 : return dest;
89 : }
90 : //-------------------------------
91 :
92 : extern INLINE int32_t get_msb(uint32_t n);
93 :
94 0 : static INLINE int32_t sign(int32_t i) { return i < 0 ? -1 : 1; }
95 0 : static INLINE int32_t constrain(int32_t diff, int32_t threshold, int32_t damping) {
96 0 : if (!threshold) return 0;
97 :
98 0 : const int32_t shift = AOMMAX(0, damping - get_msb(threshold));
99 0 : return sign(diff) *
100 0 : AOMMIN(abs(diff), AOMMAX(0, threshold - (abs(diff) >> shift)));
101 : }
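/* Worked example (illustrative): with threshold = 4 and damping = 5,
   get_msb(4) = 2 so shift = 3, and:
     constrain( 2, 4, 5) = min( 2, 4 - ( 2 >> 3)) = 2   -- small differences pass through
     constrain(10, 4, 5) = min(10, 4 - (10 >> 3)) = 3   -- moderate ones are capped near the threshold
     constrain(40, 4, 5) = min(40, max(0, 4 - 5)) = 0   -- large ones (likely real edges) are rejected
   so the filter taps only smooth across differences small enough to be ringing. */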
102 :
103 : /* Generated from gen_filter_tables.c. */
104 : DECLARE_ALIGNED(16, const int32_t, eb_cdef_directions[8][2]) = {
105 : { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
106 : { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
107 : { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 },
108 : { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 },
109 : { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 },
110 : { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 },
111 : { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
112 : { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }
113 : };
114 :
115 : /* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
116 : The search minimizes the weighted variance along all the lines in a
117 : particular direction, i.e. the squared error between the input and a
118 : "predicted" block where each pixel is replaced by the average along a line
119 : in a particular direction. Since each direction has the same sum(x^2) term,
120 : that term is never computed. See Section 2, step 2, of:
121 : http://jmvalin.ca/notes/intra_paint.pdf */
122 0 : int32_t eb_cdef_find_dir_c(const uint16_t *img, int32_t stride, int32_t *var,
123 : int32_t coeff_shift) {
124 : int32_t i;
125 0 : int32_t cost[8] = { 0 };
126 0 : int32_t partial[8][15] = { { 0 } };
127 0 : int32_t best_cost = 0;
128 0 : int32_t best_dir = 0;
129 : /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
130 : The output is then 840 times larger, but we don't care for finding
131 : the max. */
132 : static const int32_t div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 };
133 0 : for (i = 0; i < 8; i++) {
134 : int32_t j;
135 0 : for (j = 0; j < 8; j++) {
136 : int32_t x;
137 : /* We subtract 128 here to reduce the maximum range of the squared
138 : partial sums. */
139 0 : x = (img[i * stride + j] >> coeff_shift) - 128;
140 0 : partial[0][i + j] += x;
141 0 : partial[1][i + j / 2] += x;
142 0 : partial[2][i] += x;
143 0 : partial[3][3 + i - j / 2] += x;
144 0 : partial[4][7 + i - j] += x;
145 0 : partial[5][3 - i / 2 + j] += x;
146 0 : partial[6][j] += x;
147 0 : partial[7][i / 2 + j] += x;
148 : }
149 : }
150 0 : for (i = 0; i < 8; i++) {
151 0 : cost[2] += partial[2][i] * partial[2][i];
152 0 : cost[6] += partial[6][i] * partial[6][i];
153 : }
154 0 : cost[2] *= div_table[8];
155 0 : cost[6] *= div_table[8];
156 0 : for (i = 0; i < 7; i++) {
157 0 : cost[0] += (partial[0][i] * partial[0][i] +
158 0 : partial[0][14 - i] * partial[0][14 - i]) *
159 0 : div_table[i + 1];
160 0 : cost[4] += (partial[4][i] * partial[4][i] +
161 0 : partial[4][14 - i] * partial[4][14 - i]) *
162 0 : div_table[i + 1];
163 : }
164 0 : cost[0] += partial[0][7] * partial[0][7] * div_table[8];
165 0 : cost[4] += partial[4][7] * partial[4][7] * div_table[8];
166 0 : for (i = 1; i < 8; i += 2) {
167 : int32_t j;
168 0 : for (j = 0; j < 4 + 1; j++)
169 0 : cost[i] += partial[i][3 + j] * partial[i][3 + j];
170 0 : cost[i] *= div_table[8];
171 0 : for (j = 0; j < 4 - 1; j++) {
172 0 : cost[i] += (partial[i][j] * partial[i][j] +
173 0 : partial[i][10 - j] * partial[i][10 - j]) *
174 0 : div_table[2 * j + 2];
175 : }
176 : }
177 0 : for (i = 0; i < 8; i++) {
178 0 : if (cost[i] > best_cost) {
179 0 : best_cost = cost[i];
180 0 : best_dir = i;
181 : }
182 : }
183 : /* Difference between the optimal variance and the variance along the
184 : orthogonal direction. Again, the sum(x^2) terms cancel out. */
185 0 : *var = best_cost - cost[(best_dir + 4) & 7];
186 : /* We'd normally divide by 840, but dividing by 1024 is close enough
187 : for what we're going to do with this. */
188 0 : *var >>= 10;
189 0 : return best_dir;
190 : }
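/* Illustrative usage sketch (assumed helper, not part of the original source):
   the detector reads an 8x8 block of 16-bit samples and returns the dominant
   direction (0 = 45-degree up-right, 2 = horizontal, and so on), writing a
   directionality measure to *var that adjust_strength() later uses to scale the
   primary filter strength. */
static int32_t example_find_dir_8x8(const uint16_t *blk /* top-left of an 8x8 block */) {
    int32_t var = 0;
    /* coeff_shift is bit_depth - 8, i.e. 0 for 8-bit content. */
    const int32_t dir = eb_cdef_find_dir_c(blk, CDEF_BSTRIDE, &var, 0);
    return dir; /* var == 0 indicates a flat block with no reliable direction */
}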
191 :
192 : const int32_t eb_cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
193 : const int32_t eb_cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
194 :
195 : /* Smooth in the direction detected. */
196 0 : void eb_cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int32_t dstride,
197 : const uint16_t *in, int32_t pri_strength, int32_t sec_strength,
198 : int32_t dir, int32_t pri_damping, int32_t sec_damping, int32_t bsize,
199 : int32_t coeff_shift) {
200 : int32_t i, j, k;
201 0 : const int32_t s = CDEF_BSTRIDE;
202 0 : const int32_t *pri_taps = eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
203 0 : const int32_t *sec_taps = eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
204 :
205 0 : for (i = 0; i < (4 << (int32_t)(bsize == BLOCK_8X8 || bsize == BLOCK_4X8)); i++) {
206 0 : for (j = 0; j < (4 << (int32_t)(bsize == BLOCK_8X8 || bsize == BLOCK_8X4)); j++) {
207 0 : int16_t sum = 0;
208 : int16_t y;
209 0 : int16_t x = in[i * s + j];
210 0 : int32_t max = x;
211 0 : int32_t min = x;
212 0 : for (k = 0; k < 2; k++) {
213 0 : int16_t p0 = in[i * s + j + eb_cdef_directions[dir][k]];
214 0 : int16_t p1 = in[i * s + j - eb_cdef_directions[dir][k]];
215 0 : sum += (int16_t)(pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping));
216 0 : sum += (int16_t)(pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping));
217 0 : if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max);
218 0 : if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max);
219 0 : min = AOMMIN(p0, min);
220 0 : min = AOMMIN(p1, min);
221 0 : int16_t s0 = in[i * s + j + eb_cdef_directions[(dir + 2) & 7][k]];
222 0 : int16_t s1 = in[i * s + j - eb_cdef_directions[(dir + 2) & 7][k]];
223 0 : int16_t s2 = in[i * s + j + eb_cdef_directions[(dir + 6) & 7][k]];
224 0 : int16_t s3 = in[i * s + j - eb_cdef_directions[(dir + 6) & 7][k]];
225 0 : if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max);
226 0 : if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max);
227 0 : if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max);
228 0 : if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max);
229 0 : min = AOMMIN(s0, min);
230 0 : min = AOMMIN(s1, min);
231 0 : min = AOMMIN(s2, min);
232 0 : min = AOMMIN(s3, min);
233 0 : sum += (int16_t)(sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping));
234 0 : sum += (int16_t)(sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping));
235 0 : sum += (int16_t)(sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping));
236 0 : sum += (int16_t)(sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping));
237 : }
238 0 : y = (int16_t)clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max);
239 0 : if (dst8)
240 0 : dst8[i * dstride + j] = (uint8_t)y;
241 : else
242 0 : dst16[i * dstride + j] = (uint16_t)y;
243 : }
244 : }
245 0 : }
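/* Note on the final rounding above (illustrative): ((8 + sum - (sum < 0)) >> 4)
   is a fixed-point divide of the tap sum by 16 with round-to-nearest; the extra
   "- (sum < 0)" term makes the rounding symmetric for negative sums, e.g.
   sum = 24 -> +2 and sum = -24 -> -2. The result is then clamped to the
   [min, max] range observed among the valid (non-CDEF_VERY_LARGE) taps, so the
   filter can never create a new extreme value. */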
246 4415 : int32_t get_cdef_gi_step(
247 : int8_t cdef_filter_mode) {
248 4415 : int32_t gi_step = cdef_filter_mode == 1 ? 1 : cdef_filter_mode == 2 ? 4 : cdef_filter_mode == 3 ? 8 : cdef_filter_mode == 4 ? 16 : 64;
249 4415 : return gi_step;
250 : }
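/* Summary of the mapping above (illustrative): cdef_filter_mode limits how many
   of the strength presets the CDEF search steps through,
     mode 1 -> 1, mode 2 -> 4, mode 3 -> 8, mode 4 -> 16, any other value -> 64,
   which presumably trades search time for filtering accuracy. */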
251 : /* Compute the primary filter strength for an 8x8 block based on the
252 : directional variance difference. A high variance difference means
253 : that we have a highly directional pattern (e.g. a high contrast
254 : edge), so we can apply more deringing. A low variance means that we
255 : either have a low contrast edge, or a non-directional texture, so
256 : we want to be careful not to blur. */
257 1172370 : static INLINE int32_t adjust_strength(int32_t strength, int32_t var) {
258 1172370 : const int32_t i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0;
259 : /* We use the variance of 8x8 blocks to adjust the strength. */
260 1169690 : return var ? (strength * (4 + i) + 8) >> 4 : 0;
261 : }
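/* Worked example (illustrative): with strength = 8,
     var = 0      -> returns 0                        (flat block, no filtering)
     var = 64     -> i = 0,  (8 * 4  + 8) >> 4 = 2    (weak direction, quarter strength)
     var = 262144 -> i = 12, (8 * 16 + 8) >> 4 = 8    (strong direction, full strength)
   so the primary strength ramps up with the directional variance reported by
   eb_cdef_find_dir(). */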
262 :
263 106007 : void eb_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int32_t dstride, uint16_t *in,
264 : int32_t xdec, int32_t ydec, int32_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
265 : int32_t *dirinit, int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS], int32_t pli,
266 : cdef_list *dlist, int32_t cdef_count, int32_t level,
267 : int32_t sec_strength, int32_t pri_damping, int32_t sec_damping,
268 : int32_t coeff_shift) {
269 : int32_t bi;
270 : int32_t bx;
271 : int32_t by;
272 : int32_t bsize, bsizex, bsizey;
273 :
274 106007 : int32_t pri_strength = level << coeff_shift;
275 106007 : sec_strength <<= coeff_shift;
276 106007 : sec_damping += coeff_shift - (pli != AOM_PLANE_Y);
277 106007 : pri_damping += coeff_shift - (pli != AOM_PLANE_Y);
278 106007 : bsize =
279 106007 : ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
280 106007 : bsizex = 3 - xdec;
281 106007 : bsizey = 3 - ydec;
282 106007 : if (dirinit && pri_strength == 0 && sec_strength == 0) {
283 : // If we're here, both primary and secondary strengths are 0, and
284 : // we still haven't written anything to y[] yet, so we just copy
285 : // the input to y[]. This is necessary only for eb_av1_cdef_search()
286 : // and only eb_av1_cdef_search() sets dirinit.
287 119198 : for (bi = 0; bi < cdef_count; bi++) {
288 114898 : by = dlist[bi].by << bsizey;
289 114898 : bx = dlist[bi].bx << bsizex;
290 : int32_t iy, ix;
291 : // TODO(stemidts/jmvalin): SIMD optimisations
292 114898 : if (dst8) {
293 718659 : for (iy = 0; iy < 1 << bsizey; iy++)
294 4177440 : for (ix = 0; ix < 1 << bsizex; ix++)
295 3573680 : dst8[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
296 3573680 : (uint8_t)in[(by + iy) * CDEF_BSTRIDE + bx + ix];
297 : }
298 : else {
299 0 : for (iy = 0; iy < 1 << bsizey; iy++)
300 0 : for (ix = 0; ix < 1 << bsizex; ix++)
301 0 : dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
302 0 : in[(by + iy) * CDEF_BSTRIDE + bx + ix];
303 : }
304 : }
305 4300 : return;
306 : }
307 :
308 101707 : if (pli == 0) {
309 33927 : if (!dirinit || !*dirinit) {
310 88955 : for (bi = 0; bi < cdef_count; bi++) {
311 85796 : by = dlist[bi].by;
312 85796 : bx = dlist[bi].bx;
313 :
314 85800 : dir[by][bx] = eb_cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
315 85796 : CDEF_BSTRIDE, &var[by][bx], coeff_shift);
316 : }
317 3159 : if (dirinit) *dirinit = 1;
318 : }
319 : }
320 101711 : if (pli == 1 && xdec != ydec) {
321 0 : for (bi = 0; bi < cdef_count; bi++) {
322 0 : /*static*/ const int32_t conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 };
323 0 : /*static*/ const int32_t conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 };
324 0 : by = dlist[bi].by;
325 0 : bx = dlist[bi].bx;
326 0 : dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
327 : }
328 : }
329 :
330 3561860 : for (bi = 0; bi < cdef_count; bi++) {
331 3413200 : int32_t t = dlist[bi].skip ? 0 : pri_strength;
332 3413200 : int32_t s = dlist[bi].skip ? 0 : sec_strength;
333 3413200 : by = dlist[bi].by;
334 3413200 : bx = dlist[bi].bx;
335 3413200 : if (dst8)
336 13462200 : eb_cdef_filter_block(
337 3449740 : &dst8[dirinit ? bi << (bsizex + bsizey) : (by << bsizey) * dstride + (bx << bsizex)],
338 : NULL,
339 : dirinit ? 1 << bsizex : dstride,
340 3449740 : &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
341 1172840 : (pli ? t : adjust_strength(t, var[by][bx])), s,
342 3098780 : t ? dir[by][bx] : 0, pri_damping, sec_damping, bsize,
343 : coeff_shift);
344 : else
345 0 : eb_cdef_filter_block(
346 : NULL,
347 0 : &dst16[dirinit ? bi << (bsizex + bsizey)
348 0 : : (by << bsizey) * dstride + (bx << bsizex)],
349 : dirinit ? 1 << bsizex : dstride,
350 0 : &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
351 0 : (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
352 : pri_damping, sec_damping, bsize, coeff_shift);
353 : }
354 : }
355 :
356 14333 : int32_t eb_sb_all_skip(PictureControlSet *picture_control_set_ptr, const Av1Common *const cm, int32_t mi_row, int32_t mi_col) {
357 : int32_t maxc, maxr;
358 14333 : int32_t skip = 1;
359 14333 : maxc = cm->mi_cols - mi_col;
360 14333 : maxr = cm->mi_rows - mi_row;
361 :
362 14333 : maxr = AOMMIN(maxr, MI_SIZE_64X64);
363 14333 : maxc = AOMMIN(maxc, MI_SIZE_64X64);
364 :
365 221944 : for (int32_t r = 0; r < maxr; r++) {
366 3320370 : for (int32_t c = 0; c < maxc; c++) {
367 3112760 : skip =
368 5664950 : skip &&
369 2552190 : picture_control_set_ptr->mi_grid_base[(mi_row + r) * picture_control_set_ptr->mi_stride + mi_col + c]->mbmi.block_mi.skip;
370 : /// cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]->skip;
371 : }
372 : }
373 14333 : return skip;
374 : }
375 :
376 267206 : static int32_t is_8x8_block_skip(ModeInfo **grid, int32_t mi_row, int32_t mi_col,
377 : int32_t mi_stride) {
378 267206 : int32_t is_skip = 1;
379 800405 : for (int32_t r = 0; r < mi_size_high[BLOCK_8X8]; ++r)
380 1598740 : for (int32_t c = 0; c < mi_size_wide[BLOCK_8X8]; ++c)
381 1065540 : is_skip &= (int32_t)(grid[(mi_row + r) * mi_stride + (mi_col + c)]->mbmi.block_mi.skip);
382 :
383 267206 : return is_skip;
384 : }
385 :
386 4557 : int32_t eb_sb_compute_cdef_list(PictureControlSet *picture_control_set_ptr, const Av1Common *const cm, int32_t mi_row, int32_t mi_col,
387 : cdef_list *dlist, BlockSize bs)
388 : {
389 : //MbModeInfo **grid = cm->mi_grid_visible;
390 4557 : ModeInfo **grid = picture_control_set_ptr->mi_grid_base;
391 :
392 4557 : int32_t maxc = cm->mi_cols - mi_col;
393 4557 : int32_t maxr = cm->mi_rows - mi_row;
394 :
395 4557 : if (bs == BLOCK_128X128 || bs == BLOCK_128X64)
396 0 : maxc = AOMMIN(maxc, MI_SIZE_128X128);
397 : else
398 4559 : maxc = AOMMIN(maxc, MI_SIZE_64X64);
399 4557 : if (bs == BLOCK_128X128 || bs == BLOCK_64X128)
400 0 : maxr = AOMMIN(maxr, MI_SIZE_128X128);
401 : else
402 4557 : maxr = AOMMIN(maxr, MI_SIZE_64X64);
403 :
404 4557 : const int32_t r_step = mi_size_high[BLOCK_8X8];
405 4557 : const int32_t c_step = mi_size_wide[BLOCK_8X8];
406 4557 : const int32_t r_shift = (r_step == 2);
407 4557 : const int32_t c_shift = (c_step == 2);
408 :
409 4557 : assert(r_step == 1 || r_step == 2);
410 4557 : assert(c_step == 1 || c_step == 2);
411 :
412 4557 : int32_t count = 0;
413 :
414 39748 : for (int32_t r = 0; r < maxr; r += r_step) {
415 301174 : for (int32_t c = 0; c < maxc; c += c_step) {
416 265983 : if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, picture_control_set_ptr->mi_stride)) {
417 70986 : dlist[count].by = (uint8_t)(r >> r_shift);
418 70986 : dlist[count].bx = (uint8_t)(c >> c_shift);
419 70986 : dlist[count].skip = 0;
420 70986 : count++;
421 : }
422 : }
423 : }
424 5945 : return count;
425 : }
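/* Usage note (illustrative): the returned dlist[] holds (bx, by) coordinates, in
   8x8-block units relative to the 64x64 superblock, of every 8x8 block that is
   not entirely skip-coded; only those blocks are later filtered and measured by
   the CDEF search. */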
426 0 : void eb_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int32_t dstride, const uint8_t *src,
427 : int32_t sstride, int32_t v, int32_t h) {
428 0 : for (int32_t i = 0; i < v; i++) {
429 0 : for (int32_t j = 0; j < h; j++)
430 0 : dst[i * dstride + j] = src[i * sstride + j];
431 : }
432 0 : }
433 :
434 17690 : void copy_sb8_16(uint16_t *dst, int32_t dstride,
435 : const uint8_t *src, int32_t src_voffset, int32_t src_hoffset,
436 : int32_t sstride, int32_t vsize, int32_t hsize) {
437 : {
438 17690 : const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
439 :
440 17690 : eb_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
441 : }
442 17686 : }
443 :
444 13809 : void fill_rect(uint16_t *dst, int32_t dstride, int32_t v, int32_t h,
445 : uint16_t x) {
446 319832 : for (int32_t i = 0; i < v; i++) {
447 3295350 : for (int32_t j = 0; j < h; j++)
448 2989320 : dst[i * dstride + j] = x;
449 : }
450 13809 : }
451 :
452 14748 : void copy_rect(uint16_t *dst, int32_t dstride, const uint16_t *src,
453 : int32_t sstride, int32_t v, int32_t h) {
454 405347 : for (int32_t i = 0; i < v; i++) {
455 3818650 : for (int32_t j = 0; j < h; j++)
456 3428050 : dst[i * dstride + j] = src[i * sstride + j];
457 : }
458 14748 : }
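/* Layout sketch (illustrative) of the intermediate 16-bit buffer src[] used by
   eb_av1_cdef_frame() below: each 64x64 filter block is copied into the centre
   of a CDEF_BSTRIDE-wide area with CDEF_VBORDER rows above/below and
   CDEF_HBORDER columns on each side. Border samples that lie outside the frame
   are pre-filled with CDEF_VERY_LARGE so the filter taps ignore them, while
   borders shared with already-filtered neighbours are restored from the saved
   linebuf[]/colbuf[] copies of the unfiltered reconstruction. */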
459 :
460 90 : void eb_av1_cdef_frame(
461 : EncDecContext *context_ptr,
462 : SequenceControlSet *sequence_control_set_ptr,
463 : PictureControlSet *pCs){
464 : (void)context_ptr;
465 :
466 90 : struct PictureParentControlSet *pPcs = pCs->parent_pcs_ptr;
467 90 : Av1Common* cm = pPcs->av1_cm;
468 90 : FrameHeader *frm_hdr = &pPcs->frm_hdr;
469 :
470 : EbPictureBufferDesc * recon_picture_ptr;
471 :
472 90 : if (pPcs->is_used_as_reference_flag == EB_TRUE)
473 68 : recon_picture_ptr = ((EbReferenceObject*)pCs->parent_pcs_ptr->reference_picture_wrapper_ptr->object_ptr)->reference_picture;
474 : else
475 22 : recon_picture_ptr = pCs->recon_picture_ptr;
476 :
477 90 : EbByte reconBufferY = &((recon_picture_ptr->buffer_y)[recon_picture_ptr->origin_x + recon_picture_ptr->origin_y * recon_picture_ptr->stride_y]);
478 90 : EbByte reconBufferCb = &((recon_picture_ptr->buffer_cb)[recon_picture_ptr->origin_x / 2 + recon_picture_ptr->origin_y / 2 * recon_picture_ptr->stride_cb]);
479 90 : EbByte reconBufferCr = &((recon_picture_ptr->buffer_cr)[recon_picture_ptr->origin_x / 2 + recon_picture_ptr->origin_y / 2 * recon_picture_ptr->stride_cr]);
480 :
481 90 : const int32_t num_planes = 3;// av1_num_planes(cm);
482 : DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]);
483 : uint16_t *linebuf[3];
484 : uint16_t *colbuf[3];
485 : cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
486 : uint8_t *row_cdef, *prev_row_cdef, *curr_row_cdef;
487 : int32_t cdef_count;
488 90 : int32_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
489 90 : int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
490 : int32_t mi_wide_l2[3];
491 : int32_t mi_high_l2[3];
492 : int32_t xdec[3];
493 : int32_t ydec[3];
494 90 : int32_t coeff_shift = AOMMAX(sequence_control_set_ptr->static_config.encoder_bit_depth/*cm->bit_depth*/ - 8, 0);
495 90 : const int32_t nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
496 90 : const int32_t nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
497 : //eb_av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, num_planes);
498 90 : row_cdef = (uint8_t *)eb_aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
499 90 : assert(row_cdef != NULL);
500 90 : memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2);
501 90 : prev_row_cdef = row_cdef + 1;
502 90 : curr_row_cdef = prev_row_cdef + nhfb + 2;
503 360 : for (int32_t pli = 0; pli < num_planes; pli++) {
504 270 : int32_t subsampling_x = (pli == 0) ? 0 : 1;
505 270 : int32_t subsampling_y = (pli == 0) ? 0 : 1;
506 :
507 270 : xdec[pli] = subsampling_x; //CHKN xd->plane[pli].subsampling_x;
508 270 : ydec[pli] = subsampling_y; //CHKN xd->plane[pli].subsampling_y;
509 270 : mi_wide_l2[pli] = MI_SIZE_LOG2 - subsampling_x; //CHKN xd->plane[pli].subsampling_x;
510 270 : mi_high_l2[pli] = MI_SIZE_LOG2 - subsampling_y; //CHKN xd->plane[pli].subsampling_y;
511 : }
512 :
513 90 : const int32_t stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
514 360 : for (int32_t pli = 0; pli < num_planes; pli++) {
515 270 : linebuf[pli] = (uint16_t *)eb_aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
516 270 : colbuf[pli] = (uint16_t *)eb_aom_malloc(sizeof(*colbuf) * ((CDEF_BLOCKSIZE << mi_high_l2[pli]) + 2 * CDEF_VBORDER) * CDEF_HBORDER);
517 : }
518 :
519 630 : for (int32_t fbr = 0; fbr < nvfb; fbr++) {
520 2160 : for (int32_t pli = 0; pli < num_planes; pli++) {
521 1620 : const int32_t block_height =
522 1620 : (MI_SIZE_64X64 << mi_high_l2[pli]) + 2 * CDEF_VBORDER;
523 1620 : fill_rect(colbuf[pli], CDEF_HBORDER, block_height, CDEF_HBORDER,
524 : CDEF_VERY_LARGE);
525 : }
526 :
527 540 : int32_t cdef_left = 1;
528 5940 : for (int32_t fbc = 0; fbc < nhfb; fbc++) {
529 : int32_t level, sec_strength;
530 : int32_t uv_level, uv_sec_strength;
531 : int32_t nhb, nvb;
532 5400 : int32_t cstart = 0;
533 5400 : curr_row_cdef[fbc] = 0;
534 :
535 : // CHKN: skip this filter block if its mi entry is missing or its cdef_strength was never set
536 5400 : if (pCs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc] == NULL ||
537 5400 : pCs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc]->mbmi.cdef_strength == -1) {
538 0 : cdef_left = 0;
539 0 : printf("\n\n\nCDEF ERROR: Skipping Current FB\n\n\n");
540 0 : continue;
541 : }
542 :
543 5400 : if (!cdef_left) cstart = -CDEF_HBORDER; //CHKN if the left block has not been filtered, then we can use samples on the left as input.
544 :
545 5400 : nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
546 5400 : nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
547 : int32_t frame_top, frame_left, frame_bottom, frame_right;
548 :
549 5400 : int32_t mi_row = MI_SIZE_64X64 * fbr;
550 5400 : int32_t mi_col = MI_SIZE_64X64 * fbc;
551 : // for the current filter block, its top-left corner mi structure (mi_tl)
552 : // is first accessed to check whether the top and left boundaries are
553 : // frame boundaries. Then bottom-left and top-right mi structures are
554 : // accessed to check whether the bottom and right boundaries
555 : // (respectively) are frame boundaries.
556 : //
557 : // Note that we can't just check the bottom-right mi structure - e.g. if
558 : // we're at the right-hand edge of the frame but not the bottom, then
559 : // the bottom-right mi is NULL but the bottom-left is not.
560 5400 : frame_top = (mi_row == 0) ? 1 : 0;
561 5400 : frame_left = (mi_col == 0) ? 1 : 0;
562 :
563 5400 : if (fbr != nvfb - 1)
564 4500 : frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0;
565 : else
566 900 : frame_bottom = 1;
567 :
568 5400 : if (fbc != nhfb - 1)
569 4860 : frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 1 : 0;
570 : else
571 540 : frame_right = 1;
572 :
573 5400 : const int32_t mbmi_cdef_strength = pCs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc]->mbmi.cdef_strength;
574 5400 : level = frm_hdr->CDEF_params.cdef_y_strength[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
575 5400 : sec_strength = frm_hdr->CDEF_params.cdef_y_strength[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
576 5400 : sec_strength += sec_strength == 3;
577 5400 : uv_level = frm_hdr->CDEF_params.cdef_uv_strength[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
578 5400 : uv_sec_strength = frm_hdr->CDEF_params.cdef_uv_strength[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
579 5400 : uv_sec_strength += uv_sec_strength == 3;
580 8527 : if ((level == 0 && sec_strength == 0 && uv_level == 0 && uv_sec_strength == 0) ||
581 3127 : (cdef_count = eb_sb_compute_cdef_list(pCs, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, BLOCK_64X64)) == 0) {
582 4255 : cdef_left = 0;
583 4255 : continue;
584 : }
585 :
586 1145 : curr_row_cdef[fbc] = 1;
587 4580 : for (int32_t pli = 0; pli < num_planes; pli++) {
588 : int32_t coffset;
589 : int32_t rend, cend;
590 3435 : int32_t pri_damping = frm_hdr->CDEF_params.cdef_damping;
591 3435 : int32_t sec_damping = frm_hdr->CDEF_params.cdef_damping;
592 3435 : int32_t hsize = nhb << mi_wide_l2[pli];
593 3435 : int32_t vsize = nvb << mi_high_l2[pli];
594 :
595 3435 : if (pli) {
596 2290 : level = uv_level;
597 2290 : sec_strength = uv_sec_strength;
598 : }
599 :
600 3435 : if (fbc == nhfb - 1)
601 528 : cend = hsize;
602 : else
603 2907 : cend = hsize + CDEF_HBORDER;
604 :
605 3435 : if (fbr == nvfb - 1)
606 678 : rend = vsize;
607 : else
608 2757 : rend = vsize + CDEF_VBORDER;
609 :
610 3435 : coffset = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
611 3435 : if (fbc == nhfb - 1) {
612 : /* On the last superblock column, fill in the right border with
613 : CDEF_VERY_LARGE to avoid filtering with the outside. */
614 528 : fill_rect(&src[cend + CDEF_HBORDER], CDEF_BSTRIDE,
615 528 : rend + CDEF_VBORDER, hsize + CDEF_HBORDER - cend,
616 : CDEF_VERY_LARGE);
617 : }
618 3435 : if (fbr == nvfb - 1) {
619 : /* On the last superblock row, fill in the bottom border with
620 : CDEF_VERY_LARGE to avoid filtering with the outside. */
621 678 : fill_rect(&src[(rend + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
622 : CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
623 : }
624 :
625 3435 : uint8_t* recBuff = 0;
626 3435 : uint32_t recStride = 0;
627 :
628 3435 : switch (pli) {
629 1145 : case 0:
630 1145 : recBuff = reconBufferY;
631 1145 : recStride = recon_picture_ptr->stride_y;
632 1145 : break;
633 1145 : case 1:
634 1145 : recBuff = reconBufferCb;
635 1145 : recStride = recon_picture_ptr->stride_cb;
636 :
637 1145 : break;
638 1145 : case 2:
639 1145 : recBuff = reconBufferCr;
640 1145 : recStride = recon_picture_ptr->stride_cr;
641 1145 : break;
642 : }
643 :
644 : /* Copy in the pixels we need from the current superblock for
645 : deringing.*/
646 3435 : copy_sb8_16(//cm,
647 3435 : &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
648 : CDEF_BSTRIDE, recBuff/*xd->plane[pli].dst.buf*/,
649 3435 : (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr, coffset + cstart,
650 : recStride/*xd->plane[pli].dst.stride*/, rend, cend - cstart);
651 3435 : if (!prev_row_cdef[fbc]) {
652 807 : copy_sb8_16(//cm,
653 : &src[CDEF_HBORDER], CDEF_BSTRIDE,
654 : recBuff/*xd->plane[pli].dst.buf*/,
655 807 : (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
656 : coffset, recStride/*xd->plane[pli].dst.stride*/, CDEF_VBORDER, hsize);
657 : }
658 2628 : else if (fbr > 0) {
659 1968 : copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &linebuf[pli][coffset],
660 : stride, CDEF_VBORDER, hsize);
661 : }
662 : else {
663 660 : fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
664 : CDEF_VERY_LARGE);
665 : }
666 :
667 3435 : if (!prev_row_cdef[fbc - 1]) {
668 627 : copy_sb8_16(//cm,
669 : src, CDEF_BSTRIDE, recBuff/*xd->plane[pli].dst.buf*/,
670 627 : (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
671 : coffset - CDEF_HBORDER, recStride/*xd->plane[pli].dst.stride*/,
672 : CDEF_VBORDER, CDEF_HBORDER);
673 : }
674 2808 : else if (fbr > 0 && fbc > 0) {
675 1356 : copy_rect(src, CDEF_BSTRIDE, &linebuf[pli][coffset - CDEF_HBORDER],
676 : stride, CDEF_VBORDER, CDEF_HBORDER);
677 : }
678 : else {
679 1452 : fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
680 : CDEF_VERY_LARGE);
681 : }
682 :
683 3435 : if (!prev_row_cdef[fbc + 1]) {
684 1131 : copy_sb8_16(//cm,
685 1131 : &src[CDEF_HBORDER + (nhb << mi_wide_l2[pli])],
686 : CDEF_BSTRIDE, recBuff/*xd->plane[pli].dst.buf*/,
687 1131 : (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
688 : coffset + hsize, recStride/*xd->plane[pli].dst.stride*/, CDEF_VBORDER,
689 : CDEF_HBORDER);
690 : }
691 2304 : else if (fbr > 0 && fbc < nhfb - 1) {
692 1242 : copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
693 1242 : &linebuf[pli][coffset + hsize], stride, CDEF_VBORDER,
694 : CDEF_HBORDER);
695 : }
696 : else {
697 1062 : fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
698 : CDEF_HBORDER, CDEF_VERY_LARGE);
699 : }
700 :
701 3435 : if (cdef_left) {
702 : /* If we deringed the superblock on the left then we need to copy in
703 : saved pixels. */
704 2661 : copy_rect(src, CDEF_BSTRIDE, colbuf[pli], CDEF_HBORDER,
705 : rend + CDEF_VBORDER, CDEF_HBORDER);
706 : }
707 :
708 : /* Saving pixels in case we need to dering the superblock on the
709 : right. */
710 3435 : if (fbc < nhfb - 1)
711 2907 : copy_rect(colbuf[pli], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
712 : rend + CDEF_VBORDER, CDEF_HBORDER);
713 :
714 3435 : if (fbr < nvfb - 1)
715 2757 : copy_sb8_16(
716 : //cm,
717 2757 : &linebuf[pli][coffset], stride, recBuff/*xd->plane[pli].dst.buf*/,
718 2757 : (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER,
719 : coffset, recStride/*xd->plane[pli].dst.stride*/, CDEF_VBORDER, hsize);
720 :
721 3435 : if (frame_top) {
722 660 : fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER,
723 : CDEF_VERY_LARGE);
724 : }
725 3435 : if (frame_left) {
726 972 : fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
727 : CDEF_VERY_LARGE);
728 : }
729 3435 : if (frame_bottom) {
730 678 : fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
731 : CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
732 : }
733 3435 : if (frame_right) {
734 528 : fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
735 : vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
736 : }
737 :
738 : //if (cm->use_highbitdepth) {
739 : // eb_cdef_filter_fb(
740 : // NULL,
741 : // &CONVERT_TO_SHORTPTR(
742 : // xd->plane[pli]
743 : // .dst.buf)[xd->plane[pli].dst.stride *
744 : // (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
745 : // (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
746 : // xd->plane[pli].dst.stride,
747 : // &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
748 : // ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
749 : // sec_strength, pri_damping, sec_damping, coeff_shift);
750 : //} else
751 : {
752 3435 : eb_cdef_filter_fb(
753 3435 : &recBuff[recStride *(MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
754 : //&xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *(MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +(fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
755 : NULL, recStride/*xd->plane[pli].dst.stride*/,
756 : &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
757 : ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
758 : sec_strength, pri_damping, sec_damping, coeff_shift);
759 : }
760 : }
761 1145 : cdef_left = 1; //CHKN filtered data is written back directly to recFrame.
762 : }
763 : {
764 540 : uint8_t *tmp = prev_row_cdef;
765 540 : prev_row_cdef = curr_row_cdef;
766 540 : curr_row_cdef = tmp;
767 : }
768 : }
769 90 : eb_aom_free(row_cdef);
770 360 : for (int32_t pli = 0; pli < num_planes; pli++) {
771 270 : eb_aom_free(linebuf[pli]);
772 270 : eb_aom_free(colbuf[pli]);
773 : }
774 90 : }
775 :
776 0 : void av1_cdef_frame16bit(
777 : EncDecContext *context_ptr,
778 : SequenceControlSet *sequence_control_set_ptr,
779 : PictureControlSet *pCs){
780 : (void)context_ptr;
781 0 : struct PictureParentControlSet *pPcs = pCs->parent_pcs_ptr;
782 0 : Av1Common* cm = pPcs->av1_cm;
783 0 : FrameHeader *frm_hdr = &pPcs->frm_hdr;
784 :
785 : EbPictureBufferDesc * recon_picture_ptr;
786 :
787 0 : if (pPcs->is_used_as_reference_flag == EB_TRUE)
788 0 : recon_picture_ptr = ((EbReferenceObject*)pCs->parent_pcs_ptr->reference_picture_wrapper_ptr->object_ptr)->reference_picture16bit;
789 :
790 : else
791 0 : recon_picture_ptr = pCs->recon_picture16bit_ptr;
792 :
793 0 : uint16_t* reconBufferY = (uint16_t*)recon_picture_ptr->buffer_y + (recon_picture_ptr->origin_x + recon_picture_ptr->origin_y * recon_picture_ptr->stride_y);
794 0 : uint16_t* reconBufferCb = (uint16_t*)recon_picture_ptr->buffer_cb + (recon_picture_ptr->origin_x / 2 + recon_picture_ptr->origin_y / 2 * recon_picture_ptr->stride_cb);
795 0 : uint16_t* reconBufferCr = (uint16_t*)recon_picture_ptr->buffer_cr + (recon_picture_ptr->origin_x / 2 + recon_picture_ptr->origin_y / 2 * recon_picture_ptr->stride_cr);
796 :
797 0 : const int32_t num_planes = 3;// av1_num_planes(cm);
798 : DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]);
799 : uint16_t *linebuf[3];
800 : uint16_t *colbuf[3];
801 : cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
802 : uint8_t *row_cdef, *prev_row_cdef, *curr_row_cdef;
803 : int32_t cdef_count;
804 0 : int32_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
805 0 : int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
806 : int32_t mi_wide_l2[3];
807 : int32_t mi_high_l2[3];
808 : int32_t xdec[3];
809 : int32_t ydec[3];
810 0 : int32_t coeff_shift = AOMMAX(sequence_control_set_ptr->static_config.encoder_bit_depth/*cm->bit_depth*/ - 8, 0);
811 0 : const int32_t nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
812 0 : const int32_t nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
813 : //eb_av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, num_planes);
814 0 : row_cdef = (uint8_t *)eb_aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
815 0 : assert(row_cdef);
816 0 : memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2);
817 0 : prev_row_cdef = row_cdef + 1;
818 0 : curr_row_cdef = prev_row_cdef + nhfb + 2;
819 0 : for (int32_t pli = 0; pli < num_planes; pli++) {
820 0 : int32_t subsampling_x = (pli == 0) ? 0 : 1;
821 0 : int32_t subsampling_y = (pli == 0) ? 0 : 1;
822 :
823 0 : xdec[pli] = subsampling_x; //CHKN xd->plane[pli].subsampling_x;
824 0 : ydec[pli] = subsampling_y; //CHKN xd->plane[pli].subsampling_y;
825 0 : mi_wide_l2[pli] = MI_SIZE_LOG2 - subsampling_x; //CHKN xd->plane[pli].subsampling_x;
826 0 : mi_high_l2[pli] = MI_SIZE_LOG2 - subsampling_y; //CHKN xd->plane[pli].subsampling_y;
827 : }
828 :
829 0 : const int32_t stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
830 0 : for (int32_t pli = 0; pli < num_planes; pli++) {
831 0 : linebuf[pli] = (uint16_t *)eb_aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
832 0 : colbuf[pli] = (uint16_t *)eb_aom_malloc(sizeof(*colbuf) * ((CDEF_BLOCKSIZE << mi_high_l2[pli]) + 2 * CDEF_VBORDER) * CDEF_HBORDER);
833 : }
834 :
835 0 : for (int32_t fbr = 0; fbr < nvfb; fbr++) {
836 0 : for (int32_t pli = 0; pli < num_planes; pli++) {
837 0 : const int32_t block_height =
838 0 : (MI_SIZE_64X64 << mi_high_l2[pli]) + 2 * CDEF_VBORDER;
839 0 : fill_rect(colbuf[pli], CDEF_HBORDER, block_height, CDEF_HBORDER,
840 : CDEF_VERY_LARGE);
841 : }
842 :
843 0 : int32_t cdef_left = 1;
844 0 : for (int32_t fbc = 0; fbc < nhfb; fbc++) {
845 : int32_t level, sec_strength;
846 : int32_t uv_level, uv_sec_strength;
847 : int32_t nhb, nvb;
848 0 : int32_t cstart = 0;
849 0 : curr_row_cdef[fbc] = 0;
850 :
851 : // CHKN: skip this filter block if its mi entry is missing or its cdef_strength was never set
852 0 : if (pCs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc] == NULL ||
853 0 : pCs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc]->mbmi.cdef_strength == -1) {
854 0 : cdef_left = 0;
855 0 : printf("\n\n\nCDEF ERROR: Skipping Current FB\n\n\n");
856 0 : continue;
857 : }
858 :
859 0 : if (!cdef_left) cstart = -CDEF_HBORDER; //CHKN if the left block has not been filtered, then we can use samples on the left as input.
860 :
861 0 : nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
862 0 : nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
863 : int32_t frame_top, frame_left, frame_bottom, frame_right;
864 :
865 0 : int32_t mi_row = MI_SIZE_64X64 * fbr;
866 0 : int32_t mi_col = MI_SIZE_64X64 * fbc;
867 : // for the current filter block, its top-left corner mi structure (mi_tl)
868 : // is first accessed to check whether the top and left boundaries are
869 : // frame boundaries. Then bottom-left and top-right mi structures are
870 : // accessed to check whether the bottom and right boundaries
871 : // (respectively) are frame boundaries.
872 : //
873 : // Note that we can't just check the bottom-right mi structure - e.g. if
874 : // we're at the right-hand edge of the frame but not the bottom, then
875 : // the bottom-right mi is NULL but the bottom-left is not.
876 0 : frame_top = (mi_row == 0) ? 1 : 0;
877 0 : frame_left = (mi_col == 0) ? 1 : 0;
878 :
879 0 : if (fbr != nvfb - 1)
880 0 : frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0;
881 : else
882 0 : frame_bottom = 1;
883 :
884 0 : if (fbc != nhfb - 1)
885 0 : frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 1 : 0;
886 : else
887 0 : frame_right = 1;
888 :
889 0 : const int32_t mbmi_cdef_strength = pCs->mi_grid_base[MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc]->mbmi.cdef_strength;
890 0 : level = frm_hdr->CDEF_params.cdef_y_strength[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
891 0 : sec_strength = frm_hdr->CDEF_params.cdef_y_strength[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
892 0 : sec_strength += sec_strength == 3;
893 0 : uv_level = frm_hdr->CDEF_params.cdef_uv_strength[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
894 0 : uv_sec_strength = frm_hdr->CDEF_params.cdef_uv_strength[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
895 0 : uv_sec_strength += uv_sec_strength == 3;
896 0 : if ((level == 0 && sec_strength == 0 && uv_level == 0 && uv_sec_strength == 0) ||
897 0 : (cdef_count = eb_sb_compute_cdef_list(pCs, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, BLOCK_64X64)) == 0) {
898 0 : cdef_left = 0;
899 0 : continue;
900 : }
901 :
902 0 : curr_row_cdef[fbc] = 1;
903 0 : for (int32_t pli = 0; pli < num_planes; pli++) {
904 : int32_t coffset;
905 : int32_t rend, cend;
906 0 : int32_t pri_damping = frm_hdr->CDEF_params.cdef_damping;
907 0 : int32_t sec_damping = frm_hdr->CDEF_params.cdef_damping;
908 0 : int32_t hsize = nhb << mi_wide_l2[pli];
909 0 : int32_t vsize = nvb << mi_high_l2[pli];
910 :
911 0 : if (pli) {
912 0 : level = uv_level;
913 0 : sec_strength = uv_sec_strength;
914 : }
915 :
916 0 : if (fbc == nhfb - 1)
917 0 : cend = hsize;
918 : else
919 0 : cend = hsize + CDEF_HBORDER;
920 :
921 0 : if (fbr == nvfb - 1)
922 0 : rend = vsize;
923 : else
924 0 : rend = vsize + CDEF_VBORDER;
925 :
926 0 : coffset = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
927 0 : if (fbc == nhfb - 1) {
928 : /* On the last superblock column, fill in the right border with
929 : CDEF_VERY_LARGE to avoid filtering with the outside. */
930 0 : fill_rect(&src[cend + CDEF_HBORDER], CDEF_BSTRIDE,
931 0 : rend + CDEF_VBORDER, hsize + CDEF_HBORDER - cend,
932 : CDEF_VERY_LARGE);
933 : }
934 0 : if (fbr == nvfb - 1) {
935 : /* On the last superblock row, fill in the bottom border with
936 : CDEF_VERY_LARGE to avoid filtering with the outside. */
937 0 : fill_rect(&src[(rend + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
938 : CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
939 : }
940 :
941 0 : uint16_t* recBuff = 0;
942 0 : uint32_t recStride = 0;
943 :
944 0 : switch (pli) {
945 0 : case 0:
946 0 : recBuff = reconBufferY;
947 0 : recStride = recon_picture_ptr->stride_y;
948 0 : break;
949 0 : case 1:
950 0 : recBuff = reconBufferCb;
951 0 : recStride = recon_picture_ptr->stride_cb;
952 :
953 0 : break;
954 0 : case 2:
955 0 : recBuff = reconBufferCr;
956 0 : recStride = recon_picture_ptr->stride_cr;
957 0 : break;
958 : }
959 :
960 : //--ok
961 : /* Copy in the pixels we need from the current superblock for
962 : deringing.*/
963 :
964 0 : copy_sb16_16(//cm,
965 0 : &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
966 : CDEF_BSTRIDE, recBuff/*xd->plane[pli].dst.buf*/,
967 0 : (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr, coffset + cstart,
968 : recStride/*xd->plane[pli].dst.stride*/, rend, cend - cstart);
969 :
970 0 : if (!prev_row_cdef[fbc]) {
971 0 : copy_sb16_16(//cm,
972 : &src[CDEF_HBORDER], CDEF_BSTRIDE,
973 : recBuff/*xd->plane[pli].dst.buf*/,
974 0 : (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
975 : coffset, recStride/*xd->plane[pli].dst.stride*/, CDEF_VBORDER, hsize);
976 : }
977 0 : else if (fbr > 0) {
978 0 : copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &linebuf[pli][coffset],
979 : stride, CDEF_VBORDER, hsize);
980 : }
981 : else {
982 0 : fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
983 : CDEF_VERY_LARGE);
984 : }
985 :
986 0 : if (!prev_row_cdef[fbc - 1]) {
987 0 : copy_sb16_16(//cm,
988 : src, CDEF_BSTRIDE, recBuff/*xd->plane[pli].dst.buf*/,
989 0 : (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
990 : coffset - CDEF_HBORDER, recStride/*xd->plane[pli].dst.stride*/,
991 : CDEF_VBORDER, CDEF_HBORDER);
992 : }
993 0 : else if (fbr > 0 && fbc > 0) {
994 0 : copy_rect(src, CDEF_BSTRIDE, &linebuf[pli][coffset - CDEF_HBORDER],
995 : stride, CDEF_VBORDER, CDEF_HBORDER);
996 : }
997 : else {
998 0 : fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
999 : CDEF_VERY_LARGE);
1000 : }
1001 :
1002 0 : if (!prev_row_cdef[fbc + 1]) {
1003 0 : copy_sb16_16(//cm,
1004 0 : &src[CDEF_HBORDER + (nhb << mi_wide_l2[pli])],
1005 : CDEF_BSTRIDE, recBuff/*xd->plane[pli].dst.buf*/,
1006 0 : (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
1007 : coffset + hsize, recStride/*xd->plane[pli].dst.stride*/, CDEF_VBORDER,
1008 : CDEF_HBORDER);
1009 : }
1010 0 : else if (fbr > 0 && fbc < nhfb - 1) {
1011 0 : copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
1012 0 : &linebuf[pli][coffset + hsize], stride, CDEF_VBORDER,
1013 : CDEF_HBORDER);
1014 : }
1015 : else {
1016 0 : fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
1017 : CDEF_HBORDER, CDEF_VERY_LARGE);
1018 : }
1019 :
1020 0 : if (cdef_left) {
1021 : /* If we deringed the superblock on the left then we need to copy in
1022 : saved pixels. */
1023 0 : copy_rect(src, CDEF_BSTRIDE, colbuf[pli], CDEF_HBORDER,
1024 : rend + CDEF_VBORDER, CDEF_HBORDER);
1025 : }
1026 :
1027 : /* Saving pixels in case we need to dering the superblock on the
1028 : right. */
1029 0 : if (fbc < nhfb - 1)
1030 0 : copy_rect(colbuf[pli], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
1031 : rend + CDEF_VBORDER, CDEF_HBORDER);
1032 0 : if (fbr < nvfb - 1)
1033 0 : copy_sb16_16(
1034 : //cm,
1035 0 : &linebuf[pli][coffset], stride, recBuff/*xd->plane[pli].dst.buf*/,
1036 0 : (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER,
1037 : coffset, recStride/*xd->plane[pli].dst.stride*/, CDEF_VBORDER, hsize);
1038 0 : if (frame_top) {
1039 0 : fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER,
1040 : CDEF_VERY_LARGE);
1041 : }
1042 0 : if (frame_left) {
1043 0 : fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
1044 : CDEF_VERY_LARGE);
1045 : }
1046 0 : if (frame_bottom) {
1047 0 : fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
1048 : CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
1049 : }
1050 0 : if (frame_right) {
1051 0 : fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
1052 : vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
1053 : }
1054 :
1055 : //if (cm->use_highbitdepth) {
1056 : // eb_cdef_filter_fb(
1057 : // NULL,
1058 : // &CONVERT_TO_SHORTPTR(
1059 : // xd->plane[pli]
1060 : // .dst.buf)[xd->plane[pli].dst.stride *
1061 : // (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
1062 : // (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
1063 : // xd->plane[pli].dst.stride,
1064 : // &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
1065 : // ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
1066 : // sec_strength, pri_damping, sec_damping, coeff_shift);
1067 : //} else
1068 : {
1069 0 : eb_cdef_filter_fb(
1070 : NULL,
1071 0 : &recBuff[recStride *(MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
1072 : //&xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *(MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +(fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
1073 : recStride/*xd->plane[pli].dst.stride*/,
1074 : &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
1075 : ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
1076 : sec_strength, pri_damping, sec_damping, coeff_shift);
1077 : }
1078 : }
1079 0 : cdef_left = 1; //CHKN filtered data is written back directly to recFrame.
1080 : }
1081 : {
1082 0 : uint8_t *tmp = prev_row_cdef;
1083 0 : prev_row_cdef = curr_row_cdef;
1084 0 : curr_row_cdef = tmp;
1085 : }
1086 : }
1087 0 : eb_aom_free(row_cdef);
1088 0 : for (int32_t pli = 0; pli < num_planes; pli++) {
1089 0 : eb_aom_free(linebuf[pli]);
1090 0 : eb_aom_free(colbuf[pli]);
1091 : }
1092 0 : }
1093 :
1094 : ///-------search
1095 :
1096 : static int32_t priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 5, 7, 10, 13 };
1097 :
1098 : /* Search for the best strength to add as an option, knowing we
1099 : already selected nb_strengths options. */
1100 0 : static uint64_t search_one(int32_t *lev, int32_t nb_strengths,
1101 : uint64_t mse[][TOTAL_STRENGTHS], int32_t sb_count,
1102 : int32_t fast, int32_t start_gi, int32_t end_gi) {
1103 : uint64_t tot_mse[TOTAL_STRENGTHS];
1104 : (void)fast;
1105 0 : const int32_t total_strengths = end_gi;
1106 : int32_t i, j;
1107 0 : uint64_t best_tot_mse = (uint64_t)1 << 63;
1108 0 : int32_t best_id = 0;
1109 0 : memset(tot_mse, 0, sizeof(tot_mse));
1110 0 : for (i = 0; i < sb_count; i++) {
1111 : int32_t gi;
1112 0 : uint64_t best_mse = (uint64_t)1 << 63;
1113 : /* Find best mse among already selected options. */
1114 0 : for (gi = 0; gi < nb_strengths; gi++) {
1115 0 : if (mse[i][lev[gi]] < best_mse)
1116 0 : best_mse = mse[i][lev[gi]];
1117 : }
1118 : /* Find best mse when adding each possible new option. */
1119 :
1120 0 : for (j = start_gi; j < total_strengths; j++) {
1121 0 : uint64_t best = best_mse;
1122 0 : if (mse[i][j] < best) best = mse[i][j];
1123 0 : tot_mse[j] += best;
1124 : }
1125 : }
1126 0 : for (j = start_gi; j < total_strengths; j++) {
1127 0 : if (tot_mse[j] < best_tot_mse) {
1128 0 : best_tot_mse = tot_mse[j];
1129 0 : best_id = j;
1130 : }
1131 : }
1132 0 : lev[nb_strengths] = best_id;
1133 0 : return best_tot_mse;
1134 : }
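/* Worked intuition (illustrative): with nb_strengths presets already chosen,
   each superblock contributes min(mse[i][lev[0]], ..., mse[i][lev[nb-1]]) to the
   frame cost; search_one() evaluates, for every candidate j, the total cost if j
   were added as one more selectable preset, and returns the best such candidate
   in lev[nb_strengths]. */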
1135 :
1136 : /* Search for the best luma+chroma strength to add as an option, knowing we
1137 : already selected nb_strengths options. */
1138 0 : uint64_t search_one_dual_c(int *lev0, int *lev1, int nb_strengths,
1139 : uint64_t(**mse)[TOTAL_STRENGTHS], int sb_count,
1140 : int fast, int start_gi, int end_gi) {
1141 : uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
1142 : int32_t i, j;
1143 0 : uint64_t best_tot_mse = (uint64_t)1 << 63;
1144 0 : int32_t best_id0 = 0;
1145 0 : int32_t best_id1 = 0;
1146 : (void)fast;
1147 0 : const int32_t total_strengths = end_gi;
1148 0 : memset(tot_mse, 0, sizeof(tot_mse));
1149 0 : for (i = 0; i < sb_count; i++) {
1150 : int32_t gi;
1151 0 : uint64_t best_mse = (uint64_t)1 << 63;
1152 : /* Find best mse among already selected options. */
1153 0 : for (gi = 0; gi < nb_strengths; gi++) {
1154 0 : uint64_t curr = mse[0][i][lev0[gi]];
1155 0 : curr += mse[1][i][lev1[gi]];
1156 0 : if (curr < best_mse)
1157 0 : best_mse = curr;
1158 : }
1159 : /* Find best mse when adding each possible new option. */
1160 0 : for (j = start_gi; j < total_strengths; j++) {
1161 : int32_t k;
1162 0 : for (k = start_gi; k < total_strengths; k++) {
1163 0 : uint64_t best = best_mse;
1164 0 : uint64_t curr = mse[0][i][j];
1165 0 : curr += mse[1][i][k];
1166 0 : if (curr < best) best = curr;
1167 0 : tot_mse[j][k] += best;
1168 : }
1169 : }
1170 : }
1171 :
1172 0 : for (j = start_gi; j < total_strengths; j++) {
1173 : int32_t k;
1174 0 : for (k = start_gi; k < total_strengths; k++) {
1175 0 : if (tot_mse[j][k] < best_tot_mse) {
1176 0 : best_tot_mse = tot_mse[j][k];
1177 0 : best_id0 = j;
1178 0 : best_id1 = k;
1179 : }
1180 : }
1181 : }
1182 0 : lev0[nb_strengths] = best_id0;
1183 0 : lev1[nb_strengths] = best_id1;
1184 0 : return best_tot_mse;
1185 : }
1186 :
1187 : /* Search for the set of strengths that minimizes mse. */
1188 0 : static uint64_t joint_strength_search(int32_t *best_lev, int32_t nb_strengths,
1189 : uint64_t mse[][TOTAL_STRENGTHS],
1190 : int32_t sb_count, int32_t fast, int32_t start_gi, int32_t end_gi) {
1191 : uint64_t best_tot_mse;
1192 : int32_t i;
1193 0 : best_tot_mse = (uint64_t)1 << 63;
1194 : /* Greedy search: add one strength option at a time. */
1195 0 : for (i = 0; i < nb_strengths; i++)
1196 0 : best_tot_mse = search_one(best_lev, i, mse, sb_count, fast, start_gi, end_gi);
1197 : /* Trying to refine the greedy search by reconsidering each
1198 : already-selected option. */
1199 0 : if (!fast) {
1200 0 : for (i = 0; i < 4 * nb_strengths; i++) {
1201 : int32_t j;
1202 0 : for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1];
1203 : best_tot_mse =
1204 0 : search_one(best_lev, nb_strengths - 1, mse, sb_count, fast, start_gi, end_gi);
1205 : }
1206 : }
1207 0 : return best_tot_mse;
1208 : }
1209 :
1210 : /* Search for the set of luma+chroma strengths that minimizes mse. */
1211 480 : static uint64_t joint_strength_search_dual(int32_t *best_lev0, int32_t *best_lev1,
1212 : int32_t nb_strengths,
1213 : uint64_t(**mse)[TOTAL_STRENGTHS],
1214 : int32_t sb_count, int32_t fast, int32_t start_gi, int32_t end_gi) {
1215 : uint64_t best_tot_mse;
1216 : int32_t i;
1217 480 : best_tot_mse = (uint64_t)1 << 63;
1218 : /* Greedy search: add one strength option at a time. */
1219 2280 : for (i = 0; i < nb_strengths; i++)
1220 1800 : best_tot_mse = search_one_dual(best_lev0, best_lev1, i, mse, sb_count, fast, start_gi, end_gi);
1221 : /* Trying to refine the greedy search by reconsidering each
1222 : already-selected option. */
1223 7679 : for (i = 0; i < 4 * nb_strengths; i++) {
1224 : int32_t j;
1225 40797 : for (j = 0; j < nb_strengths - 1; j++) {
1226 33597 : best_lev0[j] = best_lev0[j + 1];
1227 33597 : best_lev1[j] = best_lev1[j + 1];
1228 : }
1229 7200 : best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse, sb_count, fast, start_gi, end_gi);
1230 : }
1231 479 : return best_tot_mse;
1232 : }
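/* Note (illustrative): the dual search works on (luma, chroma) strength pairs.
   The greedy loop above adds one pair at a time; the refinement loop then, for
   4 * nb_strengths iterations, drops the oldest pair, shifts the rest down and
   re-searches the freed slot, which lets an early greedy choice be replaced by a
   better one found later. */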
1233 :
1234 : /* FIXME: SSE-optimize this. */
1235 0 : void copy_sb16_16(uint16_t *dst, int32_t dstride, const uint16_t *src,
1236 : int32_t src_voffset, int32_t src_hoffset, int32_t sstride,
1237 : int32_t vsize, int32_t hsize) {
1238 : int32_t r, c;
1239 0 : const uint16_t *base = &src[src_voffset * sstride + src_hoffset];
1240 0 : for (r = 0; r < vsize; r++) {
1241 0 : EB_MEMCPY(dst, (void*)base, 2 * hsize);
1242 0 : dst += dstride;
1243 0 : base += sstride;
1244 : }
1245 : UNUSED(c);
1246 0 : }
1247 :
1248 0 : static INLINE uint64_t dist_8x8_16bit_c(const uint16_t *src, const uint16_t *dst, const int32_t dstride, const int32_t coeff_shift) {
1249 0 : uint64_t svar = 0;
1250 0 : uint64_t dvar = 0;
1251 0 : uint64_t sum_s = 0;
1252 0 : uint64_t sum_d = 0;
1253 0 : uint64_t sum_s2 = 0;
1254 0 : uint64_t sum_d2 = 0;
1255 0 : uint64_t sum_sd = 0;
1256 : int32_t i, j;
1257 0 : for (i = 0; i < 8; i++) {
1258 0 : for (j = 0; j < 8; j++) {
1259 0 : sum_s += src[8 * i + j];
1260 0 : sum_d += dst[i * dstride + j];
1261 0 : sum_s2 += src[8 * i + j] * src[8 * i + j];
1262 0 : sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
1263 0 : sum_sd += src[8 * i + j] * dst[i * dstride + j];
1264 : }
1265 : }
1266 : /* Compute the variance -- the calculation cannot go negative. */
1267 0 : svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
1268 0 : dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
1269 0 : return (uint64_t)floor(
1270 0 : .5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
1271 0 : (svar + dvar + (400 << 2 * coeff_shift)) /
1272 0 : (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar)));
1273 : }
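/* The expression above, written out (illustrative): with SSE = sum_d2 + sum_s2 - 2*sum_sd,
       dist = round( SSE / 2 * (svar + dvar + 400 * 2^(2*coeff_shift))
                     / sqrt(20000 * 2^(4*coeff_shift) + svar * dvar) )
   i.e. the raw 8x8 SSE is re-weighted by the source and reconstruction
   variances, which (for luma blocks only) biases the CDEF search by local
   activity instead of using plain MSE. */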
1274 :
1275 0 : static INLINE uint64_t mse_8_16bit(const uint16_t *src, const uint16_t *dst, const int32_t dstride, const int32_t height) {
1276 0 : uint64_t sum = 0;
1277 : int32_t i, j;
1278 0 : for (i = 0; i < height; i++) {
1279 0 : for (j = 0; j < 8; j++) {
1280 0 : int32_t e = dst[i * dstride + j] - src[8 * i + j];
1281 0 : sum += e * e;
1282 : }
1283 : }
1284 0 : return sum;
1285 : }
1286 :
1287 0 : static INLINE uint64_t mse_4_16bit_c(const uint16_t *src, const uint16_t *dst, const int32_t dstride, const int32_t height) {
1288 0 : uint64_t sum = 0;
1289 : int32_t i, j;
1290 0 : for (i = 0; i < height; i++) {
1291 0 : for (j = 0; j < 4; j++) {
1292 0 : int32_t e = dst[i * dstride + j] - src[4 * i + j];
1293 0 : sum += e * e;
1294 : }
1295 : }
1296 0 : return sum;
1297 : }
1298 :
1299 0 : static INLINE uint64_t dist_8x8_8bit_c(const uint8_t *src, const uint8_t *dst, const int32_t dstride, const int32_t coeff_shift) {
1300 0 : uint64_t svar = 0;
1301 0 : uint64_t dvar = 0;
1302 0 : uint64_t sum_s = 0;
1303 0 : uint64_t sum_d = 0;
1304 0 : uint64_t sum_s2 = 0;
1305 0 : uint64_t sum_d2 = 0;
1306 0 : uint64_t sum_sd = 0;
1307 : int32_t i, j;
1308 0 : for (i = 0; i < 8; i++) {
1309 0 : for (j = 0; j < 8; j++) {
1310 0 : sum_s += src[8 * i + j];
1311 0 : sum_d += dst[i * dstride + j];
1312 0 : sum_s2 += src[8 * i + j] * src[8 * i + j];
1313 0 : sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
1314 0 : sum_sd += src[8 * i + j] * dst[i * dstride + j];
1315 : }
1316 : }
1317 : /* Compute the variance -- the calculation cannot go negative. */
1318 0 : svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
1319 0 : dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
1320 0 : return (uint64_t)floor(
1321 0 : .5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
1322 0 : (svar + dvar + (400 << 2 * coeff_shift)) /
1323 0 : (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar)));
1324 : }
1325 :
1326 0 : static INLINE uint64_t mse_8_8bit(const uint8_t *src, const uint8_t *dst, const int32_t dstride, const int32_t height) {
1327 0 : uint64_t sum = 0;
1328 : int32_t i, j;
1329 0 : for (i = 0; i < height; i++) {
1330 0 : for (j = 0; j < 8; j++) {
1331 0 : int32_t e = dst[i * dstride + j] - src[8 * i + j];
1332 0 : sum += e * e;
1333 : }
1334 : }
1335 0 : return sum;
1336 : }
1337 :
1338 0 : static INLINE uint64_t mse_4_8bit_c(const uint8_t *src, const uint8_t *dst, const int32_t dstride, const int32_t height) {
1339 0 : uint64_t sum = 0;
1340 : int32_t i, j;
1341 0 : for (i = 0; i < height; i++) {
1342 0 : for (j = 0; j < 4; j++) {
1343 0 : int32_t e = dst[i * dstride + j] - src[4 * i + j];
1344 0 : sum += e * e;
1345 : }
1346 : }
1347 0 : return sum;
1348 : }
1349 :
1350 : /* Compute MSE only on the blocks we filtered. */
1351 0 : uint64_t compute_cdef_dist_c(const uint16_t *dst, int32_t dstride, const uint16_t *src, const cdef_list *dlist, int32_t cdef_count, BlockSize bsize, int32_t coeff_shift, int32_t pli) {
1352 0 : uint64_t sum = 0;
1353 : int32_t bi, bx, by;
1354 0 : if (bsize == BLOCK_8X8) {
1355 0 : for (bi = 0; bi < cdef_count; bi++) {
1356 0 : by = dlist[bi].by;
1357 0 : bx = dlist[bi].bx;
1358 0 : if (pli == 0) {
1359 0 : sum += dist_8x8_16bit_c(&src[bi << (3 + 3)], &dst[(by << 3) * dstride + (bx << 3)], dstride,
1360 : coeff_shift);
1361 : }
1362 : else
1363 0 : sum += mse_8_16bit(&src[bi << (3 + 3)], &dst[(by << 3) * dstride + (bx << 3)], dstride, 8);
1364 : }
1365 : }
1366 0 : else if (bsize == BLOCK_4X8) {
1367 0 : for (bi = 0; bi < cdef_count; bi++) {
1368 0 : by = dlist[bi].by;
1369 0 : bx = dlist[bi].bx;
1370 0 : sum += mse_4_16bit_c(&src[bi << (3 + 2)], &dst[(by << 3) * dstride + (bx << 2)], dstride, 8);
1371 : }
1372 : }
1373 0 : else if (bsize == BLOCK_8X4) {
1374 0 : for (bi = 0; bi < cdef_count; bi++) {
1375 0 : by = dlist[bi].by;
1376 0 : bx = dlist[bi].bx;
1377 0 : sum += mse_8_16bit(&src[bi << (2 + 3)], &dst[(by << 2) * dstride + (bx << 3)], dstride, 4);
1378 : }
1379 : }
1380 : else {
1381 0 : assert(bsize == BLOCK_4X4);
1382 0 : for (bi = 0; bi < cdef_count; bi++) {
1383 0 : by = dlist[bi].by;
1384 0 : bx = dlist[bi].bx;
1385 0 : sum += mse_4_16bit_c(&src[bi << (2 + 2)], &dst[(by << 2) * dstride + (bx << 2)], dstride, 4);
1386 : }
1387 : }
1388 0 : return sum >> 2 * coeff_shift;
1389 : }
1390 :
1391 0 : uint64_t compute_cdef_dist_8bit_c(const uint8_t *dst8, int32_t dstride, const uint8_t *src8, const cdef_list *dlist, int32_t cdef_count, BlockSize bsize, int32_t coeff_shift, int32_t pli) {
1392 0 : uint64_t sum = 0;
1393 : int32_t bi, bx, by;
1394 0 : if (bsize == BLOCK_8X8) {
1395 0 : for (bi = 0; bi < cdef_count; bi++) {
1396 0 : by = dlist[bi].by;
1397 0 : bx = dlist[bi].bx;
1398 0 : if (pli == 0) {
1399 0 : sum += dist_8x8_8bit_c(&src8[bi << (3 + 3)], &dst8[(by << 3) * dstride + (bx << 3)], dstride,
1400 : coeff_shift);
1401 : }
1402 : else
1403 0 : sum += mse_8_8bit(&src8[bi << (3 + 3)], &dst8[(by << 3) * dstride + (bx << 3)], dstride, 8);
1404 : }
1405 : }
1406 0 : else if (bsize == BLOCK_4X8) {
1407 0 : for (bi = 0; bi < cdef_count; bi++) {
1408 0 : by = dlist[bi].by;
1409 0 : bx = dlist[bi].bx;
1410 0 : sum += mse_4_8bit_c(&src8[bi << (3 + 2)], &dst8[(by << 3) * dstride + (bx << 2)], dstride, 8);
1411 : }
1412 : }
1413 0 : else if (bsize == BLOCK_8X4) {
1414 0 : for (bi = 0; bi < cdef_count; bi++) {
1415 0 : by = dlist[bi].by;
1416 0 : bx = dlist[bi].bx;
1417 0 : sum += mse_8_8bit(&src8[bi << (2 + 3)], &dst8[(by << 2) * dstride + (bx << 3)], dstride, 4);
1418 : }
1419 : }
1420 : else {
1421 0 : assert(bsize == BLOCK_4X4);
1422 0 : for (bi = 0; bi < cdef_count; bi++) {
1423 0 : by = dlist[bi].by;
1424 0 : bx = dlist[bi].bx;
1425 0 : sum += mse_4_8bit_c(&src8[bi << (2 + 2)], &dst8[(by << 2) * dstride + (bx << 2)], dstride, 4);
1426 : }
1427 : }
1428 0 : return sum >> 2 * coeff_shift;
1429 : }
1430 :
1431 120 : void finish_cdef_search(
1432 : EncDecContext *context_ptr,
1433 : SequenceControlSet *sequence_control_set_ptr,
1434 : PictureControlSet *picture_control_set_ptr
1435 : , int32_t selected_strength_cnt[64]
1436 : )
1437 : {
1438 : (void)context_ptr;
1439 120 : int32_t fast = 0;
1440 120 : struct PictureParentControlSet *pPcs = picture_control_set_ptr->parent_pcs_ptr;
1441 120 : FrameHeader *frm_hdr = &pPcs->frm_hdr;
1442 120 : Av1Common* cm = pPcs->av1_cm;
1443 120 : int32_t mi_rows = pPcs->av1_cm->mi_rows;
1444 120 : int32_t mi_cols = pPcs->av1_cm->mi_cols;
1445 :
1446 : int32_t fbr, fbc;
1447 :
1448 : int32_t pli;
1449 :
1450 120 : uint64_t best_tot_mse = (uint64_t)1 << 63;
1451 : uint64_t tot_mse;
1452 : int32_t sb_count;
1453 120 : int32_t nvfb = (mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
1454 120 : int32_t nhfb = (mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
1455 120 : int32_t *sb_index = (int32_t *)malloc(nvfb * nhfb * sizeof(*sb_index));
1456 120 : int32_t *selected_strength = (int32_t *)malloc(nvfb * nhfb * sizeof(*sb_index));
1457 120 : int32_t best_frame_gi_cnt = 0;
1458 120 : const int32_t total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
1459 : int32_t gi_step;
1460 : int32_t mid_gi;
1461 : int32_t start_gi;
1462 : int32_t end_gi;
1463 :
1464 120 : assert(sb_index != NULL);
1465 120 : assert(selected_strength != NULL);
1466 :
1467 120 : gi_step = get_cdef_gi_step(pPcs->cdef_filter_mode);
1468 :
1469 120 : mid_gi = pPcs->cdf_ref_frame_strenght;
1470 120 : start_gi = pPcs->use_ref_frame_cdef_strength && pPcs->cdef_filter_mode == 1 ? (AOMMAX(0, mid_gi - gi_step)) : 0;
1471 120 : end_gi = pPcs->use_ref_frame_cdef_strength ? AOMMIN(total_strengths, mid_gi + gi_step) : pPcs->cdef_filter_mode == 1 ? 8 : total_strengths;
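/* Illustrative search ranges (numbers are hypothetical; get_cdef_gi_step is
   defined elsewhere): with use_ref_frame_cdef_strength set and
   cdef_filter_mode == 1, gi spans [AOMMAX(0, mid_gi - gi_step),
   AOMMIN(total_strengths, mid_gi + gi_step)), e.g. mid_gi = 20 and gi_step = 4
   give [16, 24). Without a usable reference strength, mode 1 scans the first
   8 presets and other modes scan all total_strengths presets. */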
1472 :
1473 : uint64_t(*mse[2])[TOTAL_STRENGTHS];
1474 120 : int32_t pri_damping = 3 + (frm_hdr->quantization_params.base_q_idx >> 6);
1475 : //int32_t sec_damping = 3 + (frm_hdr->quantization_params.base_q_idx >> 6);
1476 : int32_t i;
1477 : int32_t nb_strengths;
1478 : int32_t nb_strength_bits;
1479 : int32_t quantizer;
1480 : double lambda;
1481 120 : const int32_t num_planes = 3;
1482 :
1483 120 : quantizer =
1484 120 : eb_av1_ac_quant_Q3(frm_hdr->quantization_params.base_q_idx, 0, (AomBitDepth)sequence_control_set_ptr->static_config.encoder_bit_depth) >> (sequence_control_set_ptr->static_config.encoder_bit_depth - 8);
1485 120 : lambda = .12 * quantizer * quantizer / 256.;
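/* Illustrative numbers only: quantizer = 32 gives
   lambda = .12 * 32 * 32 / 256. = 0.48. lambda converts the signalling bit
   counts added below into the same units as the accumulated MSE. */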
1486 :
1487 120 : mse[0] = (uint64_t(*)[64])malloc(sizeof(**mse) * nvfb * nhfb);
1488 120 : mse[1] = (uint64_t(*)[64])malloc(sizeof(**mse) * nvfb * nhfb);
1489 :
1490 120 : sb_count = 0;
1491 840 : for (fbr = 0; fbr < nvfb; ++fbr) {
1492 7920 : for (fbc = 0; fbc < nhfb; ++fbc) {
1493 7200 : ModeInfo **mi = picture_control_set_ptr->mi_grid_base + MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc;
1494 7200 : const MbModeInfo *mbmi = &mi[0]->mbmi;
1495 :
1496 7200 : if (((fbc & 1) &&
1497 3600 : (mbmi->block_mi.sb_type == BLOCK_128X128 || mbmi->block_mi.sb_type == BLOCK_128X64)) ||
1498 7200 : ((fbr & 1) &&
1499 3600 : (mbmi->block_mi.sb_type == BLOCK_128X128 || mbmi->block_mi.sb_type == BLOCK_64X128)))
1500 : {
1501 0 : continue;
1502 : }
1503 :
1504 : // No filtering if the entire filter block is skipped
1505 7200 : if (eb_sb_all_skip(picture_control_set_ptr, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
1506 5765 : continue;
1507 :
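/* mse_seg[0]/mse_seg[1] are assumed to hold the per-SB luma and combined
   chroma costs filled by the earlier segment-based CDEF search; here they are
   only gathered into mse[0]/mse[1] for the frame-level strength selection. */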
1508 5740 : for (pli = 0; pli < num_planes; pli++) {
1509 4305 : if (pli == 0)
1510 1435 : memcpy(mse[0][sb_count], picture_control_set_ptr->mse_seg[0][fbr*nhfb + fbc], TOTAL_STRENGTHS * sizeof(uint64_t));
1511 4305 : if (pli == 2)
1512 1435 : memcpy(mse[1][sb_count], picture_control_set_ptr->mse_seg[1][fbr*nhfb + fbc], TOTAL_STRENGTHS * sizeof(uint64_t));
1513 4305 : sb_index[sb_count] = MI_SIZE_64X64 * fbr * picture_control_set_ptr->mi_stride + MI_SIZE_64X64 * fbc;
1514 : }
1515 1435 : sb_count++;
1516 : }
1517 : }
1518 :
1519 120 : nb_strength_bits = 0;
1520 : /* Search for different number of signalling bits. */
1521 600 : for (i = 0; i <= 3; i++) {
1522 : int32_t j;
1523 : int32_t best_lev0[CDEF_MAX_STRENGTHS];
1524 480 : int32_t best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
1525 480 : nb_strengths = 1 << i;
1526 480 : if (num_planes >= 3)
1527 480 : tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, mse, sb_count, fast, start_gi, end_gi);
1528 : else
1529 0 : tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count, fast, start_gi, end_gi);
1530 : /* Count superblock signalling cost. */
1531 480 : tot_mse += (uint64_t)(sb_count * lambda * i);
1532 : /* Count header signalling cost. */
1533 480 : tot_mse += (uint64_t)(nb_strengths * lambda * CDEF_STRENGTH_BITS);
1534 480 : if (tot_mse < best_tot_mse) {
1535 163 : best_tot_mse = tot_mse;
1536 163 : nb_strength_bits = i;
1537 439 : for (j = 0; j < 1 << nb_strength_bits; j++) {
1538 276 : frm_hdr->CDEF_params.cdef_y_strength[j] = best_lev0[j];
1539 276 : frm_hdr->CDEF_params.cdef_uv_strength[j] = best_lev1[j];
1540 : }
1541 : }
1542 : }
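/* The quantity minimized is distortion plus signalling cost:
   sb_count * lambda * i for coding an i-bit strength index per filtered SB,
   plus nb_strengths * lambda * CDEF_STRENGTH_BITS for the presets in the frame
   header. Illustrative numbers only: sb_count = 1435, lambda = 0.48 and i = 2
   make the per-SB term (uint64_t)(1435 * 0.48 * 2) = 1377. */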
1543 120 : nb_strengths = 1 << nb_strength_bits;
1544 :
1545 120 : frm_hdr->CDEF_params.cdef_bits = nb_strength_bits;
1546 120 : pPcs->nb_cdef_strengths = nb_strengths;
1547 1555 : for (i = 0; i < sb_count; i++) {
1548 : int32_t gi;
1549 : int32_t best_gi;
1550 1435 : uint64_t best_mse = (uint64_t)1 << 63;
1551 1435 : best_gi = 0;
1552 6529 : for (gi = 0; gi < pPcs->nb_cdef_strengths; gi++) {
1553 5094 : uint64_t curr = mse[0][i][frm_hdr->CDEF_params.cdef_y_strength[gi]];
1554 5094 : if (num_planes >= 3) curr += mse[1][i][frm_hdr->CDEF_params.cdef_uv_strength[gi]];
1555 5094 : if (curr < best_mse) {
1556 2276 : best_gi = gi;
1557 2276 : best_mse = curr;
1558 : }
1559 : }
1560 1435 : selected_strength[i] = best_gi;
1561 1435 : selected_strength_cnt[best_gi]++;
1562 :
1563 1435 : picture_control_set_ptr->mi_grid_base[sb_index[i]]->mbmi.cdef_strength = (int8_t)best_gi;
    1564             :         //in case the fb is within a 128x128, 128x64, or 64x128 block, we generate the param only for the first 64x64,
    1565             :         //and since our mi map does not have the multi-pointer single-data assignment, we need to duplicate the data.
1566 1435 : BlockSize sb_type = picture_control_set_ptr->mi_grid_base[sb_index[i]]->mbmi.block_mi.sb_type;
1567 :
1568 1435 : switch (sb_type)
1569 : {
1570 0 : case BLOCK_128X128:
1571 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64]->mbmi.cdef_strength = (int8_t)best_gi;
1572 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * picture_control_set_ptr->mi_stride]->mbmi.cdef_strength = (int8_t)best_gi;
1573 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * picture_control_set_ptr->mi_stride + MI_SIZE_64X64]->mbmi.cdef_strength = (int8_t)best_gi;
1574 0 : break;
1575 0 : case BLOCK_128X64:
1576 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64]->mbmi.cdef_strength = (int8_t)best_gi;
1577 0 : break;
1578 0 : case BLOCK_64X128:
1579 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * picture_control_set_ptr->mi_stride]->mbmi.cdef_strength = (int8_t)best_gi;
1580 0 : break;
1581 1435 : default:
1582 1435 : break;
1583 : }
1584 : }
1585 :
1586 120 : if (fast) {
1587 0 : for (int32_t j = 0; j < nb_strengths; j++) {
1588 0 : frm_hdr->CDEF_params.cdef_y_strength[j] = priconv[frm_hdr->CDEF_params.cdef_y_strength[j] / CDEF_SEC_STRENGTHS] * CDEF_SEC_STRENGTHS + (frm_hdr->CDEF_params.cdef_y_strength[j] % CDEF_SEC_STRENGTHS);
1589 0 : frm_hdr->CDEF_params.cdef_uv_strength[j] = priconv[frm_hdr->CDEF_params.cdef_uv_strength[j] / CDEF_SEC_STRENGTHS] * CDEF_SEC_STRENGTHS + (frm_hdr->CDEF_params.cdef_uv_strength[j] % CDEF_SEC_STRENGTHS);
1590 : }
1591 : }
1592 : //cdef_pri_damping & cdef_sec_damping consolidated to cdef_damping
1593 120 : frm_hdr->CDEF_params.cdef_damping = pri_damping;
1594 : //pPcs->cdef_pri_damping = pri_damping;
1595 : //pPcs->cdef_sec_damping = sec_damping;
1596 7800 : for (int i = 0; i < total_strengths; i++)
1597 7680 : best_frame_gi_cnt += selected_strength_cnt[i] > best_frame_gi_cnt ? 1 : 0;
1598 120 : pPcs->cdef_frame_strength = ((best_frame_gi_cnt + 4) / 4) * 4;
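/* ((x + 4) / 4) * 4 maps the count to the smallest multiple of 4 strictly
   greater than it: counts 0..3 give cdef_frame_strength = 4, counts 4..7 give
   8, and so on. */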
1599 :
1600 120 : free(mse[0]);
1601 120 : free(mse[1]);
1602 120 : free(sb_index);
1603 120 : free(selected_strength);
1604 120 : }
1605 :
1606 0 : void eb_av1_cdef_search(
1607 : EncDecContext *context_ptr,
1608 : SequenceControlSet *sequence_control_set_ptr,
1609 : PictureControlSet *picture_control_set_ptr
1610 : //Yv12BufferConfig *frame,
1611 : //const Yv12BufferConfig *ref,
1612 : //Av1Common *cm,
1613 : //MacroBlockD *xd,
1614 : //int32_t fast
1615 : )
1616 : {
1617 : (void)context_ptr;
1618 0 : int32_t fast = 0;
1619 0 : struct PictureParentControlSet *pPcs = picture_control_set_ptr->parent_pcs_ptr;
1620 0 : FrameHeader *frm_hdr = &pPcs->frm_hdr;
1621 0 : Av1Common* cm = pPcs->av1_cm;
1622 0 : int32_t mi_rows = pPcs->av1_cm->mi_rows;
1623 0 : int32_t mi_cols = pPcs->av1_cm->mi_cols;
1624 :
1625 : EbPictureBufferDesc * recon_picture_ptr;
1626 0 : if (pPcs->is_used_as_reference_flag == EB_TRUE)
1627 0 : recon_picture_ptr = ((EbReferenceObject*)picture_control_set_ptr->parent_pcs_ptr->reference_picture_wrapper_ptr->object_ptr)->reference_picture;
1628 : else
1629 0 : recon_picture_ptr = picture_control_set_ptr->recon_picture_ptr;
1630 :
1631 0 : EbByte reconBufferY = &((recon_picture_ptr->buffer_y)[recon_picture_ptr->origin_x + recon_picture_ptr->origin_y * recon_picture_ptr->stride_y]);
1632 0 : EbByte reconBufferCb = &((recon_picture_ptr->buffer_cb)[recon_picture_ptr->origin_x / 2 + recon_picture_ptr->origin_y / 2 * recon_picture_ptr->stride_cb]);
1633 0 : EbByte reconBufferCr = &((recon_picture_ptr->buffer_cr)[recon_picture_ptr->origin_x / 2 + recon_picture_ptr->origin_y / 2 * recon_picture_ptr->stride_cr]);
1634 :
1635 0 : EbPictureBufferDesc *input_picture_ptr = (EbPictureBufferDesc*)picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
1636 0 : EbByte inputBufferY = &((input_picture_ptr->buffer_y)[input_picture_ptr->origin_x + input_picture_ptr->origin_y * input_picture_ptr->stride_y]);
1637 0 : EbByte inputBufferCb = &((input_picture_ptr->buffer_cb)[input_picture_ptr->origin_x / 2 + input_picture_ptr->origin_y / 2 * input_picture_ptr->stride_cb]);
1638 0 : EbByte inputBufferCr = &((input_picture_ptr->buffer_cr)[input_picture_ptr->origin_x / 2 + input_picture_ptr->origin_y / 2 * input_picture_ptr->stride_cr]);
1639 :
1640 : int32_t r, c;
1641 : int32_t fbr, fbc;
1642 : uint16_t *src[3];
1643 : uint16_t *ref_coeff[3];
1644 : /*static*/ cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
1645 0 : int32_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
1646 0 : int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
1647 : int32_t stride[3];
1648 : int32_t bsize[3];
1649 : int32_t mi_wide_l2[3];
1650 : int32_t mi_high_l2[3];
1651 : int32_t xdec[3];
1652 : int32_t ydec[3];
1653 : int32_t pli;
1654 : int32_t cdef_count;
1655 :
1656 : //CHKN int32_t coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
1657 0 : int32_t coeff_shift = AOMMAX(sequence_control_set_ptr->static_config.encoder_bit_depth - 8, 0);
1658 :
1659 0 : uint64_t best_tot_mse = (uint64_t)1 << 63;
1660 : uint64_t tot_mse;
1661 : int32_t sb_count;
1662 :
1663 0 : int32_t nvfb = (mi_rows /*cm->mi_rows*/ + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
1664 0 : int32_t nhfb = (mi_cols/*cm->mi_cols*/ + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
1665 :
1666 0 : int32_t *sb_index = (int32_t *)eb_aom_malloc(nvfb * nhfb * sizeof(*sb_index)); //CHKN add cast
1667 0 : int32_t *selected_strength = (int32_t *)eb_aom_malloc(nvfb * nhfb * sizeof(*sb_index));
1668 :
1669 0 : assert(sb_index != NULL);
1670 0 : assert(selected_strength != NULL);
1671 :
1672 : uint64_t(*mse[2])[TOTAL_STRENGTHS];
1673 0 : int32_t pri_damping = 3 + (frm_hdr->quantization_params.base_q_idx /*cm->quant_param.base_q_idx*/ >> 6);
1674 0 : int32_t sec_damping = 3 + (frm_hdr->quantization_params.base_q_idx /*cm->quant_param.base_q_idx*/ >> 6);
1675 : int32_t i;
1676 : int32_t nb_strengths;
1677 : int32_t nb_strength_bits;
1678 : int32_t quantizer;
1679 : double lambda;
1680 0 : const int32_t num_planes = 3;// av1_num_planes(cm);
1681 0 : const int32_t total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
1682 : DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
1683 : uint16_t *in;
1684 : DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
1685 :
1686 0 : int32_t selected_strength_cnt[TOTAL_STRENGTHS] = { 0 };
1687 0 : int32_t best_frame_gi_cnt = 0;
1688 0 : int32_t gi_step = get_cdef_gi_step(pPcs->cdef_filter_mode);
1689 0 : int32_t mid_gi = pPcs->cdf_ref_frame_strenght;
1690 0 : int32_t start_gi = pPcs->use_ref_frame_cdef_strength && pPcs->cdef_filter_mode == 1 ? (AOMMAX(0, mid_gi - gi_step)) : 0;
1691 0 : int32_t end_gi = pPcs->use_ref_frame_cdef_strength ? AOMMIN(total_strengths, mid_gi + gi_step) : pPcs->cdef_filter_mode == 1 ? 8 : total_strengths;
1692 :
1693 0 : quantizer =
1694 : //CHKN av1_ac_quant_Q3(cm->quant_param.base_q_idx, 0, cm->bit_depth) >> (cm->bit_depth - 8);
1695 0 : eb_av1_ac_quant_Q3(frm_hdr->quantization_params.base_q_idx, 0, (AomBitDepth)sequence_control_set_ptr->static_config.encoder_bit_depth) >> (sequence_control_set_ptr->static_config.encoder_bit_depth - 8);
1696 0 : lambda = .12 * quantizer * quantizer / 256.;
1697 :
1698 : //eb_av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, num_planes);
1699 :
1700 0 : mse[0] = (uint64_t(*)[64])eb_aom_malloc(sizeof(**mse) * nvfb * nhfb);
1701 0 : mse[1] = (uint64_t(*)[64])eb_aom_malloc(sizeof(**mse) * nvfb * nhfb);
1702 :
1703 0 : for (pli = 0; pli < num_planes; pli++) {
1704 0 : uint8_t *in_buffer = 0;
1705 0 : int32_t in_stride = 0;
1706 :
1707 0 : uint8_t *ref_buffer = 0;
1708 0 : int32_t ref_stride = 0;
1709 0 : switch (pli) {
1710 0 : case 0:
1711 0 : ref_buffer = inputBufferY;
1712 0 : ref_stride = input_picture_ptr->stride_y;
1713 0 : in_buffer = reconBufferY;
1714 0 : in_stride = recon_picture_ptr->stride_y;
1715 0 : break;
1716 0 : case 1:
1717 0 : ref_buffer = inputBufferCb;
1718 0 : ref_stride = input_picture_ptr->stride_cb;
1719 0 : in_buffer = reconBufferCb;
1720 0 : in_stride = recon_picture_ptr->stride_cb;
1721 0 : break;
1722 0 : case 2:
1723 0 : ref_buffer = inputBufferCr;
1724 0 : ref_stride = input_picture_ptr->stride_cr;
1725 0 : in_buffer = reconBufferCr;
1726 0 : in_stride = recon_picture_ptr->stride_cr;
1727 0 : break;
1728 : }
1729 :
    1730             :         ///CHKN: allocate one full 16-bit frame for src and recon
1731 0 : src[pli] = (uint16_t*)eb_aom_memalign(32, sizeof(*src) * mi_rows * mi_cols * MI_SIZE * MI_SIZE);
1732 0 : ref_coeff[pli] = (uint16_t*)eb_aom_memalign(32, sizeof(*ref_coeff) * mi_rows * mi_cols * MI_SIZE * MI_SIZE);
1733 :
1734 0 : int32_t subsampling_x = (pli == 0) ? 0 : 1;
1735 0 : int32_t subsampling_y = (pli == 0) ? 0 : 1;
1736 :
1737 0 : xdec[pli] = subsampling_x; //CHKN xd->plane[pli].subsampling_x;
1738 0 : ydec[pli] = subsampling_y; //CHKN xd->plane[pli].subsampling_y;
1739 0 : bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
1740 0 : : (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
1741 :
1742 0 : stride[pli] = cm->mi_cols << MI_SIZE_LOG2;
1743 0 : mi_wide_l2[pli] = MI_SIZE_LOG2 - subsampling_x; //CHKN MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
1744 0 : mi_high_l2[pli] = MI_SIZE_LOG2 - subsampling_y; //CHKN MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
1745 :
1746 0 : const int32_t frame_height = (cm->mi_rows * MI_SIZE) >> subsampling_y;//CHKN xd->plane[pli].subsampling_y;
1747 0 : const int32_t frame_width = (cm->mi_cols * MI_SIZE) >> subsampling_x;//CHKN xd->plane[pli].subsampling_x;
1748 :
1749 0 : for (r = 0; r < frame_height; ++r) {
1750 0 : for (c = 0; c < frame_width; ++c) {
1751 : //if (cm->use_highbitdepth) {
1752 : // src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
1753 : // xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
1754 : // ref_coeff[pli][r * stride[pli] + c] =
1755 : // CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
1756 : //}
1757 : //else
1758 : {
1759 0 : src[pli][r * stride[pli] + c] = in_buffer[r * in_stride + c];//CHKN xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
1760 0 : ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c];
1761 : }
1762 : }
1763 : }
1764 : }
1765 :
1766 0 : in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
1767 0 : sb_count = 0;
1768 0 : for (fbr = 0; fbr < nvfb; ++fbr) {
1769 0 : for (fbc = 0; fbc < nhfb; ++fbc) {
1770 : int32_t nvb, nhb;
1771 : int32_t gi;
1772 0 : int32_t dirinit = 0;
1773 0 : nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
1774 0 : nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
    1776           0 :             int32_t hb_step = 1; //CHKN these should always be 1 when only 64x64 LCUs are used
1776 0 : int32_t vb_step = 1;
1777 0 : BlockSize bs = BLOCK_64X64;
1778 0 : ModeInfo **mi = picture_control_set_ptr->mi_grid_base + MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc;
1779 0 : const MbModeInfo *mbmi = &mi[0]->mbmi;
1780 :
1781 : //MbModeInfo *const mbmi =
1782 : // cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
1783 : // MI_SIZE_64X64 * fbc];
1784 :
1785 0 : if (((fbc & 1) &&
1786 0 : (mbmi->block_mi.sb_type == BLOCK_128X128 || mbmi->block_mi.sb_type == BLOCK_128X64)) ||
1787 0 : ((fbr & 1) &&
1788 0 : (mbmi->block_mi.sb_type == BLOCK_128X128 || mbmi->block_mi.sb_type == BLOCK_64X128)))
1789 0 : continue;
1790 0 : if (mbmi->block_mi.sb_type == BLOCK_128X128 || mbmi->block_mi.sb_type == BLOCK_128X64 ||
1791 0 : mbmi->block_mi.sb_type == BLOCK_64X128)
1792 0 : bs = mbmi->block_mi.sb_type;
1793 0 : if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
1794 0 : nhb = AOMMIN(MI_SIZE_128X128, cm->mi_cols - MI_SIZE_64X64 * fbc);
1795 0 : hb_step = 2;
1796 : }
1797 0 : if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
1798 0 : nvb = AOMMIN(MI_SIZE_128X128, cm->mi_rows - MI_SIZE_64X64 * fbr);
1799 0 : vb_step = 2;
1800 : }
1801 :
1802 : // No filtering if the entire filter block is skipped
1803 0 : if (eb_sb_all_skip(picture_control_set_ptr, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
1804 0 : continue;
1805 :
1806 0 : cdef_count = eb_sb_compute_cdef_list(picture_control_set_ptr, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
1807 :
1808 0 : for (pli = 0; pli < num_planes; pli++) {
1809 0 : for (i = 0; i < CDEF_INBUF_SIZE; i++)
1810 0 : inbuf[i] = CDEF_VERY_LARGE;
1811 0 : int32_t yoff = CDEF_VBORDER * (fbr != 0);
1812 0 : int32_t xoff = CDEF_HBORDER * (fbc != 0);
1813 0 : int32_t ysize = (nvb << mi_high_l2[pli]) + CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff;
1814 0 : int32_t xsize = (nhb << mi_wide_l2[pli]) + CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff;
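                /* Copy the SB into the 16-bit work buffer with CDEF_VBORDER/CDEF_HBORDER
                   lines of context only on sides where a neighbouring SB exists (yoff/xoff
                   above and to the left, the fbr + vb_step / fbc + hb_step tests below and
                   to the right); sides without context keep the CDEF_VERY_LARGE fill from
                   the inbuf initialization above. */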
1815 :
1816 0 : copy_sb16_16(
1817 0 : &in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
1818 0 : src[pli],
1819 0 : (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
1820 0 : (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
1821 : stride[pli], ysize, xsize);
1822 :
1823 0 : for (gi = start_gi; gi < end_gi; gi++) {
1824 : int32_t threshold;
1825 : uint64_t curr_mse;
1826 : int32_t sec_strength;
1827 0 : threshold = gi / CDEF_SEC_STRENGTHS;
1828 0 : if (fast) threshold = priconv[threshold];
1829 : /* We avoid filtering the pixels for which some of the pixels to
1830 : average are outside the frame. We could change the filter instead, but it would add special cases for any future vectorization. */
1831 0 : sec_strength = gi % CDEF_SEC_STRENGTHS;
1832 0 : eb_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli],
1833 : dir, &dirinit, var, pli, dlist, cdef_count, threshold,
1834 0 : sec_strength + (sec_strength == 3), pri_damping,
1835 : sec_damping, coeff_shift);
1836 :
1837 0 : curr_mse = eb_compute_cdef_dist(
1838 0 : ref_coeff[pli] +
1839 0 : (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
1840 0 : (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]),
1841 0 : stride[pli], tmp_dst, dlist, cdef_count, (BlockSize)bsize[pli], coeff_shift,
1842 : pli);
1843 :
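                    /* Luma costs go to mse[0] and Cb costs to mse[1]; Cr (pli == 2)
                       accumulates into mse[1], so both chroma planes share one per-SB
                       cost table. */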
1844 0 : if (pli < 2)
1845 0 : mse[pli][sb_count][gi] = curr_mse;
1846 : else
1847 0 : mse[1][sb_count][gi] += curr_mse;
1848 :
1849 0 : sb_index[sb_count] = MI_SIZE_64X64 * fbr * picture_control_set_ptr->mi_stride + MI_SIZE_64X64 * fbc;//CHKN
1850 : }
1851 : }
1852 0 : sb_count++;
1853 : }
1854 : }
1855 :
1856 0 : nb_strength_bits = 0;
1857 : /* Search for different number of signalling bits. */
1858 0 : for (i = 0; i <= 3; i++) {
1859 : int32_t j;
1860 : int32_t best_lev0[CDEF_MAX_STRENGTHS];
1861 0 : int32_t best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
1862 0 : nb_strengths = 1 << i;
1863 0 : if (num_planes >= 3)
1864 0 : tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, mse, sb_count, fast, start_gi, end_gi);
1865 : else
1866 0 : tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count, fast, start_gi, end_gi);
1867 : /* Count superblock signalling cost. */
1868 0 : tot_mse += (uint64_t)(sb_count * lambda * i);
1869 : /* Count header signalling cost. */
1870 0 : tot_mse += (uint64_t)(nb_strengths * lambda * CDEF_STRENGTH_BITS);
1871 0 : if (tot_mse < best_tot_mse) {
1872 0 : best_tot_mse = tot_mse;
1873 0 : nb_strength_bits = i;
1874 0 : for (j = 0; j < 1 << nb_strength_bits; j++) {
1875 0 : frm_hdr->CDEF_params.cdef_y_strength[j] = best_lev0[j];
1876 0 : frm_hdr->CDEF_params.cdef_uv_strength[j] = best_lev1[j];
1877 : }
1878 : }
1879 : }
1880 0 : nb_strengths = 1 << nb_strength_bits;
1881 :
1882 0 : /*cm*/frm_hdr->CDEF_params.cdef_bits = nb_strength_bits;
1883 0 : /*cm*/pPcs->nb_cdef_strengths = nb_strengths;
1884 0 : for (i = 0; i < sb_count; i++) {
1885 : int32_t gi;
1886 : int32_t best_gi;
1887 0 : uint64_t best_mse = (uint64_t)1 << 63;
1888 0 : best_gi = 0;
1889 0 : for (gi = 0; gi < /*cm*/pPcs->nb_cdef_strengths; gi++) {
1890 0 : uint64_t curr = mse[0][i][/*cm*/frm_hdr->CDEF_params.cdef_y_strength[gi]];
1891 0 : if (num_planes >= 3) curr += mse[1][i][/*cm*/frm_hdr->CDEF_params.cdef_uv_strength[gi]];
1892 0 : if (curr < best_mse) {
1893 0 : best_gi = gi;
1894 0 : best_mse = curr;
1895 : }
1896 : }
1897 0 : selected_strength[i] = best_gi;
1898 0 : selected_strength_cnt[best_gi]++;
1899 :
1900 : //CHKN cm->mi_grid_visible[sb_index[i]]->cdef_strength = best_gi;
1901 0 : picture_control_set_ptr->mi_grid_base[sb_index[i]]->mbmi.cdef_strength = (int8_t)best_gi;
    1902             :         //in case the fb is within a 128x128, 128x64, or 64x128 block, we generate the param only for the first 64x64,
    1903             :         //and since our mi map does not have the multi-pointer single-data assignment, we need to duplicate the data.
1904 0 : BlockSize sb_type = picture_control_set_ptr->mi_grid_base[sb_index[i]]->mbmi.block_mi.sb_type;
1905 :
1906 0 : if (sb_type == BLOCK_128X128)
1907 : {
1908 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64]->mbmi.cdef_strength = (int8_t)best_gi;
1909 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * picture_control_set_ptr->mi_stride]->mbmi.cdef_strength = (int8_t)best_gi;
1910 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * picture_control_set_ptr->mi_stride + MI_SIZE_64X64]->mbmi.cdef_strength = (int8_t)best_gi;
1911 : }
1912 0 : else if (sb_type == BLOCK_128X64)
1913 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64]->mbmi.cdef_strength = (int8_t)best_gi;
1914 0 : else if (sb_type == BLOCK_64X128)
1915 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * picture_control_set_ptr->mi_stride]->mbmi.cdef_strength = (int8_t)best_gi;
1916 : }
1917 :
1918 0 : if (fast) {
1919 0 : for (int32_t j = 0; j < nb_strengths; j++) {
1920 0 : frm_hdr->CDEF_params.cdef_y_strength[j] = priconv[frm_hdr->CDEF_params.cdef_y_strength[j] / CDEF_SEC_STRENGTHS] * CDEF_SEC_STRENGTHS + (frm_hdr->CDEF_params.cdef_y_strength[j] % CDEF_SEC_STRENGTHS);
1921 0 : frm_hdr->CDEF_params.cdef_uv_strength[j] = priconv[frm_hdr->CDEF_params.cdef_uv_strength[j] / CDEF_SEC_STRENGTHS] * CDEF_SEC_STRENGTHS + (frm_hdr->CDEF_params.cdef_uv_strength[j] % CDEF_SEC_STRENGTHS);
1922 : }
1923 : }
1924 :
1925 0 : for (int i = 0; i < total_strengths; i++)
1926 0 : best_frame_gi_cnt += selected_strength_cnt[i] > best_frame_gi_cnt ? 1 : 0;
1927 0 : pPcs->cdef_frame_strength = ((best_frame_gi_cnt + 4) / 4) * 4;
1928 :
1929 0 : frm_hdr->CDEF_params.cdef_damping = pri_damping;
1930 : //pPcs->cdef_pri_damping = pri_damping;
1931 : //pPcs->cdef_sec_damping = sec_damping;
1932 :
1933 0 : eb_aom_free(mse[0]);
1934 0 : eb_aom_free(mse[1]);
1935 0 : for (pli = 0; pli < num_planes; pli++) {
1936 0 : eb_aom_free(src[pli]);
1937 0 : eb_aom_free(ref_coeff[pli]);
1938 : }
1939 0 : eb_aom_free(sb_index);
1940 0 : eb_aom_free(selected_strength);
1941 0 : }
1942 :
1943 0 : void av1_cdef_search16bit(
1944 : EncDecContext *context_ptr,
1945 : SequenceControlSet *sequence_control_set_ptr,
1946 : PictureControlSet *picture_control_set_ptr
1947 : //Yv12BufferConfig *frame,
1948 : //const Yv12BufferConfig *ref,
1949 : //Av1Common *cm,
1950 : //MacroBlockD *xd,
1951 : //int32_t fast
1952 : )
1953 : {
1954 : (void)context_ptr;
1955 0 : int32_t fast = 0;
1956 0 : struct PictureParentControlSet *pPcs = picture_control_set_ptr->parent_pcs_ptr;
1957 0 : FrameHeader *frm_hdr = &pPcs->frm_hdr;
1958 0 : Av1Common* cm = pPcs->av1_cm;
1959 0 : int32_t mi_rows = pPcs->av1_cm->mi_rows;
1960 0 : int32_t mi_cols = pPcs->av1_cm->mi_cols;
1961 :
1962 : EbPictureBufferDesc * recon_picture_ptr;
1963 0 : if (pPcs->is_used_as_reference_flag == EB_TRUE)
1964 0 : recon_picture_ptr = ((EbReferenceObject*)picture_control_set_ptr->parent_pcs_ptr->reference_picture_wrapper_ptr->object_ptr)->reference_picture16bit;
1965 : else
1966 0 : recon_picture_ptr = picture_control_set_ptr->recon_picture16bit_ptr;
1967 :
1968 0 : uint16_t* reconBufferY = (uint16_t*)recon_picture_ptr->buffer_y + (recon_picture_ptr->origin_x + recon_picture_ptr->origin_y * recon_picture_ptr->stride_y);
1969 0 : uint16_t* reconBufferCb = (uint16_t*)recon_picture_ptr->buffer_cb + (recon_picture_ptr->origin_x / 2 + recon_picture_ptr->origin_y / 2 * recon_picture_ptr->stride_cb);
1970 0 : uint16_t* reconBufferCr = (uint16_t*)recon_picture_ptr->buffer_cr + (recon_picture_ptr->origin_x / 2 + recon_picture_ptr->origin_y / 2 * recon_picture_ptr->stride_cr);
1971 :
1972 0 : EbPictureBufferDesc *input_picture_ptr = picture_control_set_ptr->input_frame16bit;
1973 0 : uint16_t* inputBufferY = (uint16_t*)input_picture_ptr->buffer_y + (input_picture_ptr->origin_x + input_picture_ptr->origin_y * input_picture_ptr->stride_y);
1974 0 : uint16_t* inputBufferCb = (uint16_t*)input_picture_ptr->buffer_cb + (input_picture_ptr->origin_x / 2 + input_picture_ptr->origin_y / 2 * input_picture_ptr->stride_cb);
1975 0 : uint16_t* inputBufferCr = (uint16_t*)input_picture_ptr->buffer_cr + (input_picture_ptr->origin_x / 2 + input_picture_ptr->origin_y / 2 * input_picture_ptr->stride_cr);
1976 :
1977 : int32_t r, c;
1978 : int32_t fbr, fbc;
1979 : uint16_t *src[3];
1980 : uint16_t *ref_coeff[3];
1981 : /*static*/ cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
1982 0 : int32_t dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
1983 0 : int32_t var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
1984 : int32_t stride[3];
1985 : int32_t bsize[3];
1986 : int32_t mi_wide_l2[3];
1987 : int32_t mi_high_l2[3];
1988 : int32_t xdec[3];
1989 : int32_t ydec[3];
1990 : int32_t pli;
1991 : int32_t cdef_count;
1992 :
1993 : //CHKN int32_t coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
1994 0 : int32_t coeff_shift = AOMMAX(sequence_control_set_ptr->static_config.encoder_bit_depth - 8, 0);
1995 :
1996 0 : uint64_t best_tot_mse = (uint64_t)1 << 63;
1997 : uint64_t tot_mse;
1998 : int32_t sb_count;
1999 :
2000 0 : int32_t nvfb = (mi_rows /*cm->mi_rows*/ + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
2001 0 : int32_t nhfb = (mi_cols/*cm->mi_cols*/ + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
2002 :
2003 0 : int32_t *sb_index = (int32_t *)eb_aom_malloc(nvfb * nhfb * sizeof(*sb_index)); //CHKN add cast
2004 0 : int32_t *selected_strength = (int32_t *)eb_aom_malloc(nvfb * nhfb * sizeof(*sb_index));
2005 :
2006 0 : assert(sb_index);
2007 0 : assert(selected_strength);
2008 :
2009 : uint64_t(*mse[2])[TOTAL_STRENGTHS];
2010 :
2011 0 : int32_t pri_damping = 3 + (frm_hdr->quantization_params.base_q_idx /*cm->quant_param.base_q_idx*/ >> 6);
2012 0 : int32_t sec_damping = 3 + (frm_hdr->quantization_params.base_q_idx /*cm->quant_param.base_q_idx*/ >> 6);
2013 : int32_t i;
2014 : int32_t nb_strengths;
2015 : int32_t nb_strength_bits;
2016 : int32_t quantizer;
2017 : double lambda;
2018 0 : const int32_t num_planes = 3;// av1_num_planes(cm);
2019 0 : const int32_t total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
2020 : DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
2021 : uint16_t *in;
2022 : DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
2023 :
2024 0 : int32_t selected_strength_cnt[TOTAL_STRENGTHS] = { 0 };
2025 0 : int32_t best_frame_gi_cnt = 0;
2026 0 : int32_t gi_step = get_cdef_gi_step(pPcs->cdef_filter_mode);
2027 0 : int32_t mid_gi = pPcs->cdf_ref_frame_strenght;
2028 0 : int32_t start_gi = pPcs->use_ref_frame_cdef_strength && pPcs->cdef_filter_mode == 1 ? (AOMMAX(0, mid_gi - gi_step)) : 0;
2029 0 : int32_t end_gi = pPcs->use_ref_frame_cdef_strength ? AOMMIN(total_strengths, mid_gi + gi_step) : pPcs->cdef_filter_mode == 1 ? 8 : total_strengths;
2030 :
2031 0 : quantizer =
2032 : //CHKN av1_ac_quant_Q3(cm->quant_param.base_q_idx, 0, cm->bit_depth) >> (cm->bit_depth - 8);
2033 0 : eb_av1_ac_quant_Q3(frm_hdr->quantization_params.base_q_idx, 0, (AomBitDepth)sequence_control_set_ptr->static_config.encoder_bit_depth) >> (sequence_control_set_ptr->static_config.encoder_bit_depth - 8);
2034 0 : lambda = .12 * quantizer * quantizer / 256.;
2035 :
2036 : //eb_av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, num_planes);
2037 :
2038 0 : mse[0] = (uint64_t(*)[64])eb_aom_malloc(sizeof(**mse) * nvfb * nhfb);
2039 0 : mse[1] = (uint64_t(*)[64])eb_aom_malloc(sizeof(**mse) * nvfb * nhfb);
2040 :
2041 0 : for (pli = 0; pli < num_planes; pli++) {
2042 0 : uint16_t *in_buffer = 0;
2043 0 : int32_t in_stride = 0;
2044 :
2045 0 : uint16_t *ref_buffer = 0;
2046 0 : int32_t ref_stride = 0;
2047 0 : switch (pli) {
2048 0 : case 0:
2049 0 : ref_buffer = inputBufferY;
2050 0 : ref_stride = input_picture_ptr->stride_y;
2051 0 : in_buffer = reconBufferY;
2052 0 : in_stride = recon_picture_ptr->stride_y;
2053 0 : break;
2054 0 : case 1:
2055 0 : ref_buffer = inputBufferCb;
2056 0 : ref_stride = input_picture_ptr->stride_cb;
2057 0 : in_buffer = reconBufferCb;
2058 0 : in_stride = recon_picture_ptr->stride_cb;
2059 0 : break;
2060 0 : case 2:
2061 0 : ref_buffer = inputBufferCr;
2062 0 : ref_stride = input_picture_ptr->stride_cr;
2063 0 : in_buffer = reconBufferCr;
2064 0 : in_stride = recon_picture_ptr->stride_cr;
2065 0 : break;
2066 : }
2067 :
    2068             :         ///CHKN: allocate one full 16-bit frame for src and recon
2069 0 : src[pli] = (uint16_t*)eb_aom_memalign(32, sizeof(*src) * mi_rows * mi_cols * MI_SIZE * MI_SIZE);
2070 0 : ref_coeff[pli] = (uint16_t*)eb_aom_memalign(32, sizeof(*ref_coeff) * mi_rows * mi_cols * MI_SIZE * MI_SIZE);
2071 :
2072 0 : int32_t subsampling_x = (pli == 0) ? 0 : 1;
2073 0 : int32_t subsampling_y = (pli == 0) ? 0 : 1;
2074 :
2075 0 : xdec[pli] = subsampling_x; //CHKN xd->plane[pli].subsampling_x;
2076 0 : ydec[pli] = subsampling_y; //CHKN xd->plane[pli].subsampling_y;
2077 0 : bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
2078 0 : : (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
2079 :
2080 0 : stride[pli] = cm->mi_cols << MI_SIZE_LOG2;
2081 0 : mi_wide_l2[pli] = MI_SIZE_LOG2 - subsampling_x; //CHKN MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
2082 0 : mi_high_l2[pli] = MI_SIZE_LOG2 - subsampling_y; //CHKN MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
2083 :
2084 0 : const int32_t frame_height = (cm->mi_rows * MI_SIZE) >> subsampling_y;//CHKN xd->plane[pli].subsampling_y;
2085 0 : const int32_t frame_width = (cm->mi_cols * MI_SIZE) >> subsampling_x;//CHKN xd->plane[pli].subsampling_x;
2086 :
2087 0 : for (r = 0; r < frame_height; ++r) {
2088 0 : for (c = 0; c < frame_width; ++c) {
2089 : //if (cm->use_highbitdepth) {
2090 : // src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
2091 : // xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
2092 : // ref_coeff[pli][r * stride[pli] + c] =
2093 : // CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
2094 : //}
2095 : //else
2096 : {
2097 0 : src[pli][r * stride[pli] + c] = in_buffer[r * in_stride + c];//CHKN xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
2098 0 : ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c];
2099 : }
2100 : }
2101 : }
2102 : }
2103 :
2104 0 : in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
2105 0 : sb_count = 0;
2106 0 : for (fbr = 0; fbr < nvfb; ++fbr) {
2107 0 : for (fbc = 0; fbc < nhfb; ++fbc) {
2108 : int32_t nvb, nhb;
2109 : int32_t gi;
2110 0 : int32_t dirinit = 0;
2111 0 : nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
2112 0 : nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
    2113           0 :             int32_t hb_step = 1; //CHKN these should always be 1 when only 64x64 LCUs are used
2114 0 : int32_t vb_step = 1;
2115 0 : BlockSize bs = BLOCK_64X64;
2116 0 : ModeInfo **mi = picture_control_set_ptr->mi_grid_base + MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc;
2117 0 : const MbModeInfo *mbmi = &mi[0]->mbmi;
2118 :
2119 : //MbModeInfo *const mbmi =
2120 : // cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
2121 : // MI_SIZE_64X64 * fbc];
2122 :
2123 0 : if (((fbc & 1) &&
2124 0 : (mbmi->block_mi.sb_type == BLOCK_128X128 || mbmi->block_mi.sb_type == BLOCK_128X64)) ||
2125 0 : ((fbr & 1) &&
2126 0 : (mbmi->block_mi.sb_type == BLOCK_128X128 || mbmi->block_mi.sb_type == BLOCK_64X128)))
2127 0 : continue;
2128 0 : if (mbmi->block_mi.sb_type == BLOCK_128X128 || mbmi->block_mi.sb_type == BLOCK_128X64 ||
2129 0 : mbmi->block_mi.sb_type == BLOCK_64X128)
2130 0 : bs = mbmi->block_mi.sb_type;
2131 0 : if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
2132 0 : nhb = AOMMIN(MI_SIZE_128X128, cm->mi_cols - MI_SIZE_64X64 * fbc);
2133 0 : hb_step = 2;
2134 : }
2135 0 : if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
2136 0 : nvb = AOMMIN(MI_SIZE_128X128, cm->mi_rows - MI_SIZE_64X64 * fbr);
2137 0 : vb_step = 2;
2138 : }
2139 :
2140 : // No filtering if the entire filter block is skipped
2141 0 : if (eb_sb_all_skip(picture_control_set_ptr, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
2142 0 : continue;
2143 :
2144 0 : cdef_count = eb_sb_compute_cdef_list(picture_control_set_ptr, cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
2145 :
2146 0 : for (pli = 0; pli < num_planes; pli++) {
2147 0 : for (i = 0; i < CDEF_INBUF_SIZE; i++)
2148 0 : inbuf[i] = CDEF_VERY_LARGE;
2149 0 : for (gi = start_gi; gi < end_gi; gi++) {
2150 : int32_t threshold;
2151 : uint64_t curr_mse;
2152 : int32_t sec_strength;
2153 0 : threshold = gi / CDEF_SEC_STRENGTHS;
2154 0 : if (fast) threshold = priconv[threshold];
2155 : /* We avoid filtering the pixels for which some of the pixels to
2156 : average are outside the frame. We could change the filter instead, but it would add special cases for any future vectorization. */
2157 0 : int32_t yoff = CDEF_VBORDER * (fbr != 0);
2158 0 : int32_t xoff = CDEF_HBORDER * (fbc != 0);
2159 0 : int32_t ysize = (nvb << mi_high_l2[pli]) + CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff;
2160 0 : int32_t xsize = (nhb << mi_wide_l2[pli]) + CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff;
2161 0 : sec_strength = gi % CDEF_SEC_STRENGTHS;
2162 :
2163 0 : copy_sb16_16(
2164 0 : &in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
2165 0 : src[pli],
2166 0 : (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
2167 0 : (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
2168 : stride[pli], ysize, xsize);
2169 :
2170 0 : eb_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli],
2171 : dir, &dirinit, var, pli, dlist, cdef_count, threshold,
2172 0 : sec_strength + (sec_strength == 3), pri_damping,
2173 : sec_damping, coeff_shift);
2174 :
2175 0 : curr_mse = eb_compute_cdef_dist(
2176 0 : ref_coeff[pli] +
2177 0 : (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
2178 0 : (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]),
2179 0 : stride[pli], tmp_dst, dlist, cdef_count, (BlockSize)bsize[pli], coeff_shift,
2180 : pli);
2181 :
2182 0 : if (pli < 2)
2183 0 : mse[pli][sb_count][gi] = curr_mse;
2184 : else
2185 0 : mse[1][sb_count][gi] += curr_mse;
2186 :
2187 0 : sb_index[sb_count] = MI_SIZE_64X64 * fbr * picture_control_set_ptr->mi_stride + MI_SIZE_64X64 * fbc;//CHKN
2188 : }
2189 : }
2190 0 : sb_count++;
2191 : }
2192 : }
2193 :
2194 0 : nb_strength_bits = 0;
2195 : /* Search for different number of signalling bits. */
2196 0 : for (i = 0; i <= 3; i++) {
2197 : int32_t j;
2198 : int32_t best_lev0[CDEF_MAX_STRENGTHS];
2199 0 : int32_t best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
2200 0 : nb_strengths = 1 << i;
2201 0 : if (num_planes >= 3)
2202 0 : tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, mse, sb_count, fast, start_gi, end_gi);
2203 : else
2204 0 : tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count, fast, start_gi, end_gi);
2205 : /* Count superblock signalling cost. */
2206 0 : tot_mse += (uint64_t)(sb_count * lambda * i);
2207 : /* Count header signalling cost. */
2208 0 : tot_mse += (uint64_t)(nb_strengths * lambda * CDEF_STRENGTH_BITS);
2209 0 : if (tot_mse < best_tot_mse) {
2210 0 : best_tot_mse = tot_mse;
2211 0 : nb_strength_bits = i;
2212 0 : for (j = 0; j < 1 << nb_strength_bits; j++) {
2213 0 : frm_hdr->CDEF_params.cdef_y_strength[j] = best_lev0[j];
2214 0 : frm_hdr->CDEF_params.cdef_uv_strength[j] = best_lev1[j];
2215 : }
2216 : }
2217 : }
2218 0 : nb_strengths = 1 << nb_strength_bits;
2219 :
2220 0 : /*cm*/frm_hdr->CDEF_params.cdef_bits = nb_strength_bits;
2221 0 : /*cm*/pPcs->nb_cdef_strengths = nb_strengths;
2222 0 : for (i = 0; i < sb_count; i++) {
2223 : int32_t gi;
2224 : int32_t best_gi;
2225 0 : uint64_t best_mse = (uint64_t)1 << 63;
2226 0 : best_gi = 0;
2227 0 : for (gi = 0; gi < /*cm*/pPcs->nb_cdef_strengths; gi++) {
2228 0 : uint64_t curr = mse[0][i][/*cm*/frm_hdr->CDEF_params.cdef_y_strength[gi]];
2229 0 : if (num_planes >= 3) curr += mse[1][i][/*cm*/frm_hdr->CDEF_params.cdef_uv_strength[gi]];
2230 0 : if (curr < best_mse) {
2231 0 : best_gi = gi;
2232 0 : best_mse = curr;
2233 : }
2234 : }
2235 0 : selected_strength[i] = best_gi;
2236 0 : selected_strength_cnt[best_gi]++;
2237 : //CHKN cm->mi_grid_visible[sb_index[i]]->cdef_strength = best_gi;
2238 0 : picture_control_set_ptr->mi_grid_base[sb_index[i]]->mbmi.cdef_strength = (int8_t)best_gi;
    2239             :         //in case the fb is within a 128x128, 128x64, or 64x128 block, we generate the param only for the first 64x64,
    2240             :         //and since our mi map does not have the multi-pointer single-data assignment, we need to duplicate the data.
2241 0 : BlockSize sb_type = picture_control_set_ptr->mi_grid_base[sb_index[i]]->mbmi.block_mi.sb_type;
2242 :
2243 0 : if (sb_type == BLOCK_128X128)
2244 : {
2245 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64]->mbmi.cdef_strength = (int8_t)best_gi;
2246 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * picture_control_set_ptr->mi_stride]->mbmi.cdef_strength = (int8_t)best_gi;
2247 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * picture_control_set_ptr->mi_stride + MI_SIZE_64X64]->mbmi.cdef_strength = (int8_t)best_gi;
2248 : }
2249 0 : else if (sb_type == BLOCK_128X64)
2250 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64]->mbmi.cdef_strength = (int8_t)best_gi;
2251 0 : else if (sb_type == BLOCK_64X128)
2252 0 : picture_control_set_ptr->mi_grid_base[sb_index[i] + MI_SIZE_64X64 * picture_control_set_ptr->mi_stride]->mbmi.cdef_strength = (int8_t)best_gi;
2253 : //ModeInfo *miPtr = *(picture_control_set_ptr->mi_grid_base + sb_index[i]);
2254 : //uint8_t miX, miY;
2255 : //for (miY = 0; miY < (block_size_high[sb_type] >> MI_SIZE_LOG2); miY++) {
2256 : // for (miX = 0; miX < (block_size_wide[sb_type] >> MI_SIZE_LOG2); miX++) {
2257 : // miPtr[miX + miY * picture_control_set_ptr->mi_stride].mbmi.cdef_strength = (int8_t)best_gi;
2258 : // }
2259 : //}
2260 : }
2261 :
2262 0 : if (fast) {
2263 0 : for (int32_t j = 0; j < nb_strengths; j++) {
2264 0 : frm_hdr->CDEF_params.cdef_y_strength[j] = priconv[frm_hdr->CDEF_params.cdef_y_strength[j] / CDEF_SEC_STRENGTHS] * CDEF_SEC_STRENGTHS + (frm_hdr->CDEF_params.cdef_y_strength[j] % CDEF_SEC_STRENGTHS);
2265 0 : frm_hdr->CDEF_params.cdef_uv_strength[j] = priconv[frm_hdr->CDEF_params.cdef_uv_strength[j] / CDEF_SEC_STRENGTHS] * CDEF_SEC_STRENGTHS + (frm_hdr->CDEF_params.cdef_uv_strength[j] % CDEF_SEC_STRENGTHS);
2266 : }
2267 : }
2268 0 : frm_hdr->CDEF_params.cdef_damping = pri_damping;
2269 : //pPcs->cdef_pri_damping = pri_damping;
2270 : //pPcs->cdef_sec_damping = sec_damping;
2271 :
2272 0 : for (int i = 0; i < total_strengths; i++)
2273 0 : best_frame_gi_cnt += selected_strength_cnt[i] > best_frame_gi_cnt ? 1 : 0;
2274 0 : pPcs->cdef_frame_strength = ((best_frame_gi_cnt + 4) / 4) * 4;
2275 :
2276 0 : eb_aom_free(mse[0]);
2277 0 : eb_aom_free(mse[1]);
2278 0 : for (pli = 0; pli < num_planes; pli++) {
2279 0 : eb_aom_free(src[pli]);
2280 0 : eb_aom_free(ref_coeff[pli]);
2281 : }
2282 0 : eb_aom_free(sb_index);
2283 0 : eb_aom_free(selected_strength);
2284 0 : }
|