Line data Source code
1 : /*
2 : * Copyright(c) 2019 Netflix, Inc.
3 : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
4 : */
5 : /*
6 : * Copyright(c) 2019 Intel Corporation
7 : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
8 : */
9 : /*
10 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
11 : *
12 : * This source code is subject to the terms of the BSD 2 Clause License and
13 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
14 : * was not distributed with this source code in the LICENSE file, you can
15 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
16 : * Media Patent License 1.0 was not distributed with this source code in the
17 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
18 : */
19 :
20 : #include <stdlib.h>
21 : #include <stdio.h>
22 : #include <string.h>
23 : #include <assert.h>
24 : #include "EbTemporalFiltering.h"
25 : #include "EbComputeSAD.h"
26 : #include "EbMotionEstimation.h"
27 : #include "EbMotionEstimationProcess.h"
28 : #include "EbMotionEstimationContext.h"
29 : #include "EbDefinitions.h"
30 : #include "EbLambdaRateTables.h"
31 : #include "EbPictureAnalysisProcess.h"
32 : #include "EbMcp.h"
33 : #include "av1me.h"
34 : #include "EbTemporalFiltering_sse4.h"
35 : #include "EbObject.h"
36 : #include "EbPictureOperators.h"
37 : #include "EbInterPrediction.h"
38 : #include "aom_dsp_rtcd.h"
39 : #include "EbComputeVariance_C.h"
40 :
41 : #undef _MM_HINT_T2
42 : #define _MM_HINT_T2 1
43 :
44 : static EB_AV1_INTER_PREDICTION_FUNC_PTR av1_inter_prediction_function_table[2] =
45 : {
46 : av1_inter_prediction,
47 : av1_inter_prediction_hbd
48 : };
49 :
// Fixed-point reciprocal tables used by adjust_modifier()/adjust_modifier_highbd()
// to evaluate "(sum_dist / index) * 3" without a division.
// Non-zero entries equal ceil(3 * 2^16 / index); entries left at 0 are indexes
// the modifier functions assert against.
static unsigned int index_mult[14] = {
    [4]  = 49152,
    [5]  = 39322,
    [6]  = 32768,
    [7]  = 28087,
    [8]  = 24576,
    [9]  = 21846,
    [10] = 19661,
    [11] = 17874,
    [13] = 15124,
};

// Same table for the high bit-depth path: non-zero entries equal
// ceil(3 * 2^32 / index), consumed with a 32-bit right shift.
static int64_t index_mult_highbd[14] = {
    [4]  = 3221225472U,
    [5]  = 2576980378U,
    [6]  = 2147483648U,
    [7]  = 1840700270U,
    [8]  = 1610612736U,
    [9]  = 1431655766U,
    [10] = 1288490189U,
    [11] = 1171354718U,
    [13] = 991146300U,
};
59 :
60 : // relationship between pu_index and row and col of the 32x32 sub-blocks
61 : static const uint32_t subblock_xy_32x32[4][2] = { {0,0}, {0,1}, {1,0}, {1,1} };
62 :
63 : static const uint32_t subblock_xy_16x16[N_16X16_BLOCKS][2] = { {0,0}, {0,1}, {0,2}, {0,3},
64 : {1,0}, {1,1}, {1,2}, {1,3},
65 : {2,0}, {2,1}, {2,2}, {2,3},
66 : {3,0}, {3,1}, {3,2}, {3,3} };
67 :
68 : static const uint32_t subblocks_from32x32_to_16x16[N_16X16_BLOCKS] = { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 };
69 :
70 : static const uint32_t index_16x16_from_subindexes[4][4] = { {0, 1, 4, 5}, {2, 3, 6, 7}, {8, 9, 12, 13}, {10, 11, 14, 15} };
71 :
72 : extern aom_variance_fn_ptr_t mefn_ptr[BlockSizeS_ALL];
73 :
74 : #if DEBUG_TF
75 : // save YUV to file - auxiliary function for debug
76 : void save_YUV_to_file(char *filename, EbByte buffer_y, EbByte buffer_u, EbByte buffer_v,
77 : uint16_t width, uint16_t height,
78 : uint16_t stride_y, uint16_t stride_u, uint16_t stride_v,
79 : uint16_t origin_y, uint16_t origin_x,
80 : uint32_t ss_x, uint32_t ss_y){
81 : FILE *fid = NULL;
82 : EbByte pic_point;
83 : int h;
84 :
85 : // save current source picture to a YUV file
86 : FOPEN(fid, filename, "wb");
87 :
88 : if (!fid){
89 : printf("Unable to open file %s to write.\n", "temp_picture.yuv");
90 : }else{
91 : // the source picture saved in the enchanced_picture_ptr contains a border in x and y dimensions
92 : pic_point = buffer_y + (origin_y*stride_y) + origin_x;
93 : for (h = 0; h < height; h++) {
94 : fwrite(pic_point, 1, (size_t)width, fid);
95 : pic_point = pic_point + stride_y;
96 : }
97 : pic_point = buffer_u + ((origin_y >> ss_y)*stride_u) + (origin_x >> ss_x);
98 : for (h = 0; h < height >> ss_y; h++) {
99 : fwrite(pic_point, 1, (size_t)width >> ss_x, fid);
100 : pic_point = pic_point + stride_u;
101 : }
102 : pic_point = buffer_v + ((origin_y >> ss_y)*stride_v) + (origin_x >> ss_x);
103 : for (h = 0; h < height >> ss_y; h++) {
104 : fwrite(pic_point, 1, (size_t)width >> ss_x, fid);
105 : pic_point = pic_point + stride_v;
106 : }
107 : fclose(fid);
108 : }
109 : }
110 :
// Debug helper: write the visible area of a 16-bit-per-sample planar YUV picture to `filename`.
//   buffer_y/u/v    - plane base pointers (including any border)
//   width, height   - luma dimensions of the area to write, in samples
//   stride_y/u/v    - plane strides in samples
//   origin_y/x      - luma offset of the visible area inside the padded buffer
//   ss_x, ss_y      - chroma subsampling shifts applied to dimensions and origin
// Prints a message and writes nothing if the file cannot be opened.
void save_YUV_to_file_highbd(char *filename, uint16_t* buffer_y, uint16_t* buffer_u, uint16_t* buffer_v,
                             uint16_t width, uint16_t height,
                             uint16_t stride_y, uint16_t stride_u, uint16_t stride_v,
                             uint16_t origin_y, uint16_t origin_x,
                             uint32_t ss_x, uint32_t ss_y){
    FILE *fid = NULL;
    uint16_t *pic_point;
    int h;

    FOPEN(fid, filename, "wb");

    if (!fid){
        // report the path that actually failed, not a fixed placeholder name
        printf("Unable to open file %s to write.\n", filename);
        return;
    }

    // the buffers contain a border in x and y: skip it to reach the first visible sample
    pic_point = buffer_y + ((size_t)origin_y * stride_y) + origin_x;
    for (h = 0; h < height; h++) {
        fwrite(pic_point, 2, (size_t)width, fid);
        pic_point = pic_point + stride_y;
    }

    pic_point = buffer_u + ((size_t)(origin_y >> ss_y) * stride_u) + (origin_x >> ss_x);
    for (h = 0; h < height >> ss_y; h++) {
        fwrite(pic_point, 2, (size_t)width >> ss_x, fid);
        pic_point = pic_point + stride_u;
    }

    pic_point = buffer_v + ((size_t)(origin_y >> ss_y) * stride_v) + (origin_x >> ss_x);
    for (h = 0; h < height >> ss_y; h++) {
        fwrite(pic_point, 2, (size_t)width >> ss_x, fid);
        pic_point = pic_point + stride_v;
    }

    fclose(fid);
}
147 : #endif
148 :
149 0 : static void pack_highbd_pic(EbPictureBufferDesc *pic_ptr,
150 : uint16_t *buffer_16bit[3],
151 : uint32_t ss_x,
152 : uint32_t ss_y,
153 : EbBool include_padding)
154 : {
155 :
156 0 : uint32_t input_y_offset = 0;
157 0 : uint32_t input_bit_inc_y_offset = 0;
158 0 : uint32_t input_cb_offset = 0;
159 0 : uint32_t input_bit_inc_cb_offset = 0;
160 0 : uint32_t input_cr_offset = 0;
161 0 : uint32_t input_bit_inc_cr_offset = 0;
162 0 : uint16_t width = pic_ptr->stride_y;
163 0 : uint16_t height = (uint16_t)(pic_ptr->origin_y*2 + pic_ptr->height);
164 :
165 0 : if(!include_padding){
166 0 : input_y_offset = ((pic_ptr->origin_y) * pic_ptr->stride_y) + (pic_ptr->origin_x);
167 0 : input_bit_inc_y_offset = ((pic_ptr->origin_y) * pic_ptr->stride_bit_inc_y) + (pic_ptr->origin_x);
168 0 : input_cb_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_cb) + ((pic_ptr->origin_x) >> ss_x);
169 0 : input_bit_inc_cb_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_bit_inc_cb) + ((pic_ptr->origin_x) >> ss_x);
170 0 : input_cr_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_cr) + ((pic_ptr->origin_x) >> ss_x);
171 0 : input_bit_inc_cr_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_bit_inc_cr) + ((pic_ptr->origin_x) >> ss_x);
172 :
173 0 : width = pic_ptr->width;
174 0 : height = pic_ptr->height;
175 : }
176 :
177 0 : pack2d_src(pic_ptr->buffer_y + input_y_offset,
178 0 : pic_ptr->stride_y,
179 0 : pic_ptr->buffer_bit_inc_y + input_bit_inc_y_offset,
180 0 : pic_ptr->stride_bit_inc_y,
181 : buffer_16bit[C_Y],
182 0 : pic_ptr->stride_y,
183 : width,
184 : height);
185 :
186 0 : pack2d_src(pic_ptr->buffer_cb + input_cb_offset,
187 0 : pic_ptr->stride_cb,
188 0 : pic_ptr->buffer_bit_inc_cb + input_bit_inc_cb_offset,
189 0 : pic_ptr->stride_bit_inc_cb,
190 0 : buffer_16bit[C_U],
191 0 : pic_ptr->stride_cb,
192 0 : width >> ss_x,
193 0 : height >> ss_y);
194 :
195 0 : pack2d_src(pic_ptr->buffer_cr + input_cr_offset,
196 0 : pic_ptr->stride_cr,
197 0 : pic_ptr->buffer_bit_inc_cr + input_bit_inc_cr_offset,
198 0 : pic_ptr->stride_bit_inc_cr,
199 0 : buffer_16bit[C_V],
200 0 : pic_ptr->stride_cr,
201 0 : width >> ss_x,
202 0 : height >> ss_y);
203 :
204 0 : }
205 :
206 0 : static void unpack_highbd_pic(uint16_t *buffer_highbd[3],
207 : EbPictureBufferDesc *pic_ptr,
208 : uint32_t ss_x,
209 : uint32_t ss_y,
210 : EbBool include_padding)
211 : {
212 :
213 0 : uint32_t input_y_offset = 0;
214 0 : uint32_t input_bit_inc_y_offset = 0;
215 0 : uint32_t input_cb_offset = 0;
216 0 : uint32_t input_bit_inc_cb_offset = 0;
217 0 : uint32_t input_cr_offset = 0;
218 0 : uint32_t input_bit_inc_cr_offset = 0;
219 0 : uint16_t width = pic_ptr->stride_y;
220 0 : uint16_t height = (uint16_t)(pic_ptr->origin_y*2 + pic_ptr->height);
221 :
222 0 : if(!include_padding){
223 0 : input_y_offset = ((pic_ptr->origin_y) * pic_ptr->stride_y) + (pic_ptr->origin_x);
224 0 : input_bit_inc_y_offset = ((pic_ptr->origin_y) * pic_ptr->stride_bit_inc_y) + (pic_ptr->origin_x);
225 0 : input_cb_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_cb) + ((pic_ptr->origin_x) >> ss_x);
226 0 : input_bit_inc_cb_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_bit_inc_cb) + ((pic_ptr->origin_x) >> ss_x);
227 0 : input_cr_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_cr) + ((pic_ptr->origin_x) >> ss_x);
228 0 : input_bit_inc_cr_offset = (((pic_ptr->origin_y) >> ss_y) * pic_ptr->stride_bit_inc_cr) + ((pic_ptr->origin_x) >> ss_x);
229 :
230 0 : width = pic_ptr->width;
231 0 : height = pic_ptr->height;
232 : }
233 :
234 0 : un_pack2d(buffer_highbd[C_Y],
235 0 : pic_ptr->stride_y,
236 0 : pic_ptr->buffer_y + input_y_offset,
237 0 : pic_ptr->stride_y,
238 0 : pic_ptr->buffer_bit_inc_y + input_bit_inc_y_offset,
239 0 : pic_ptr->stride_bit_inc_y,
240 : width,
241 : height);
242 :
243 0 : un_pack2d(buffer_highbd[C_U],
244 0 : pic_ptr->stride_cb,
245 0 : pic_ptr->buffer_cb + input_cb_offset,
246 0 : pic_ptr->stride_cb,
247 0 : pic_ptr->buffer_bit_inc_cb + input_bit_inc_cb_offset,
248 0 : pic_ptr->stride_bit_inc_cb,
249 0 : width >> ss_x,
250 0 : height >> ss_y);
251 :
252 0 : un_pack2d(buffer_highbd[C_V],
253 0 : pic_ptr->stride_cr,
254 0 : pic_ptr->buffer_cr + input_cr_offset,
255 0 : pic_ptr->stride_cr,
256 0 : pic_ptr->buffer_bit_inc_cr + input_bit_inc_cr_offset,
257 0 : pic_ptr->stride_bit_inc_cr,
258 0 : width >> ss_x,
259 0 : height >> ss_y);
260 0 : }
261 :
262 48 : void generate_padding_pic(EbPictureBufferDesc *pic_ptr,
263 : uint32_t ss_x,
264 : uint32_t ss_y,
265 : EbBool is_highbd){
266 :
267 48 : if(!is_highbd){
268 48 : generate_padding(pic_ptr->buffer_cb,
269 48 : pic_ptr->stride_cb,
270 48 : pic_ptr->width >> ss_x,
271 48 : pic_ptr->height >> ss_y,
272 48 : pic_ptr->origin_x >> ss_x,
273 48 : pic_ptr->origin_y >> ss_y);
274 :
275 48 : generate_padding(pic_ptr->buffer_cr,
276 48 : pic_ptr->stride_cr,
277 48 : pic_ptr->width >> ss_x,
278 48 : pic_ptr->height >> ss_y,
279 48 : pic_ptr->origin_x >> ss_x,
280 48 : pic_ptr->origin_y >> ss_y);
281 : }else{
282 0 : generate_padding(pic_ptr->buffer_cb,
283 0 : pic_ptr->stride_cb,
284 0 : pic_ptr->width >> ss_x,
285 0 : pic_ptr->height >> ss_y,
286 0 : pic_ptr->origin_x >> ss_x,
287 0 : pic_ptr->origin_y >> ss_y);
288 :
289 0 : generate_padding(pic_ptr->buffer_cr,
290 0 : pic_ptr->stride_cr,
291 0 : pic_ptr->width >> ss_x,
292 0 : pic_ptr->height >> ss_y,
293 0 : pic_ptr->origin_x >> ss_x,
294 0 : pic_ptr->origin_y >> ss_y);
295 :
296 0 : generate_padding(pic_ptr->buffer_bit_inc_cb,
297 0 : pic_ptr->stride_cr,
298 0 : pic_ptr->width >> ss_x,
299 0 : pic_ptr->height >> ss_y,
300 0 : pic_ptr->origin_x >> ss_x,
301 0 : pic_ptr->origin_y >> ss_y);
302 :
303 0 : generate_padding(pic_ptr->buffer_bit_inc_cr,
304 0 : pic_ptr->stride_cr,
305 0 : pic_ptr->width >> ss_x,
306 0 : pic_ptr->height >> ss_y,
307 0 : pic_ptr->origin_x >> ss_x,
308 0 : pic_ptr->origin_y >> ss_y);
309 : }
310 48 : }
311 :
// Set the first `nelements` entries of `list` to `value`.
// Does nothing when nelements <= 0.
static void populate_list_with_value(int *list,
                                     int nelements,
                                     const int value){
    int remaining = nelements;
    int *slot = list;

    while (remaining > 0) {
        *slot++ = value;
        --remaining;
    }
}
319 :
320 : // get block filter weights using a distance metric
321 2880 : static void get_blk_fw_using_dist(int const *me_32x32_subblock_vf,
322 : int const *me_16x16_subblock_vf,
323 : EbBool use_16x16_subblocks_only,
324 : int *blk_fw,
325 : EbBool is_highbd){
326 : uint32_t blk_idx, idx_32x32;
327 :
328 2880 : int me_sum_16x16_subblock_vf[4] = {0};
329 2880 : int max_me_vf[4] = {INT_MIN_TF, INT_MIN_TF, INT_MIN_TF, INT_MIN_TF}, min_me_vf[4] = {INT_MAX_TF, INT_MAX_TF, INT_MAX_TF, INT_MAX_TF};
330 :
331 : int threshold_low, threshold_high;
332 :
333 2880 : if(!is_highbd){
334 2880 : threshold_low = THRES_LOW;
335 2880 : threshold_high = THRES_HIGH;
336 : }else{
337 0 : threshold_low = THRES_LOW*16;
338 0 : threshold_high = THRES_HIGH*16;
339 : }
340 :
341 2880 : if(use_16x16_subblocks_only) {
342 14400 : for (idx_32x32 = 0; idx_32x32 < 4; idx_32x32++) {
343 : // split into 16x16 sub-blocks
344 :
345 195840 : for (blk_idx = 0; blk_idx < N_16X16_BLOCKS; blk_idx++) {
346 184320 : if (subblocks_from32x32_to_16x16[blk_idx] == idx_32x32) {
347 46080 : blk_fw[blk_idx] = me_16x16_subblock_vf[blk_idx] < threshold_low
348 : ? 2
349 46080 : : me_16x16_subblock_vf[blk_idx] < threshold_high ? 1 : 0;
350 : }
351 : }
352 : }
353 : }else {
354 0 : for (blk_idx = 0; blk_idx < N_16X16_BLOCKS; blk_idx++) {
355 0 : idx_32x32 = subblocks_from32x32_to_16x16[blk_idx];
356 :
357 0 : if (min_me_vf[idx_32x32] > me_16x16_subblock_vf[blk_idx])
358 0 : min_me_vf[idx_32x32] = me_16x16_subblock_vf[blk_idx];
359 0 : if (max_me_vf[idx_32x32] < me_16x16_subblock_vf[blk_idx])
360 0 : max_me_vf[idx_32x32] = me_16x16_subblock_vf[blk_idx];
361 :
362 0 : me_sum_16x16_subblock_vf[idx_32x32] += me_16x16_subblock_vf[blk_idx];
363 : }
364 :
365 0 : for (idx_32x32 = 0; idx_32x32 < 4; idx_32x32++) {
366 0 : if (((me_32x32_subblock_vf[idx_32x32] * 15 < (me_sum_16x16_subblock_vf[idx_32x32] << 4)) &&
367 0 : max_me_vf - min_me_vf < THRES_DIFF_HIGH) ||
368 0 : ((me_32x32_subblock_vf[idx_32x32] * 14 < (me_sum_16x16_subblock_vf[idx_32x32] << 4)) &&
369 0 : max_me_vf - min_me_vf < THRES_DIFF_LOW)) {
370 : // split into 32x32 sub-blocks
371 :
372 0 : int weight = me_32x32_subblock_vf[idx_32x32] < (threshold_low << THR_SHIFT)
373 : ? 2
374 0 : : me_32x32_subblock_vf[idx_32x32] < (threshold_high << THR_SHIFT) ? 1 : 0;
375 :
376 0 : for (blk_idx = 0; blk_idx < N_16X16_BLOCKS; blk_idx++) {
377 0 : if (subblocks_from32x32_to_16x16[blk_idx] == idx_32x32)
378 0 : blk_fw[blk_idx] = weight;
379 : }
380 : } else {
381 : // split into 16x16 sub-blocks
382 :
383 0 : for (blk_idx = 0; blk_idx < N_16X16_BLOCKS; blk_idx++) {
384 0 : if (subblocks_from32x32_to_16x16[blk_idx] == idx_32x32) {
385 0 : blk_fw[blk_idx] = me_16x16_subblock_vf[blk_idx] < threshold_low
386 : ? 2
387 0 : : me_16x16_subblock_vf[blk_idx] < threshold_high ? 1 : 0;
388 : }
389 : }
390 : }
391 : }
392 : }
393 2880 : }
394 :
395 : // compute variance for the MC block residuals
396 2880 : static void get_ME_distortion(int *me_32x32_subblock_vf,
397 : int *me_16x16_subblock_vf,
398 : uint8_t *pred_y,
399 : int stride_pred_y,
400 : uint8_t *src_y,
401 : int stride_src_y){
402 : unsigned int sse;
403 :
404 : uint8_t * pred_y_ptr;
405 : uint8_t * src_y_ptr;
406 :
407 14400 : for(uint32_t index_32x32 = 0; index_32x32 < 4; index_32x32++) {
408 11519 : int row = subblock_xy_32x32[index_32x32][0];
409 11519 : int col = subblock_xy_32x32[index_32x32][1];
410 :
411 11519 : pred_y_ptr = pred_y + 32*row*stride_pred_y + 32*col;
412 11519 : src_y_ptr = src_y + 32*row*stride_src_y + 32*col;
413 :
414 11519 : const aom_variance_fn_ptr_t *fn_ptr = &mefn_ptr[BLOCK_32X32];
415 :
416 11519 : me_32x32_subblock_vf[index_32x32] = fn_ptr->vf(pred_y_ptr, stride_pred_y, src_y_ptr, stride_src_y, &sse );
417 : }
418 :
419 48959 : for(uint32_t index_16x16 = 0; index_16x16 < 16; index_16x16++) {
420 46078 : int row = subblock_xy_16x16[index_16x16][0];
421 46078 : int col = subblock_xy_16x16[index_16x16][1];
422 :
423 46078 : pred_y_ptr = pred_y + 16*row*stride_pred_y + 16*col;
424 46078 : src_y_ptr = src_y + 16*row*stride_src_y + 16*col;
425 :
426 46078 : const aom_variance_fn_ptr_t *fn_ptr = &mefn_ptr[BLOCK_16X16];
427 :
428 46078 : me_16x16_subblock_vf[index_16x16] = fn_ptr->vf(pred_y_ptr, stride_pred_y, src_y_ptr, stride_src_y, &sse );
429 : }
430 2881 : }
431 :
432 : // compute variance for the MC block residuals - highbd
433 0 : static void get_ME_distortion_highbd(int *me_32x32_subblock_vf,
434 : int *me_16x16_subblock_vf,
435 : uint16_t *pred_y,
436 : int stride_pred_y,
437 : uint16_t *src_y,
438 : int stride_src_y){
439 : unsigned int sse;
440 :
441 : uint16_t *pred_Y_ptr;
442 : uint16_t *src_Y_ptr;
443 :
444 0 : for(uint32_t index_32x32 = 0; index_32x32 < 4; index_32x32++) {
445 0 : int row = subblock_xy_32x32[index_32x32][0];
446 0 : int col = subblock_xy_32x32[index_32x32][1];
447 :
448 0 : pred_Y_ptr = pred_y + 32*row*stride_pred_y + 32*col;
449 0 : src_Y_ptr = src_y + 32*row*stride_src_y + 32*col;
450 :
451 0 : me_32x32_subblock_vf[index_32x32] = variance_highbd_c(pred_Y_ptr, stride_pred_y, src_Y_ptr, stride_src_y, 32, 32, &sse );
452 : }
453 :
454 0 : for(uint32_t index_16x16 = 0; index_16x16 < 16; index_16x16++) {
455 0 : int row = subblock_xy_16x16[index_16x16][0];
456 0 : int col = subblock_xy_16x16[index_16x16][1];
457 :
458 0 : pred_Y_ptr = pred_y + 16*row*stride_pred_y + 16*col;
459 0 : src_Y_ptr = src_y + 16*row*stride_src_y + 16*col;
460 :
461 0 : me_16x16_subblock_vf[index_16x16] = variance_highbd_c(pred_Y_ptr, stride_pred_y, src_Y_ptr, stride_src_y, 16, 16, &sse );
462 : }
463 0 : }
464 :
465 : // Create and initialize all necessary ME context structures
466 2880 : static void create_ME_context_and_picture_control(MotionEstimationContext_t *context_ptr,
467 : PictureParentControlSet *picture_control_set_ptr_frame,
468 : PictureParentControlSet *picture_control_set_ptr_central,
469 : EbPictureBufferDesc *input_picture_ptr_central,
470 : int blk_row,
471 : int blk_col,
472 : uint32_t ss_x,
473 : uint32_t ss_y){
474 : uint32_t lcuRow;
475 :
476 : // set reference picture for alt-refs
477 2880 : context_ptr->me_context_ptr->alt_ref_reference_ptr = (EbPaReferenceObject*)picture_control_set_ptr_frame->pa_reference_picture_wrapper_ptr->object_ptr;
478 2880 : context_ptr->me_context_ptr->me_alt_ref = EB_TRUE;
479 :
480 : // set the buffers with the original, quarter and sixteenth pixels version of the source frame
481 2880 : EbPaReferenceObject *src_object = (EbPaReferenceObject*)picture_control_set_ptr_central->pa_reference_picture_wrapper_ptr->object_ptr;
482 2880 : EbPictureBufferDesc *padded_pic_ptr = src_object->input_padded_picture_ptr;
483 2880 : SequenceControlSet *sequence_control_set_ptr = (SequenceControlSet*)picture_control_set_ptr_central->sequence_control_set_wrapper_ptr->object_ptr;
484 : // Set 1/4 and 1/16 ME reference buffer(s); filtered or decimated
485 5760 : EbPictureBufferDesc * quarter_pic_ptr = (sequence_control_set_ptr->down_sampling_method_me_search == ME_FILTERED_DOWNSAMPLED) ?
486 2880 : src_object->quarter_filtered_picture_ptr :
487 : src_object->quarter_decimated_picture_ptr;
488 :
489 5760 : EbPictureBufferDesc *sixteenth_pic_ptr = (sequence_control_set_ptr->down_sampling_method_me_search == ME_FILTERED_DOWNSAMPLED) ?
490 2880 : src_object->sixteenth_filtered_picture_ptr :
491 : src_object->sixteenth_decimated_picture_ptr;
492 : // Parts from MotionEstimationKernel()
493 2880 : uint32_t sb_origin_x = (uint32_t)(blk_col * BW);
494 2880 : uint32_t sb_origin_y = (uint32_t)(blk_row * BH);
495 :
496 2880 : uint32_t sb_width = (input_picture_ptr_central->width - sb_origin_x) < BLOCK_SIZE_64 ? input_picture_ptr_central->width - sb_origin_x : BLOCK_SIZE_64;
497 2880 : uint32_t sb_height = (input_picture_ptr_central->height - sb_origin_y) < BLOCK_SIZE_64 ? input_picture_ptr_central->height - sb_origin_y : BLOCK_SIZE_64;
498 :
499 : // Load the SB from the input to the intermediate SB buffer
500 2880 : int bufferIndex = (input_picture_ptr_central->origin_y + sb_origin_y) * input_picture_ptr_central->stride_y + input_picture_ptr_central->origin_x + sb_origin_x;
501 :
502 : // set search type
503 2880 : context_ptr->me_context_ptr->hme_search_type = HME_RECTANGULAR;
504 :
505 : // set search method
506 2880 : context_ptr->me_context_ptr->hme_search_method = FULL_SAD_SEARCH;
507 :
508 : // set Lambda
509 2880 : context_ptr->me_context_ptr->lambda = lambda_mode_decision_ra_sad[picture_control_set_ptr_central->picture_qp];
510 :
511 : // populate src block buffers: sb_buffer, quarter_sb_buffer and sixteenth_sb_buffer
512 187119 : for (lcuRow = 0; lcuRow < BLOCK_SIZE_64; lcuRow++) {
513 184244 : EB_MEMCPY((&(context_ptr->me_context_ptr->sb_buffer[lcuRow * BLOCK_SIZE_64])), (&(input_picture_ptr_central->buffer_y[bufferIndex + lcuRow * input_picture_ptr_central->stride_y])), BLOCK_SIZE_64 * sizeof(uint8_t));
514 : }
515 :
516 : {
517 2875 : uint8_t * src_ptr = &(padded_pic_ptr->buffer_y[bufferIndex]);
518 :
519 : //_MM_HINT_T0 //_MM_HINT_T1 //_MM_HINT_T2 //_MM_HINT_NTA
520 : uint32_t i;
521 175670 : for (i = 0; i < sb_height; i++)
522 : {
523 172799 : char const* p = (char const*)(src_ptr + i * padded_pic_ptr->stride_y);
524 172799 : _mm_prefetch(p, _MM_HINT_T2);
525 : }
526 : }
527 :
528 2871 : context_ptr->me_context_ptr->sb_src_ptr = &(padded_pic_ptr->buffer_y[bufferIndex]);
529 2871 : context_ptr->me_context_ptr->sb_src_stride = padded_pic_ptr->stride_y;
530 :
531 : // Load the 1/4 decimated SB from the 1/4 decimated input to the 1/4 intermediate SB buffer
532 2871 : bufferIndex = (quarter_pic_ptr->origin_y + (sb_origin_y >> ss_y)) * quarter_pic_ptr->stride_y + quarter_pic_ptr->origin_x + (sb_origin_x >> ss_x);
533 :
534 89271 : for (lcuRow = 0; lcuRow < (sb_height >> ss_y); lcuRow++) {
535 86400 : EB_MEMCPY((&(context_ptr->me_context_ptr->quarter_sb_buffer[lcuRow * context_ptr->me_context_ptr->quarter_sb_buffer_stride])), (&(quarter_pic_ptr->buffer_y[bufferIndex + lcuRow * quarter_pic_ptr->stride_y])), (sb_width >> ss_x) * sizeof(uint8_t));
536 : }
537 :
538 : // Load the 1/16 decimated SB from the 1/16 decimated input to the 1/16 intermediate SB buffer
539 2871 : bufferIndex = (sixteenth_pic_ptr->origin_y + (sb_origin_y >> 2)) * sixteenth_pic_ptr->stride_y + sixteenth_pic_ptr->origin_x + (sb_origin_x >> 2);
540 :
541 : {
542 2871 : uint8_t *framePtr = &(sixteenth_pic_ptr->buffer_y[bufferIndex]);
543 2871 : uint8_t *localPtr = context_ptr->me_context_ptr->sixteenth_sb_buffer;
544 :
545 2871 : if (context_ptr->me_context_ptr->hme_search_method == FULL_SAD_SEARCH) {
546 46080 : for (lcuRow = 0; lcuRow < (sb_height >> 2); lcuRow += 1) {
547 43200 : EB_MEMCPY(localPtr, framePtr, (sb_width >> 2) * sizeof(uint8_t));
548 43200 : localPtr += 16;
549 43200 : framePtr += sixteenth_pic_ptr->stride_y;
550 : }
551 : }
552 : else {
553 0 : for (lcuRow = 0; lcuRow < (sb_height >> 2); lcuRow += 2) {
554 0 : EB_MEMCPY(localPtr, framePtr, (sb_width >> 2) * sizeof(uint8_t));
555 0 : localPtr += 16;
556 0 : framePtr += sixteenth_pic_ptr->stride_y << 1;
557 : }
558 : }
559 : }
560 2871 : }
561 :
// Return the filter weight of the sub-block containing pixel (y, x) when the
// block is split into a 4x4 grid of sub-blocks. blk_fw holds the 16 weights in
// raster order. Positions at or beyond three quarters of a dimension fall in
// the last row / column of the grid.
static INLINE int get_subblock_filter_weight_16subblocks(unsigned int y,
                                                         unsigned int x,
                                                         unsigned int block_height,
                                                         unsigned int block_width,
                                                         const int *blk_fw) {
    const unsigned int quarter_w = block_width / 4;
    const unsigned int quarter_h = block_height / 4;

    // advance while the position lies at or beyond the next quarter boundary
    unsigned int grid_row = 0;
    while (grid_row < 3 && y >= quarter_h * (grid_row + 1))
        grid_row++;

    unsigned int grid_col = 0;
    while (grid_col < 3 && x >= quarter_w * (grid_col + 1))
        grid_col++;

    return blk_fw[grid_row * 4 + grid_col];
}
612 :
// Return the filter weight of the sub-block containing pixel (y, x) when the
// block is split into a 2x2 grid of sub-blocks. blk_fw holds the 4 weights in
// raster order (top-left, top-right, bottom-left, bottom-right).
static INLINE int get_subblock_filter_weight_4subblocks(unsigned int y,
                                                        unsigned int x,
                                                        unsigned int block_height,
                                                        unsigned int block_width,
                                                        const int *blk_fw) {
    const unsigned int bottom_half = (y >= block_height / 2) ? 1u : 0u;
    const unsigned int right_half  = (x >= block_width / 2) ? 1u : 0u;

    return blk_fw[2 * bottom_half + right_half];
}
633 :
634 : // Adjust value of the modified (weight of filtering) based on the distortion and strength parameter
635 0 : static INLINE int adjust_modifier(int sum_dist,
636 : int index,
637 : int rounding,
638 : int strength,
639 : int filter_weight) {
640 0 : assert(index >= 0 && index <= 13);
641 0 : assert(index_mult[index] != 0);
642 :
643 : //mod = (sum_dist / index) * 3;
644 0 : int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
645 :
646 0 : mod += rounding;
647 0 : mod >>= strength;
648 :
649 0 : mod = AOMMIN(16, mod);
650 :
651 0 : mod = 16 - mod;
652 0 : mod *= filter_weight;
653 :
654 0 : return mod;
655 : }
656 :
657 : // Adjust value of the modified (weight of filtering) based on the distortion and strength parameter - highbd
658 0 : static INLINE int adjust_modifier_highbd(int64_t sum_dist,
659 : int index,
660 : int rounding,
661 : int strength,
662 : int filter_weight) {
663 0 : assert(index >= 0 && index <= 13);
664 0 : assert(index_mult_highbd[index] != 0);
665 :
666 : //mod = (sum_dist / index) * 3;
667 0 : int mod = (int)((AOMMIN(sum_dist, INT32_MAX) * index_mult_highbd[index]) >> 32);
668 :
669 0 : mod += rounding;
670 0 : mod >>= strength;
671 :
672 0 : mod = AOMMIN(16, mod);
673 :
674 0 : mod = 16 - mod;
675 0 : mod *= filter_weight;
676 :
677 0 : return mod;
678 : }
679 :
// Write the per-pixel squared difference between the w x h blocks `s` and `p`
// into diff_sse. The output is densely packed: w values per row, h rows,
// regardless of the input strides.
static INLINE void calculate_squared_errors(const uint8_t *s,
                                            int s_stride,
                                            const uint8_t *p,
                                            int p_stride,
                                            uint16_t *diff_sse,
                                            unsigned int w,
                                            unsigned int h) {
    int out = 0;

    for (unsigned int row = 0; row < h; row++) {
        for (unsigned int col = 0; col < w; col++) {
            const int16_t delta = s[row * s_stride + col] - p[row * p_stride + col];
            diff_sse[out++] = (uint16_t)(delta * delta);
        }
    }
}
698 :
// Write the per-pixel squared difference between the w x h 16-bit blocks `s`
// and `p` into diff_sse. The output is densely packed: w values per row,
// h rows, regardless of the input strides.
// The square is computed in unsigned 32-bit arithmetic so that the largest
// possible difference (65535) does not overflow a signed int.
static INLINE void calculate_squared_errors_highbd(const uint16_t *s,
                                                   int s_stride,
                                                   const uint16_t *p,
                                                   int p_stride,
                                                   uint32_t *diff_sse,
                                                   unsigned int w,
                                                   unsigned int h) {
    int idx = 0;
    unsigned int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
            const uint32_t a = s[i * s_stride + j];
            const uint32_t b = p[i * p_stride + j];
            // absolute difference first: at most 65535, whose square fits in 32 bits
            const uint32_t diff = a > b ? a - b : b - a;
            diff_sse[idx] = diff * diff;
            idx++;
        }
    }
}
717 :
718 : // Main function that applies filtering to a block according to the weights
719 0 : void svt_av1_apply_filtering_c(const uint8_t *y_src,
720 : int y_src_stride,
721 : const uint8_t *y_pre,
722 : int y_pre_stride,
723 : const uint8_t *u_src,
724 : const uint8_t *v_src,
725 : int uv_src_stride,
726 : const uint8_t *u_pre,
727 : const uint8_t *v_pre,
728 : int uv_pre_stride,
729 : unsigned int block_width,
730 : unsigned int block_height,
731 : int ss_x,
732 : int ss_y,
733 : int strength,
734 : const int *blk_fw,
735 : int use_whole_blk,
736 : uint32_t *y_accum,
737 : uint16_t *y_count,
738 : uint32_t *u_accum,
739 : uint16_t *u_count,
740 : uint32_t *v_accum,
741 : uint16_t *v_count){ // sub-block filter weights
742 :
743 : unsigned int i, j, k, m;
744 : int idx, idy;
745 : int modifier;
746 0 : const int rounding = (1 << strength) >> 1;
747 0 : const unsigned int uv_block_width = block_width >> ss_x;
748 0 : const unsigned int uv_block_height = block_height >> ss_y;
749 : DECLARE_ALIGNED(16, uint16_t, y_diff_se[BLK_PELS]);
750 : DECLARE_ALIGNED(16, uint16_t, u_diff_se[BLK_PELS]);
751 : DECLARE_ALIGNED(16, uint16_t, v_diff_se[BLK_PELS]);
752 :
753 0 : memset(y_diff_se, 0, BLK_PELS * sizeof(uint16_t));
754 0 : memset(u_diff_se, 0, BLK_PELS * sizeof(uint16_t));
755 0 : memset(v_diff_se, 0, BLK_PELS * sizeof(uint16_t));
756 :
757 0 : assert(use_whole_blk == 0);
758 : UNUSED(use_whole_blk);
759 :
760 : // Calculate squared differences for each pixel of the block (pred-orig)
761 0 : calculate_squared_errors(y_src, y_src_stride, y_pre, y_pre_stride, y_diff_se,
762 : block_width, block_height);
763 0 : calculate_squared_errors(u_src, uv_src_stride, u_pre, uv_pre_stride,
764 : u_diff_se, uv_block_width, uv_block_height);
765 0 : calculate_squared_errors(v_src, uv_src_stride, v_pre, uv_pre_stride,
766 : v_diff_se, uv_block_width, uv_block_height);
767 :
768 0 : for (i = 0; i < block_height; i++) {
769 0 : for (j = 0; j < block_width; j++) {
770 0 : const int pixel_value = y_pre[i * y_pre_stride + j];
771 :
772 : int filter_weight;
773 :
774 0 : if(block_width == (BW>>1)){
775 0 : filter_weight = get_subblock_filter_weight_4subblocks(i, j, block_height, block_width, blk_fw);
776 : }else{
777 0 : filter_weight = get_subblock_filter_weight_16subblocks(i, j, block_height, block_width, blk_fw);
778 : }
779 :
780 : // non-local mean approach
781 0 : int y_index = 0;
782 :
783 0 : const int uv_r = i >> ss_y;
784 0 : const int uv_c = j >> ss_x;
785 0 : modifier = 0;
786 :
787 0 : for (idy = -1; idy <= 1; ++idy) {
788 0 : for (idx = -1; idx <= 1; ++idx) {
789 0 : const int row = (int)i + idy;
790 0 : const int col = (int)j + idx;
791 :
792 0 : if (row >= 0 && row < (int)block_height && col >= 0 &&
793 0 : col < (int)block_width) {
794 0 : modifier += y_diff_se[row * (int)block_width + col];
795 0 : ++y_index;
796 : }
797 : }
798 : }
799 :
800 0 : assert(y_index > 0);
801 :
802 0 : modifier += u_diff_se[uv_r * uv_block_width + uv_c];
803 0 : modifier += v_diff_se[uv_r * uv_block_width + uv_c];
804 :
805 0 : y_index += 2;
806 :
807 0 : modifier = adjust_modifier(modifier, y_index, rounding, strength, filter_weight);
808 :
809 0 : k = i * y_pre_stride + j;
810 :
811 0 : y_count[k] += modifier;
812 0 : y_accum[k] += modifier * pixel_value;
813 :
814 : // Process chroma component
815 0 : if (!(i & ss_y) && !(j & ss_x)) {
816 0 : const int u_pixel_value = u_pre[uv_r * uv_pre_stride + uv_c];
817 0 : const int v_pixel_value = v_pre[uv_r * uv_pre_stride + uv_c];
818 :
819 : // non-local mean approach
820 0 : int cr_index = 0;
821 0 : int u_mod = 0, v_mod = 0;
822 0 : int y_diff = 0;
823 :
824 0 : for (idy = -1; idy <= 1; ++idy) {
825 0 : for (idx = -1; idx <= 1; ++idx) {
826 0 : const int row = uv_r + idy;
827 0 : const int col = uv_c + idx;
828 :
829 0 : if (row >= 0 && row < (int)uv_block_height && col >= 0 &&
830 0 : col < (int)uv_block_width) {
831 0 : u_mod += u_diff_se[row * uv_block_width + col];
832 0 : v_mod += v_diff_se[row * uv_block_width + col];
833 0 : ++cr_index;
834 : }
835 : }
836 : }
837 :
838 0 : assert(cr_index > 0);
839 :
840 0 : for (idy = 0; idy < 1 + ss_y; ++idy) {
841 0 : for (idx = 0; idx < 1 + ss_x; ++idx) {
842 0 : const int row = (uv_r << ss_y) + idy;
843 0 : const int col = (uv_c << ss_x) + idx;
844 0 : y_diff += y_diff_se[row * (int)block_width + col];
845 0 : ++cr_index;
846 : }
847 : }
848 :
849 0 : u_mod += y_diff;
850 0 : v_mod += y_diff;
851 :
852 0 : u_mod = adjust_modifier(u_mod, cr_index, rounding, strength, filter_weight);
853 0 : v_mod = adjust_modifier(v_mod, cr_index, rounding, strength, filter_weight);
854 :
855 0 : m = (i>>ss_y) * uv_pre_stride + (j>>ss_x);
856 :
857 0 : u_count[m] += u_mod;
858 0 : u_accum[m] += u_mod * u_pixel_value;
859 :
860 0 : m = (i>>ss_y) * uv_pre_stride + (j>>ss_x);
861 :
862 0 : v_count[m] += v_mod;
863 0 : v_accum[m] += v_mod * v_pixel_value;
864 : }
865 : }
866 : }
867 0 : }
868 :
869 : // Main function that applies filtering to a block according to the weights - highbd
870 0 : void svt_av1_apply_filtering_highbd_c(const uint16_t *y_src,
871 : int y_src_stride,
872 : const uint16_t *y_pre,
873 : int y_pre_stride,
874 : const uint16_t *u_src,
875 : const uint16_t *v_src,
876 : int uv_src_stride,
877 : const uint16_t *u_pre,
878 : const uint16_t *v_pre,
879 : int uv_pre_stride,
880 : unsigned int block_width,
881 : unsigned int block_height,
882 : int ss_x,
883 : int ss_y,
884 : int strength,
885 : const int *blk_fw,
886 : int use_whole_blk,
887 : uint32_t *y_accum,
888 : uint16_t *y_count,
889 : uint32_t *u_accum,
890 : uint16_t *u_count,
891 : uint32_t *v_accum,
892 : uint16_t *v_count){ // sub-block filter weights
893 :
894 : unsigned int i, j, k, m;
895 : int idx, idy;
896 0 : const int rounding = (1 << strength) >> 1;
897 0 : const unsigned int uv_block_width = block_width >> ss_x;
898 0 : const unsigned int uv_block_height = block_height >> ss_y;
899 : DECLARE_ALIGNED(16, uint32_t, y_diff_se[BLK_PELS]);
900 : DECLARE_ALIGNED(16, uint32_t, u_diff_se[BLK_PELS]);
901 : DECLARE_ALIGNED(16, uint32_t, v_diff_se[BLK_PELS]);
902 :
903 0 : memset(y_diff_se, 0, BLK_PELS * sizeof(uint32_t));
904 0 : memset(u_diff_se, 0, BLK_PELS * sizeof(uint32_t));
905 0 : memset(v_diff_se, 0, BLK_PELS * sizeof(uint32_t));
906 :
907 0 : assert(use_whole_blk == 0);
908 : UNUSED(use_whole_blk);
909 :
910 : // Calculate squared differences for each pixel of the block (pred-orig)
911 0 : calculate_squared_errors_highbd(y_src, y_src_stride, y_pre, y_pre_stride, y_diff_se,
912 : block_width, block_height);
913 0 : calculate_squared_errors_highbd(u_src, uv_src_stride, u_pre, uv_pre_stride,
914 : u_diff_se, uv_block_width, uv_block_height);
915 0 : calculate_squared_errors_highbd(v_src, uv_src_stride, v_pre, uv_pre_stride,
916 : v_diff_se, uv_block_width, uv_block_height);
917 :
918 0 : for (i = 0; i < block_height; i++) {
919 0 : for (j = 0; j < block_width; j++) {
920 0 : const int pixel_value = y_pre[i * y_pre_stride + j];
921 :
922 : int filter_weight;
923 :
924 0 : if(block_width == (BW>>1)){
925 0 : filter_weight = get_subblock_filter_weight_4subblocks(i, j, block_height, block_width, blk_fw);
926 : }else{
927 0 : filter_weight = get_subblock_filter_weight_16subblocks(i, j, block_height, block_width, blk_fw);
928 : }
929 :
930 : // non-local mean approach
931 0 : int y_index = 0;
932 :
933 0 : const int uv_r = i >> ss_y;
934 0 : const int uv_c = j >> ss_x;
935 : int final_y_mod;
936 0 : int64_t y_mod = 0;
937 :
938 0 : for (idy = -1; idy <= 1; ++idy) {
939 0 : for (idx = -1; idx <= 1; ++idx) {
940 0 : const int row = (int)i + idy;
941 0 : const int col = (int)j + idx;
942 :
943 0 : if (row >= 0 && row < (int)block_height && col >= 0 &&
944 0 : col < (int)block_width) {
945 0 : y_mod += y_diff_se[row * (int)block_width + col];
946 0 : ++y_index;
947 : }
948 : }
949 : }
950 :
951 0 : assert(y_index > 0);
952 :
953 0 : y_mod += u_diff_se[uv_r * uv_block_width + uv_c];
954 0 : y_mod += v_diff_se[uv_r * uv_block_width + uv_c];
955 :
956 0 : y_index += 2;
957 :
958 0 : final_y_mod = adjust_modifier_highbd(y_mod, y_index, rounding, strength, filter_weight);
959 :
960 0 : k = i * y_pre_stride + j;
961 :
962 0 : y_count[k] += final_y_mod;
963 0 : y_accum[k] += final_y_mod * pixel_value;
964 :
965 : // Process chroma component
966 0 : if (!(i & ss_y) && !(j & ss_x)) {
967 0 : const int u_pixel_value = u_pre[uv_r * uv_pre_stride + uv_c];
968 0 : const int v_pixel_value = v_pre[uv_r * uv_pre_stride + uv_c];
969 :
970 : // non-local mean approach
971 0 : int cr_index = 0;
972 0 : int64_t u_mod = 0, v_mod = 0;
973 : int final_u_mod, final_v_mod;
974 0 : int y_diff = 0;
975 :
976 0 : for (idy = -1; idy <= 1; ++idy) {
977 0 : for (idx = -1; idx <= 1; ++idx) {
978 0 : const int row = uv_r + idy;
979 0 : const int col = uv_c + idx;
980 :
981 0 : if (row >= 0 && row < (int)uv_block_height && col >= 0 &&
982 0 : col < (int)uv_block_width) {
983 0 : u_mod += u_diff_se[row * uv_block_width + col];
984 0 : v_mod += v_diff_se[row * uv_block_width + col];
985 0 : ++cr_index;
986 : }
987 : }
988 : }
989 :
990 0 : assert(cr_index > 0);
991 :
992 0 : for (idy = 0; idy < 1 + ss_y; ++idy) {
993 0 : for (idx = 0; idx < 1 + ss_x; ++idx) {
994 0 : const int row = (uv_r << ss_y) + idy;
995 0 : const int col = (uv_c << ss_x) + idx;
996 0 : y_diff += y_diff_se[row * (int)block_width + col];
997 0 : ++cr_index;
998 : }
999 : }
1000 :
1001 0 : u_mod += y_diff;
1002 0 : v_mod += y_diff;
1003 :
1004 0 : final_u_mod = adjust_modifier_highbd(u_mod, cr_index, rounding, strength, filter_weight);
1005 0 : final_v_mod = adjust_modifier_highbd(v_mod, cr_index, rounding, strength, filter_weight);
1006 :
1007 0 : m = (i>>ss_y) * uv_pre_stride + (j>>ss_x);
1008 :
1009 0 : u_count[m] += final_u_mod;
1010 0 : u_accum[m] += final_u_mod * u_pixel_value;
1011 :
1012 0 : m = (i>>ss_y) * uv_pre_stride + (j>>ss_x);
1013 :
1014 0 : v_count[m] += final_v_mod;
1015 0 : v_accum[m] += final_v_mod * v_pixel_value;
1016 : }
1017 : }
1018 : }
1019 0 : }
1020 :
1021 11520 : static void apply_filtering_block(int block_row,
1022 : int block_col,
1023 : EbByte *src,
1024 : uint16_t **src_16bit,
1025 : EbByte *pred,
1026 : uint16_t **pred_16bit,
1027 : uint32_t **accum,
1028 : uint16_t **count,
1029 : uint32_t *stride,
1030 : uint32_t *stride_pred,
1031 : int block_width,
1032 : int block_height,
1033 : uint32_t ss_x, // chroma sub-sampling in x
1034 : uint32_t ss_y, // chroma sub-sampling in y
1035 : int altref_strength,
1036 : const int *blk_fw,
1037 : EbBool is_highbd) {
1038 :
1039 11520 : int blk_h = BH >> 1; int blk_w = BW >> 1; // fixed 32x32 blocks for now
1040 :
1041 11520 : int offset_src_buffer_Y = block_row * blk_h * stride[C_Y] + block_col * blk_w;
1042 11520 : int offset_src_buffer_U = block_row * (blk_h >> ss_y) * stride[C_U] + block_col * (blk_w >> ss_x);
1043 11520 : int offset_src_buffer_V = block_row * (blk_h >> ss_y) * stride[C_V] + block_col * (blk_w >> ss_x);
1044 :
1045 11520 : int offset_block_buffer_Y = block_row * blk_h * stride_pred[C_Y] + block_col * blk_w;
1046 11520 : int offset_block_buffer_U = block_row * (blk_h >> ss_y) * stride_pred[C_U] + block_col * (blk_w >> ss_x);
1047 11520 : int offset_block_buffer_V = block_row * (blk_h >> ss_y) * stride_pred[C_V] + block_col * (blk_w >> ss_x);
1048 :
1049 : int blk_fw_32x32[4];
1050 :
1051 11520 : int idx_32x32 = block_row * 2 + block_col;
1052 :
1053 : uint8_t *src_ptr[COLOR_CHANNELS];
1054 : uint8_t *pred_ptr[COLOR_CHANNELS];
1055 : uint32_t *accum_ptr[COLOR_CHANNELS];
1056 : uint16_t *count_ptr[COLOR_CHANNELS];
1057 :
1058 : uint16_t *src_ptr_16bit[COLOR_CHANNELS];
1059 : uint16_t *pred_ptr_16bit[COLOR_CHANNELS];
1060 :
1061 57600 : for (int ifw = 0; ifw < 4; ifw++) {
1062 46080 : int ifw_index = index_16x16_from_subindexes[idx_32x32][ifw];
1063 :
1064 46080 : blk_fw_32x32[ifw] = blk_fw[ifw_index];
1065 : }
1066 :
1067 11520 : accum_ptr[C_Y] = accum[C_Y] + offset_block_buffer_Y;
1068 11520 : accum_ptr[C_U] = accum[C_U] + offset_block_buffer_U;
1069 11520 : accum_ptr[C_V] = accum[C_V] + offset_block_buffer_V;
1070 :
1071 11520 : count_ptr[C_Y] = count[C_Y] + offset_block_buffer_Y;
1072 11520 : count_ptr[C_U] = count[C_U] + offset_block_buffer_U;
1073 11520 : count_ptr[C_V] = count[C_V] + offset_block_buffer_V;
1074 :
1075 11520 : if(!is_highbd){
1076 11520 : src_ptr[C_Y] = src[C_Y] + offset_src_buffer_Y;
1077 11520 : src_ptr[C_U] = src[C_U] + offset_src_buffer_U;
1078 11520 : src_ptr[C_V] = src[C_V] + offset_src_buffer_V;
1079 :
1080 11520 : pred_ptr[C_Y] = pred[C_Y] + offset_block_buffer_Y;
1081 11520 : pred_ptr[C_U] = pred[C_U] + offset_block_buffer_U;
1082 11520 : pred_ptr[C_V] = pred[C_V] + offset_block_buffer_V;
1083 :
1084 : // Apply the temporal filtering strategy
1085 11520 : svt_av1_apply_filtering(src_ptr[C_Y],
1086 11520 : stride[C_Y],
1087 11520 : pred_ptr[C_Y],
1088 11520 : stride_pred[C_Y],
1089 11520 : src_ptr[C_U],
1090 11520 : src_ptr[C_V],
1091 11520 : stride[C_U],
1092 11520 : pred_ptr[C_U],
1093 11520 : pred_ptr[C_V],
1094 11520 : stride_pred[C_U],
1095 : (unsigned int)block_width,
1096 : (unsigned int)block_height,
1097 : ss_x,
1098 : ss_y,
1099 : altref_strength,
1100 : blk_fw_32x32,
1101 : 0, // use_32x32
1102 : accum_ptr[C_Y],
1103 : count_ptr[C_Y],
1104 : accum_ptr[C_U],
1105 : count_ptr[C_U],
1106 : accum_ptr[C_V],
1107 : count_ptr[C_V]);
1108 : }else{
1109 0 : src_ptr_16bit[C_Y] = src_16bit[C_Y] + offset_src_buffer_Y;
1110 0 : src_ptr_16bit[C_U] = src_16bit[C_U] + offset_src_buffer_U;
1111 0 : src_ptr_16bit[C_V] = src_16bit[C_V] + offset_src_buffer_V;
1112 :
1113 0 : pred_ptr_16bit[C_Y] = pred_16bit[C_Y] + offset_block_buffer_Y;
1114 0 : pred_ptr_16bit[C_U] = pred_16bit[C_U] + offset_block_buffer_U;
1115 0 : pred_ptr_16bit[C_V] = pred_16bit[C_V] + offset_block_buffer_V;
1116 :
1117 : // Apply the temporal filtering strategy
1118 0 : svt_av1_apply_filtering_highbd(src_ptr_16bit[C_Y],
1119 0 : stride[C_Y],
1120 0 : pred_ptr_16bit[C_Y],
1121 0 : stride_pred[C_Y],
1122 0 : src_ptr_16bit[C_U],
1123 0 : src_ptr_16bit[C_V],
1124 0 : stride[C_U],
1125 0 : pred_ptr_16bit[C_U],
1126 0 : pred_ptr_16bit[C_V],
1127 0 : stride_pred[C_U],
1128 : (unsigned int)block_width,
1129 : (unsigned int)block_height,
1130 : ss_x,
1131 : ss_y,
1132 : altref_strength,
1133 : blk_fw_32x32,
1134 : 0, // use_32x32
1135 : accum_ptr[C_Y],
1136 : count_ptr[C_Y],
1137 : accum_ptr[C_U],
1138 : count_ptr[C_U],
1139 : accum_ptr[C_V],
1140 : count_ptr[C_V]);
1141 : }
1142 :
1143 11520 : }
1144 :
1145 : // Apply filtering to the central picture
1146 479 : static void apply_filtering_central(EbByte *pred,
1147 : uint32_t **accum,
1148 : uint16_t **count,
1149 : uint16_t blk_width,
1150 : uint16_t blk_height,
1151 : uint32_t ss_x,
1152 : uint32_t ss_y) {
1153 :
1154 : uint16_t i, j, k;
1155 479 : uint16_t blk_height_y = blk_height;
1156 479 : uint16_t blk_width_y = blk_width;
1157 479 : uint16_t blk_height_ch = blk_height >> ss_y;
1158 479 : uint16_t blk_width_ch = blk_width >> ss_x;
1159 479 : uint16_t blk_stride_y = blk_width;
1160 479 : uint16_t blk_stride_ch = blk_width >> ss_x;
1161 :
1162 479 : int filter_weight = INIT_WEIGHT;
1163 479 : const int modifier = filter_weight * WEIGHT_MULTIPLIER;
1164 :
1165 : // Luma
1166 479 : k = 0;
1167 31146 : for (i = 0; i < blk_height_y; i++) {
1168 1979690 : for (j = 0; j < blk_width_y; j++) {
1169 1949030 : accum[C_Y][k] += modifier * pred[C_Y][i * blk_stride_y + j];
1170 1949030 : count[C_Y][k] += modifier;
1171 1949030 : ++k;
1172 : }
1173 : }
1174 :
1175 : // Chroma
1176 479 : k = 0;
1177 15833 : for (i = 0; i < blk_height_ch; i++) {
1178 505235 : for (j = 0; j < blk_width_ch; j++) {
1179 489881 : accum[C_U][k] += modifier * pred[C_U][i * blk_stride_ch + j];
1180 489881 : count[C_U][k] += modifier;
1181 :
1182 489881 : accum[C_V][k] += modifier * pred[C_V][i * blk_stride_ch + j];
1183 489881 : count[C_V][k] += modifier;
1184 489881 : ++k;
1185 : }
1186 : }
1187 479 : }
1188 :
1189 : // Apply filtering to the central picture
1190 0 : static void apply_filtering_central_highbd(uint16_t **pred_16bit,
1191 : uint32_t **accum,
1192 : uint16_t **count,
1193 : uint16_t blk_width,
1194 : uint16_t blk_height,
1195 : uint32_t ss_x,
1196 : uint32_t ss_y) {
1197 :
1198 : uint16_t i, j, k;
1199 0 : uint16_t blk_height_y = blk_height;
1200 0 : uint16_t blk_width_y = blk_width;
1201 0 : uint16_t blk_height_ch= blk_height >> ss_y;
1202 0 : uint16_t blk_width_ch = blk_width >> ss_x;
1203 0 : uint16_t blk_stride_y = blk_width;
1204 0 : uint16_t blk_stride_ch = blk_width >> ss_x;
1205 :
1206 0 : int filter_weight = INIT_WEIGHT;
1207 0 : const int modifier = filter_weight * WEIGHT_MULTIPLIER;
1208 :
1209 : // Luma
1210 0 : k = 0;
1211 0 : for (i = 0; i < blk_height_y; i++) {
1212 0 : for (j = 0; j < blk_width_y; j++) {
1213 0 : accum[C_Y][k] += modifier * pred_16bit[C_Y][i * blk_stride_y + j];
1214 0 : count[C_Y][k] += modifier;
1215 0 : ++k;
1216 : }
1217 : }
1218 :
1219 : // Chroma
1220 0 : k = 0;
1221 0 : for (i = 0; i < blk_height_ch; i++) {
1222 0 : for (j = 0; j < blk_width_ch; j++) {
1223 0 : accum[C_U][k] += modifier * pred_16bit[C_U][i * blk_stride_ch + j];
1224 0 : count[C_U][k] += modifier;
1225 :
1226 0 : accum[C_V][k] += modifier * pred_16bit[C_V][i * blk_stride_ch + j];
1227 0 : count[C_V][k] += modifier;
1228 0 : ++k;
1229 : }
1230 : }
1231 0 : }
1232 :
1233 : uint32_t get_mds_idx(uint32_t orgx, uint32_t orgy, uint32_t size, uint32_t use_128x128);
1234 :
1235 2880 : static void tf_inter_prediction(PictureParentControlSet *picture_control_set_ptr,
1236 : MeContext *context_ptr,
1237 : EbPictureBufferDesc *pic_ptr_ref,
1238 : EbByte *pred,
1239 : uint16_t **pred_16bit,
1240 : uint32_t *stride_pred,
1241 : EbByte *src,
1242 : uint16_t **src_16bit,
1243 : uint32_t *stride_src,
1244 : uint32_t sb_origin_x,
1245 : uint32_t sb_origin_y,
1246 : uint32_t ss_x,
1247 : uint32_t ss_y,
1248 : const int* use_16x16_subblocks,
1249 : int encoder_bit_depth)
1250 : {
1251 : const InterpFilters interp_filters =
1252 2880 : av1_make_interp_filters(MULTITAP_SHARP, MULTITAP_SHARP);
1253 :
1254 2880 : EbBool is_highbd = (encoder_bit_depth == 8) ? (uint8_t)EB_FALSE : (uint8_t)EB_TRUE;
1255 :
1256 : CodingUnit cu_ptr;
1257 : MacroBlockD av1xd;
1258 2880 : cu_ptr.av1xd = &av1xd;
1259 : MvUnit mv_unit;
1260 2880 : mv_unit.pred_direction = UNI_PRED_LIST_0;
1261 :
1262 : EbPictureBufferDesc reference_ptr;
1263 : EbPictureBufferDesc prediction_ptr;
1264 :
1265 : UNUSED(ss_x);
1266 :
1267 2880 : prediction_ptr.origin_x = 0;
1268 2880 : prediction_ptr.origin_y = 0;
1269 2880 : prediction_ptr.stride_y = BW;
1270 2880 : prediction_ptr.stride_cb = (uint16_t)BW >> ss_x;
1271 2880 : prediction_ptr.stride_cr = (uint16_t)BW >> ss_x;
1272 :
1273 2880 : if(!is_highbd){
1274 2880 : assert(src[C_Y] != NULL);
1275 2880 : assert(src[C_U] != NULL);
1276 2880 : assert(src[C_V] != NULL);
1277 2880 : prediction_ptr.buffer_y = pred[C_Y];
1278 2880 : prediction_ptr.buffer_cb = pred[C_U];
1279 2880 : prediction_ptr.buffer_cr = pred[C_V];
1280 : }else{
1281 0 : assert(src_16bit[C_Y] != NULL);
1282 0 : assert(src_16bit[C_U] != NULL);
1283 0 : assert(src_16bit[C_V] != NULL);
1284 0 : prediction_ptr.buffer_y = (uint8_t*) pred_16bit[C_Y];
1285 0 : prediction_ptr.buffer_cb = (uint8_t*) pred_16bit[C_U];
1286 0 : prediction_ptr.buffer_cr = (uint8_t*) pred_16bit[C_V];
1287 :
1288 0 : reference_ptr.buffer_y = (uint8_t*)malloc(pic_ptr_ref->luma_size * sizeof(uint16_t));
1289 0 : reference_ptr.buffer_cb = (uint8_t*)malloc(pic_ptr_ref->chroma_size * sizeof(uint16_t));
1290 0 : reference_ptr.buffer_cr = (uint8_t*)malloc(pic_ptr_ref->chroma_size * sizeof(uint16_t));
1291 :
1292 0 : reference_ptr.origin_x = pic_ptr_ref->origin_x;
1293 0 : reference_ptr.origin_y = pic_ptr_ref->origin_y;
1294 0 : reference_ptr.stride_y = pic_ptr_ref->stride_y;
1295 0 : reference_ptr.stride_cb = pic_ptr_ref->stride_cb;
1296 0 : reference_ptr.stride_cr = pic_ptr_ref->stride_cr;
1297 0 : reference_ptr.width = pic_ptr_ref->width;
1298 0 : reference_ptr.height = pic_ptr_ref->height;
1299 :
1300 0 : uint32_t height_y = (uint32_t)(2*reference_ptr.origin_y + reference_ptr.height);
1301 :
1302 0 : pack2d_src(pic_ptr_ref->buffer_y,
1303 0 : reference_ptr.stride_y,
1304 : pic_ptr_ref->buffer_bit_inc_y,
1305 0 : pic_ptr_ref->stride_bit_inc_y,
1306 0 : (uint16_t*)reference_ptr.buffer_y,
1307 0 : reference_ptr.stride_y,
1308 0 : reference_ptr.stride_y,
1309 : height_y);
1310 :
1311 0 : pack2d_src(pic_ptr_ref->buffer_cb,
1312 0 : reference_ptr.stride_cb,
1313 : pic_ptr_ref->buffer_bit_inc_cb,
1314 0 : pic_ptr_ref->stride_bit_inc_cb,
1315 0 : (uint16_t*)reference_ptr.buffer_cb,
1316 0 : reference_ptr.stride_cb,
1317 0 : reference_ptr.stride_cb,
1318 : height_y >> ss_y);
1319 :
1320 0 : pack2d_src(pic_ptr_ref->buffer_cr,
1321 0 : reference_ptr.stride_cr,
1322 : pic_ptr_ref->buffer_bit_inc_cr,
1323 0 : pic_ptr_ref->stride_bit_inc_cr,
1324 0 : (uint16_t*)reference_ptr.buffer_cr,
1325 0 : reference_ptr.stride_cr,
1326 0 : reference_ptr.stride_cr,
1327 : height_y >> ss_y);
1328 : }
1329 :
1330 14399 : for (uint32_t idx_32x32 = 0; idx_32x32 < 4; idx_32x32++) {
1331 11520 : if (use_16x16_subblocks[idx_32x32] != 0) {
1332 11520 : uint32_t bsize = 16;
1333 :
1334 57586 : for (uint32_t idx_16x16 = 0; idx_16x16 < 4; idx_16x16++) {
1335 46067 : uint32_t pu_index = index_16x16_from_subindexes[idx_32x32][idx_16x16];
1336 :
1337 46067 : uint32_t idx_y = subblock_xy_16x16[pu_index][0];
1338 46067 : uint32_t idx_x = subblock_xy_16x16[pu_index][1];
1339 46067 : uint16_t local_origin_x = idx_x * bsize;
1340 46067 : uint16_t local_origin_y = idx_y * bsize;
1341 46067 : uint16_t pu_origin_x = sb_origin_x + local_origin_x;
1342 46067 : uint16_t pu_origin_y = sb_origin_y + local_origin_y;
1343 46067 : uint32_t mirow = pu_origin_y >> MI_SIZE_LOG2;
1344 46067 : uint32_t micol = pu_origin_x >> MI_SIZE_LOG2;
1345 46067 : cu_ptr.mds_idx = get_mds_idx(local_origin_x, local_origin_y, bsize, picture_control_set_ptr->sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128);
1346 :
1347 46065 : const int32_t bw = mi_size_wide[BLOCK_16X16];
1348 46065 : const int32_t bh = mi_size_high[BLOCK_16X16];
1349 46065 : cu_ptr.av1xd->mb_to_top_edge = -(int32_t)((mirow * MI_SIZE) * 8);
1350 46065 : cu_ptr.av1xd->mb_to_bottom_edge = ((picture_control_set_ptr->av1_cm->mi_rows - bw - mirow) * MI_SIZE) * 8;
1351 46065 : cu_ptr.av1xd->mb_to_left_edge = -(int32_t)((micol * MI_SIZE) * 8);
1352 46065 : cu_ptr.av1xd->mb_to_right_edge = ((picture_control_set_ptr->av1_cm->mi_cols - bh - micol) * MI_SIZE) * 8;
1353 :
1354 46065 : uint32_t mv_index = tab16x16[pu_index];
1355 46065 : mv_unit.mv->x = _MVXT(context_ptr->p_best_mv16x16[mv_index]);
1356 46065 : mv_unit.mv->y = _MVYT(context_ptr->p_best_mv16x16[mv_index]);
1357 : //AV1 MVs are always in 1/8th pel precision.
1358 46065 : mv_unit.mv->x = mv_unit.mv->x << 1;
1359 46065 : mv_unit.mv->y = mv_unit.mv->y << 1;
1360 46065 : uint64_t best_distortion = (uint64_t)~0;
1361 46065 : signed short best_mv_x = 0;
1362 46065 : signed short best_mv_y = 0;
1363 46065 : signed short mv_x = (_MVXT(context_ptr->p_best_mv16x16[mv_index])) << 1;
1364 46065 : signed short mv_y = (_MVYT(context_ptr->p_best_mv16x16[mv_index])) << 1;
1365 :
1366 184380 : for (signed short i = -1; i <= 1; i++) {
1367 552455 : for (signed short j = -1; j <= 1; j++) {
1368 :
1369 414140 : mv_unit.mv->x = mv_x + i;
1370 414140 : mv_unit.mv->y = mv_y + j;
1371 :
1372 414140 : av1_inter_prediction_function_table[is_highbd](
1373 : NULL, //picture_control_set_ptr,
1374 : (uint32_t)interp_filters,
1375 : &cu_ptr,
1376 : 0,//ref_frame_type,
1377 : &mv_unit,
1378 : 0,//use_intrabc,
1379 : #if OBMC_FLAG
1380 : SIMPLE_TRANSLATION,
1381 : 0,
1382 : 0,
1383 : #endif
1384 : 1,//compound_idx not used
1385 : NULL,// interinter_comp not used
1386 : #if II_COMP_FLAG
1387 : NULL,
1388 : NULL,
1389 : NULL,
1390 : NULL,
1391 : 0,
1392 : 0,
1393 : 0,
1394 : 0,
1395 : #endif
1396 : pu_origin_x,
1397 : pu_origin_y,
1398 : bsize,
1399 : bsize,
1400 : !is_highbd ? pic_ptr_ref : &reference_ptr,
1401 : NULL,//ref_pic_list1,
1402 : &prediction_ptr,
1403 : local_origin_x,
1404 : local_origin_y,
1405 : 1,//perform_chroma,
1406 414140 : (uint8_t)encoder_bit_depth);
1407 :
1408 : uint64_t distortion;
1409 413835 : if(!is_highbd){
1410 413854 : uint8_t *pred_y_ptr = pred[C_Y] + bsize * idx_y*stride_pred[C_Y] + bsize * idx_x;
1411 413854 : uint8_t *src_y_ptr = src[C_Y] + bsize * idx_y*stride_src[C_Y] + bsize * idx_x;
1412 :
1413 413854 : const aom_variance_fn_ptr_t *fn_ptr = &mefn_ptr[BLOCK_16X16];
1414 :
1415 : unsigned int sse;
1416 413854 : distortion = fn_ptr->vf(pred_y_ptr, stride_pred[C_Y], src_y_ptr, stride_src[C_Y], &sse);
1417 : }else{
1418 0 : uint16_t *pred_y_ptr = pred_16bit[C_Y] + bsize * idx_y*stride_pred[C_Y] + bsize * idx_x;
1419 0 : uint16_t *src_y_ptr = src_16bit[C_Y] + bsize * idx_y*stride_src[C_Y] + bsize * idx_x;;
1420 :
1421 : unsigned int sse;
1422 0 : distortion = variance_highbd_c(pred_y_ptr, stride_pred[C_Y], src_y_ptr, stride_src[C_Y], 16, 16, &sse);
1423 : }
1424 :
1425 414290 : if (distortion < best_distortion) {
1426 106024 : best_distortion = distortion;
1427 106024 : best_mv_x = mv_unit.mv->x;
1428 106024 : best_mv_y = mv_unit.mv->y;
1429 : }
1430 : }
1431 : }
1432 :
1433 : // Perform final pass using the 1/8 MV
1434 : //AV1 MVs are always in 1/8th pel precision.
1435 46215 : mv_unit.mv->x = best_mv_x;
1436 46215 : mv_unit.mv->y = best_mv_y;
1437 :
1438 46215 : av1_inter_prediction_function_table[is_highbd](
1439 : NULL, //picture_control_set_ptr,
1440 : (uint32_t)interp_filters,
1441 : &cu_ptr,
1442 : 0,//ref_frame_type,
1443 : &mv_unit,
1444 : 0,//use_intrabc,
1445 : #if OBMC_FLAG
1446 : SIMPLE_TRANSLATION,
1447 : 0,
1448 : 0,
1449 : #endif
1450 : 1,//compound_idx not used
1451 : NULL,// interinter_comp not used
1452 : #if II_COMP_FLAG
1453 : NULL,
1454 : NULL,
1455 : NULL,
1456 : NULL,
1457 : 0,
1458 : 0,
1459 : 0,
1460 : 0,
1461 : #endif
1462 : pu_origin_x,
1463 : pu_origin_y,
1464 : bsize,
1465 : bsize,
1466 : !is_highbd ? pic_ptr_ref : &reference_ptr,
1467 : NULL,//ref_pic_list1,
1468 : &prediction_ptr,
1469 : local_origin_x,
1470 : local_origin_y,
1471 : 1,//perform_chroma,
1472 46215 : (uint8_t)encoder_bit_depth);
1473 :
1474 : }
1475 : }
1476 : }
1477 :
1478 2879 : if(is_highbd){
1479 0 : free(reference_ptr.buffer_y);
1480 0 : free(reference_ptr.buffer_cb);
1481 0 : free(reference_ptr.buffer_cr);
1482 : }
1483 :
1484 2879 : }
1485 :
1486 480 : static void get_final_filtered_pixels(EbByte *src_center_ptr_start,
1487 : uint16_t **altref_buffer_highbd_start,
1488 : uint32_t **accum,
1489 : uint16_t **count,
1490 : const uint32_t *stride,
1491 : int blk_y_src_offset,
1492 : int blk_ch_src_offset,
1493 : uint16_t blk_width_ch,
1494 : uint16_t blk_height_ch,
1495 : uint64_t *filtered_sse,
1496 : uint64_t *filtered_sse_uv,
1497 : EbBool is_highbd){
1498 :
1499 : int i, j, k;
1500 :
1501 480 : if(!is_highbd){
1502 : // Process luma
1503 480 : int pos = blk_y_src_offset;
1504 31116 : for (i = 0, k = 0; i < BH; i++) {
1505 1981440 : for (j = 0; j < BW; j++, k++) {
1506 1950800 : (*filtered_sse) += (uint64_t)((int32_t)src_center_ptr_start[C_Y][pos] - (int32_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]))* ((int32_t)src_center_ptr_start[C_Y][pos] - (int32_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]));
1507 1950750 : src_center_ptr_start[C_Y][pos] = (uint8_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]);
1508 1950740 : pos++;
1509 : }
1510 30636 : pos += stride[C_Y] - BW;
1511 : }
1512 : // Process chroma
1513 415 : pos = blk_ch_src_offset;
1514 15776 : for (i = 0, k = 0; i < blk_height_ch; i++) {
1515 506341 : for (j = 0; j < blk_width_ch; j++, k++) {
1516 490980 : (*filtered_sse_uv) += (uint64_t)((int32_t)src_center_ptr_start[C_U][pos] - (int32_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]))* ((int32_t)src_center_ptr_start[C_U][pos] - (int32_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]));
1517 490980 : (*filtered_sse_uv) += (uint64_t)((int32_t)src_center_ptr_start[C_V][pos] - (int32_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]))* ((int32_t)src_center_ptr_start[C_V][pos] - (int32_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]));
1518 490981 : src_center_ptr_start[C_U][pos] = (uint8_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]);
1519 490981 : src_center_ptr_start[C_V][pos] = (uint8_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]);
1520 490981 : pos++;
1521 : }
1522 15361 : pos += stride[C_U] - blk_width_ch;
1523 : }
1524 : }else{
1525 : // Process luma
1526 0 : int pos = blk_y_src_offset;
1527 0 : for (i = 0, k = 0; i < BH; i++) {
1528 0 : for (j = 0; j < BW; j++, k++) {
1529 0 : (*filtered_sse) += (uint64_t)((int32_t)altref_buffer_highbd_start[C_Y][pos] - (int32_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]))* ((int32_t)altref_buffer_highbd_start[C_Y][pos] - (int32_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]));
1530 0 : altref_buffer_highbd_start[C_Y][pos] = (uint16_t)OD_DIVU(accum[C_Y][k] + (count[C_Y][k] >> 1), count[C_Y][k]);
1531 0 : pos++;
1532 : }
1533 0 : pos += stride[C_Y] - BW;
1534 : }
1535 : // Process chroma
1536 0 : pos = blk_ch_src_offset;
1537 0 : for (i = 0, k = 0; i < blk_height_ch; i++) {
1538 0 : for (j = 0; j < blk_width_ch; j++, k++) {
1539 0 : (*filtered_sse_uv) += (uint64_t)((int32_t)altref_buffer_highbd_start[C_U][pos] - (int32_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]))* ((int32_t)altref_buffer_highbd_start[C_U][pos] - (int32_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]));
1540 0 : (*filtered_sse_uv) += (uint64_t)((int32_t)altref_buffer_highbd_start[C_V][pos] - (int32_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]))* ((int32_t)altref_buffer_highbd_start[C_V][pos] - (int32_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]));
1541 0 : altref_buffer_highbd_start[C_U][pos] = (uint16_t)OD_DIVU(accum[C_U][k] + (count[C_U][k] >> 1), count[C_U][k]);
1542 0 : altref_buffer_highbd_start[C_V][pos] = (uint16_t)OD_DIVU(accum[C_V][k] + (count[C_V][k] >> 1), count[C_V][k]);
1543 0 : pos++;
1544 : }
1545 0 : pos += stride[C_U] - blk_width_ch;
1546 : }
1547 : }
1548 416 : }
1549 :
1550 : // Produce the filtered alt-ref picture
1551 : // - core function
1552 480 : static EbErrorType produce_temporally_filtered_pic(PictureParentControlSet **list_picture_control_set_ptr,
1553 : EbPictureBufferDesc **list_input_picture_ptr,
1554 : uint8_t altref_strength,
1555 : uint8_t index_center,
1556 : uint64_t *filtered_sse,
1557 : uint64_t *filtered_sse_uv,
1558 : MotionEstimationContext_t *me_context_ptr,
1559 : int32_t segment_index,
1560 : EbBool is_highbd) {
1561 : int frame_index;
1562 : DECLARE_ALIGNED(16, uint32_t, accumulator[BLK_PELS * COLOR_CHANNELS]);
1563 : DECLARE_ALIGNED(16, uint16_t, counter[BLK_PELS * COLOR_CHANNELS]);
1564 480 : uint32_t *accum[COLOR_CHANNELS] = { accumulator, accumulator + BLK_PELS, accumulator + (BLK_PELS<<1) };
1565 480 : uint16_t *count[COLOR_CHANNELS] = { counter, counter + BLK_PELS, counter + (BLK_PELS<<1) };
1566 :
1567 480 : EbByte predictor = { NULL };
1568 480 : uint16_t *predictor_16bit = { NULL };
1569 480 : if(!is_highbd){
1570 480 : EB_MALLOC_ALIGNED_ARRAY(predictor, BLK_PELS * COLOR_CHANNELS);
1571 : }else{
1572 0 : EB_MALLOC_ALIGNED_ARRAY(predictor_16bit, BLK_PELS * COLOR_CHANNELS);
1573 : }
1574 480 : EbByte pred[COLOR_CHANNELS] = { predictor, predictor + BLK_PELS, predictor + (BLK_PELS<<1) };
1575 480 : uint16_t* pred_16bit[COLOR_CHANNELS] = { predictor_16bit, predictor_16bit + BLK_PELS, predictor_16bit + (BLK_PELS<<1) };
1576 :
1577 480 : EbByte src_center_ptr_start[COLOR_CHANNELS], src_center_ptr[COLOR_CHANNELS] = { NULL };
1578 480 : uint16_t* altref_buffer_highbd_start[COLOR_CHANNELS], *altref_buffer_highbd_ptr[COLOR_CHANNELS] = { NULL };
1579 :
1580 : uint32_t blk_row, blk_col;
1581 480 : int blk_y_src_offset = 0, blk_ch_src_offset = 0;
1582 :
1583 480 : PictureParentControlSet *picture_control_set_ptr_central = list_picture_control_set_ptr[index_center];
1584 480 : EbPictureBufferDesc *input_picture_ptr_central = list_input_picture_ptr[index_center];
1585 :
1586 480 : int encoder_bit_depth = (int)picture_control_set_ptr_central->sequence_control_set_ptr->static_config.encoder_bit_depth;
1587 :
1588 : // chroma subsampling
1589 480 : uint32_t ss_x = picture_control_set_ptr_central->sequence_control_set_ptr->subsampling_x;
1590 480 : uint32_t ss_y = picture_control_set_ptr_central->sequence_control_set_ptr->subsampling_y;
1591 480 : uint16_t blk_width_ch = (uint16_t)BW >> ss_x;
1592 480 : uint16_t blk_height_ch = (uint16_t)BH >> ss_y;
1593 :
1594 480 : uint32_t blk_cols = (uint32_t)(input_picture_ptr_central->width + BW - 1) / BW; // I think only the part of the picture
1595 480 : uint32_t blk_rows = (uint32_t)(input_picture_ptr_central->height + BH - 1) / BH; // that fits to the 32x32 blocks are actually filtered
1596 :
1597 480 : uint32_t stride[COLOR_CHANNELS] = { input_picture_ptr_central->stride_y,
1598 480 : input_picture_ptr_central->stride_cb,
1599 480 : input_picture_ptr_central->stride_cr };
1600 480 : uint32_t stride_pred[COLOR_CHANNELS] = {BW, blk_width_ch, blk_width_ch};
1601 :
1602 480 : MeContext *context_ptr = me_context_ptr->me_context_ptr;
1603 :
1604 : uint32_t x_seg_idx;
1605 : uint32_t y_seg_idx;
1606 480 : uint32_t picture_width_in_b64 = blk_cols;
1607 480 : uint32_t picture_height_in_b64 = blk_rows;
1608 480 : SEGMENT_CONVERT_IDX_TO_XY(segment_index, x_seg_idx, y_seg_idx, picture_control_set_ptr_central->tf_segments_column_count);
1609 480 : uint32_t x_b64_start_idx = SEGMENT_START_IDX(x_seg_idx, picture_width_in_b64, picture_control_set_ptr_central->tf_segments_column_count);
1610 480 : uint32_t x_b64_end_idx = SEGMENT_END_IDX (x_seg_idx, picture_width_in_b64, picture_control_set_ptr_central->tf_segments_column_count);
1611 480 : uint32_t y_b64_start_idx = SEGMENT_START_IDX(y_seg_idx, picture_height_in_b64, picture_control_set_ptr_central->tf_segments_row_count);
1612 480 : uint32_t y_b64_end_idx = SEGMENT_END_IDX (y_seg_idx, picture_height_in_b64, picture_control_set_ptr_central->tf_segments_row_count);
1613 :
1614 : // first position of the frame buffer according to the index center
1615 480 : src_center_ptr_start[C_Y] = input_picture_ptr_central->buffer_y +
1616 480 : input_picture_ptr_central->origin_y*input_picture_ptr_central->stride_y +
1617 480 : input_picture_ptr_central->origin_x;
1618 :
1619 480 : src_center_ptr_start[C_U] = input_picture_ptr_central->buffer_cb +
1620 480 : (input_picture_ptr_central->origin_y>>ss_y)*input_picture_ptr_central->stride_cb +
1621 480 : (input_picture_ptr_central->origin_x>>ss_x);
1622 :
1623 480 : src_center_ptr_start[C_V] = input_picture_ptr_central->buffer_cr +
1624 480 : (input_picture_ptr_central->origin_y>>ss_y)*input_picture_ptr_central->stride_cr +
1625 480 : (input_picture_ptr_central->origin_x>>ss_x);
1626 :
1627 480 : altref_buffer_highbd_start[C_Y] = picture_control_set_ptr_central->altref_buffer_highbd[C_Y] +
1628 480 : input_picture_ptr_central->origin_y*input_picture_ptr_central->stride_y +
1629 480 : input_picture_ptr_central->origin_x;
1630 :
1631 480 : altref_buffer_highbd_start[C_U] = picture_control_set_ptr_central->altref_buffer_highbd[C_U] +
1632 480 : (input_picture_ptr_central->origin_y>>ss_y)*input_picture_ptr_central->stride_bit_inc_cb +
1633 480 : (input_picture_ptr_central->origin_x>>ss_x);
1634 :
1635 480 : altref_buffer_highbd_start[C_V] = picture_control_set_ptr_central->altref_buffer_highbd[C_V] +
1636 480 : (input_picture_ptr_central->origin_y>>ss_y)*input_picture_ptr_central->stride_bit_inc_cr +
1637 480 : (input_picture_ptr_central->origin_x>>ss_x);
1638 :
1639 480 : *filtered_sse = 0;
1640 480 : *filtered_sse_uv = 0;
1641 :
1642 960 : for (blk_row = y_b64_start_idx; blk_row < y_b64_end_idx; blk_row++) {
1643 960 : for (blk_col = x_b64_start_idx; blk_col < x_b64_end_idx; blk_col++) {
1644 :
1645 480 : blk_y_src_offset = (blk_col * BW) + (blk_row * BH) * stride[C_Y];
1646 480 : blk_ch_src_offset = (blk_col * blk_width_ch) + (blk_row * blk_height_ch) * stride[C_U];
1647 :
1648 : // reset accumulator and count
1649 480 : memset(accumulator, 0, BLK_PELS * COLOR_CHANNELS * sizeof(accumulator[0]));
1650 480 : memset(counter, 0, BLK_PELS * COLOR_CHANNELS * sizeof(counter[0]));
1651 :
1652 : int blk_fw[N_16X16_BLOCKS];
1653 480 : int use_16x16_subblocks[N_32X32_BLOCKS] = {0};
1654 : int me_16x16_subblock_vf[N_16X16_BLOCKS];
1655 : int me_32x32_subblock_vf[N_32X32_BLOCKS];
1656 :
1657 480 : populate_list_with_value(blk_fw, 16, INIT_WEIGHT);
1658 :
1659 : // for every frame to filter
1660 3840 : for (frame_index = 0; frame_index < (picture_control_set_ptr_central->past_altref_nframes + picture_control_set_ptr_central->future_altref_nframes + 1); frame_index++) {
1661 :
1662 3360 : if(!is_highbd){
1663 3360 : src_center_ptr[C_Y] = src_center_ptr_start[C_Y] + blk_y_src_offset;
1664 3360 : src_center_ptr[C_U] = src_center_ptr_start[C_U] + blk_ch_src_offset;
1665 3360 : src_center_ptr[C_V] = src_center_ptr_start[C_V] + blk_ch_src_offset;
1666 : }else{
1667 0 : altref_buffer_highbd_ptr[C_Y] = altref_buffer_highbd_start[C_Y] + blk_y_src_offset;
1668 0 : altref_buffer_highbd_ptr[C_U] = altref_buffer_highbd_start[C_U] + blk_ch_src_offset;
1669 0 : altref_buffer_highbd_ptr[C_V] = altref_buffer_highbd_start[C_V] + blk_ch_src_offset;
1670 : }
1671 :
1672 : // ------------
1673 : // Step 1: motion estimation + compensation
1674 : // ------------
1675 :
1676 : // if frame to process is the center frame
1677 3360 : if (frame_index == index_center) {
1678 : // skip MC (central frame)
1679 :
1680 480 : populate_list_with_value(blk_fw, N_16X16_BLOCKS, 2);
1681 480 : populate_list_with_value(use_16x16_subblocks, N_32X32_BLOCKS, 0);
1682 :
1683 480 : if(!is_highbd){
1684 480 : pic_copy_kernel_8bit(src_center_ptr[C_Y], stride[C_Y], pred[C_Y], stride_pred[C_Y], BW, BH);
1685 480 : pic_copy_kernel_8bit(src_center_ptr[C_U], stride[C_U], pred[C_U], stride_pred[C_U], blk_width_ch, blk_height_ch);
1686 480 : pic_copy_kernel_8bit(src_center_ptr[C_V], stride[C_V], pred[C_V], stride_pred[C_V], blk_width_ch, blk_height_ch);
1687 : }else{
1688 0 : pic_copy_kernel_16bit(altref_buffer_highbd_ptr[C_Y], stride[C_Y], pred_16bit[C_Y], stride_pred[C_Y], BW, BH);
1689 0 : pic_copy_kernel_16bit(altref_buffer_highbd_ptr[C_U], stride[C_U], pred_16bit[C_U], stride_pred[C_U], blk_width_ch, blk_height_ch);
1690 0 : pic_copy_kernel_16bit(altref_buffer_highbd_ptr[C_V], stride[C_V], pred_16bit[C_V], stride_pred[C_V], blk_width_ch, blk_height_ch);
1691 : }
1692 :
1693 : }else{
1694 : // Initialize ME context
1695 2880 : create_ME_context_and_picture_control(me_context_ptr,
1696 2880 : list_picture_control_set_ptr[frame_index],
1697 2880 : list_picture_control_set_ptr[index_center],
1698 : input_picture_ptr_central,
1699 : blk_row,
1700 : blk_col,
1701 : ss_x,
1702 : ss_y);
1703 :
1704 : // Perform ME - context_ptr will store the outputs (MVs, buffers, etc)
1705 : // Block-based MC using open-loop HME + refinement
1706 2880 : motion_estimate_lcu( picture_control_set_ptr_central, // source picture control set -> references come from here
1707 2880 : (uint32_t)blk_row*blk_cols + blk_col,
1708 : (uint32_t)blk_col*BW, // x block
1709 : (uint32_t)blk_row*BH, // y block
1710 : context_ptr,
1711 : input_picture_ptr_central); // source picture
1712 :
1713 2880 : EbBool use_16x16_subblocks_only = EB_TRUE; // TODO: hardcoded to use 16x16 subblocks only, however,
1714 : // the support for the use of 32x32 subblocks as well is almost complete
1715 : // experiments have shown low gains by adding this possibility
1716 2880 : populate_list_with_value(use_16x16_subblocks,N_32X32_BLOCKS,1);
1717 :
1718 : // Perform MC using the information acquired using the ME step
1719 2880 : tf_inter_prediction(picture_control_set_ptr_central,
1720 : context_ptr,
1721 2880 : list_input_picture_ptr[frame_index],
1722 : pred,
1723 : pred_16bit,
1724 : stride_pred,
1725 : src_center_ptr,
1726 : altref_buffer_highbd_ptr,
1727 : stride,
1728 : (uint32_t)blk_col*BW,
1729 : (uint32_t)blk_row*BH,
1730 : ss_x,
1731 : ss_y,
1732 : use_16x16_subblocks,
1733 : encoder_bit_depth);
1734 :
1735 : // Retrieve distortion (variance) on 32x32 and 16x16 sub-blocks
1736 2880 : if(!is_highbd)
1737 2880 : get_ME_distortion(me_32x32_subblock_vf,
1738 : me_16x16_subblock_vf,
1739 : pred[C_Y],
1740 2880 : stride_pred[C_Y],
1741 : src_center_ptr[C_Y],
1742 2880 : stride[C_Y]);
1743 : else
1744 0 : get_ME_distortion_highbd(me_32x32_subblock_vf,
1745 : me_16x16_subblock_vf,
1746 : pred_16bit[C_Y],
1747 0 : stride_pred[C_Y],
1748 : altref_buffer_highbd_ptr[C_Y],
1749 0 : stride[C_Y]);
1750 :
1751 : // Get sub-block filter weights depending on the variance
1752 2880 : get_blk_fw_using_dist(me_32x32_subblock_vf,
1753 : me_16x16_subblock_vf,
1754 : use_16x16_subblocks_only,
1755 : blk_fw,
1756 : is_highbd);
1757 : }
1758 :
1759 : // ------------
1760 : // Step 2: temporal filtering using the motion compensated blocks
1761 : // ------------
1762 :
1763 : // if frame to process is the center frame
1764 3360 : if (frame_index == index_center) {
1765 480 : if(!is_highbd)
1766 480 : apply_filtering_central(pred,
1767 : accum,
1768 : count,
1769 : BW,
1770 : BH,
1771 : ss_x,
1772 : ss_y);
1773 : else
1774 0 : apply_filtering_central_highbd(pred_16bit,
1775 : accum,
1776 : count,
1777 : BW,
1778 : BH,
1779 : ss_x,
1780 : ss_y);
1781 : }else{
1782 : // split filtering function into 32x32 blocks
1783 : // TODO: implement a 64x64 SIMD version
1784 8640 : for(int block_row = 0; block_row<2; block_row++){
1785 17280 : for(int block_col = 0; block_col<2; block_col++) {
1786 11520 : apply_filtering_block(block_row,
1787 : block_col,
1788 : src_center_ptr,
1789 : altref_buffer_highbd_ptr,
1790 : pred,
1791 : pred_16bit,
1792 : accum,
1793 : count,
1794 : stride,
1795 : stride_pred,
1796 : BW >> 1, // fixed 32x32
1797 : BH >> 1, // fixed 32x32
1798 : ss_x, // chroma sub-sampling in x
1799 : ss_y, // chroma sub-sampling in y
1800 : altref_strength,
1801 : blk_fw,
1802 : is_highbd);
1803 : }
1804 : }
1805 : }
1806 : }
1807 :
1808 : // Normalize filter output to produce temporally filtered frame
1809 480 : get_final_filtered_pixels(src_center_ptr_start,
1810 : altref_buffer_highbd_start,
1811 : accum,
1812 : count,
1813 : stride,
1814 : blk_y_src_offset,
1815 : blk_ch_src_offset,
1816 : blk_width_ch,
1817 : blk_height_ch,
1818 : filtered_sse,
1819 : filtered_sse_uv,
1820 : is_highbd);
1821 : }
1822 : }
1823 :
1824 480 : if(!is_highbd)
1825 480 : EB_FREE_ALIGNED_ARRAY(predictor);
1826 : else
1827 0 : EB_FREE_ALIGNED_ARRAY(predictor_16bit);
1828 :
1829 480 : return EB_ErrorNone;
1830 : }
1831 :
1832 : // This is an adaptation of the mehtod in the following paper:
1833 : // Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
1834 : // estimation using Laplacian operator and adaptive edge detection,"
1835 : // Proc. 3rd International Symposium on Communications, Control and
1836 : // Signal Processing, 2008, St Julians, Malta.
1837 : // Return noise estimate, or -1.0 if there was a failure
1838 : // function from libaom
1839 : // Standard bit depht input (=8 bits) to estimate the noise, I don't think there needs to be two methods for this
1840 : // Operates on the Y component only
1841 8 : static double estimate_noise(const uint8_t *src,
1842 : uint16_t width,
1843 : uint16_t height,
1844 : uint16_t stride_y) {
1845 8 : int64_t sum = 0;
1846 8 : int64_t num = 0;
1847 :
1848 2872 : for (int i = 1; i < height - 1; ++i) {
1849 1830100 : for (int j = 1; j < width - 1; ++j) {
1850 1827230 : const int k = i * stride_y + j;
1851 : // Sobel gradients
1852 1827230 : const int Gx = (src[k - stride_y - 1] - src[k - stride_y + 1]) +
1853 1827230 : (src[k + stride_y - 1] - src[k + stride_y + 1]) +
1854 1827230 : 2 * (src[k - 1] - src[k + 1]);
1855 1827230 : const int Gy = (src[k - stride_y - 1] - src[k + stride_y - 1]) +
1856 1827230 : (src[k - stride_y + 1] - src[k + stride_y + 1]) +
1857 1827230 : 2 * (src[k - stride_y] - src[k + stride_y]);
1858 1827230 : const int Ga = abs(Gx) + abs(Gy);
1859 1827230 : if (Ga < EDGE_THRESHOLD) { // Do not consider edge pixels to estimate the noise
1860 : // Find Laplacian
1861 1284960 : const int v =
1862 1284960 : 4 * src[k] -
1863 1284960 : 2 * (src[k - 1] + src[k + 1] + src[k - stride_y] + src[k + stride_y]) +
1864 1284960 : (src[k - stride_y - 1] + src[k - stride_y + 1] + src[k + stride_y - 1] +
1865 1284960 : src[k + stride_y + 1]);
1866 1284960 : sum += abs(v);
1867 1284960 : ++num;
1868 : }
1869 : }
1870 : }
1871 : // If very few smooth pels, return -1 since the estimate is unreliable
1872 8 : if (num < SMOOTH_THRESHOLD)
1873 0 : return -1.0;
1874 :
1875 8 : const double sigma = (double)sum / (6 * num) * SQRT_PI_BY_2;
1876 :
1877 8 : return sigma;
1878 : }
1879 :
1880 : // Noise estimation for highbd
1881 0 : static double estimate_noise_highbd(const uint16_t *src,
1882 : int width,
1883 : int height,
1884 : int stride,
1885 : int bd) {
1886 0 : int64_t sum = 0;
1887 0 : int64_t num = 0;
1888 :
1889 0 : for (int i = 1; i < height - 1; ++i) {
1890 0 : for (int j = 1; j < width - 1; ++j) {
1891 0 : const int k = i * stride + j;
1892 : // Sobel gradients
1893 0 : const int Gx = (src[k - stride - 1] - src[k - stride + 1]) +
1894 0 : (src[k + stride - 1] - src[k + stride + 1]) +
1895 0 : 2 * (src[k - 1] - src[k + 1]);
1896 0 : const int Gy = (src[k - stride - 1] - src[k + stride - 1]) +
1897 0 : (src[k - stride + 1] - src[k + stride + 1]) +
1898 0 : 2 * (src[k - stride] - src[k + stride]);
1899 0 : const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bd - 8); // divide by 2^2 and round up
1900 0 : if (Ga < EDGE_THRESHOLD) { // Do not consider edge pixels to estimate the noise
1901 : // Find Laplacian
1902 0 : const int v =
1903 0 : 4 * src[k] -
1904 0 : 2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) +
1905 0 : (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] +
1906 0 : src[k + stride + 1]);
1907 0 : sum += ROUND_POWER_OF_TWO(abs(v), bd - 8);
1908 0 : ++num;
1909 : }
1910 : }
1911 : }
1912 : // If very few smooth pels, return -1 since the estimate is unreliable
1913 0 : if (num < SMOOTH_THRESHOLD) return -1.0;
1914 :
1915 0 : const double sigma = (double)sum / (6 * num) * SQRT_PI_BY_2;
1916 0 : return sigma;
1917 : }
1918 :
1919 : // Adjust filtering parameters: strength and nframes
1920 8 : static void adjust_filter_strength(
1921 : #if TWO_PASS
1922 : PictureParentControlSet *picture_control_set_ptr_central,
1923 : #endif
1924 : double noise_level,
1925 : uint8_t *altref_strength,
1926 : EbBool is_highbd,
1927 : uint32_t encoder_bit_depth) {
1928 :
1929 8 : int strength = *altref_strength, adj_strength=strength;
1930 :
1931 : // Adjust the strength of the temporal filtering
1932 : // based on the amount of noise present in the frame
1933 : // adjustment in the integer range [-2, 1]
1934 : // if noiselevel < 0, it means that the estimation was
1935 : // unsuccessful and therefore keep the strength as it was set
1936 8 : if (noise_level > 0) {
1937 : int noiselevel_adj;
1938 8 : if (noise_level < 0.75)
1939 8 : noiselevel_adj = -2;
1940 0 : else if (noise_level < 1.75)
1941 0 : noiselevel_adj = -1;
1942 0 : else if (noise_level < 4.0)
1943 0 : noiselevel_adj = 0;
1944 : else
1945 0 : noiselevel_adj = 1;
1946 : #if TWO_PASS
1947 8 : if (picture_control_set_ptr_central->sequence_control_set_ptr->use_input_stat_file &&
1948 0 : picture_control_set_ptr_central->temporal_layer_index == 0 && picture_control_set_ptr_central->sc_content_detected == 0) {
1949 0 : if (noiselevel_adj < 0) {
1950 0 : if ((picture_control_set_ptr_central->referenced_area_avg < 20 && picture_control_set_ptr_central->slice_type == 2) ||
1951 0 : (picture_control_set_ptr_central->referenced_area_avg < 30 && picture_control_set_ptr_central->slice_type != 2)) {
1952 0 : noiselevel_adj = CLIP3(-2, 0, noiselevel_adj - 1);
1953 : }
1954 : else
1955 0 : noiselevel_adj = 0;
1956 : }
1957 : }
1958 : #endif
1959 8 : adj_strength += noiselevel_adj;
1960 : }
1961 :
1962 8 : if(adj_strength > 0)
1963 8 : strength = adj_strength;
1964 : else
1965 0 : strength = 0;
1966 :
1967 : // if highbd, adjust filter strength strength = strength + 2*(bit depth - 8)
1968 8 : if(is_highbd)
1969 0 : strength = strength + 2 * (encoder_bit_depth - 8);
1970 :
1971 : #if DEBUG_TF
1972 : printf("[DEBUG] noise level: %g, strength = %d, adj_strength = %d\n", noise_level, *altref_strength, strength);
1973 : #endif
1974 :
1975 8 : *altref_strength = (uint8_t)strength;
1976 :
1977 : // TODO: apply further refinements to the filter parameters according to 1st pass statistics
1978 :
1979 8 : }
1980 :
1981 8 : static void pad_and_decimate_filtered_pic(PictureParentControlSet *picture_control_set_ptr_central){
1982 : // reference structures (padded pictures + downsampled versions)
1983 8 : EbPaReferenceObject *src_object = (EbPaReferenceObject*)picture_control_set_ptr_central->pa_reference_picture_wrapper_ptr->object_ptr;
1984 8 : EbPictureBufferDesc *padded_pic_ptr = src_object->input_padded_picture_ptr;
1985 8 : generate_padding(
1986 : &(padded_pic_ptr->buffer_y[C_Y]),
1987 8 : padded_pic_ptr->stride_y,
1988 8 : padded_pic_ptr->width,
1989 8 : padded_pic_ptr->height,
1990 8 : padded_pic_ptr->origin_x,
1991 8 : padded_pic_ptr->origin_y);
1992 :
1993 : // 1/4 & 1/16 input picture decimation
1994 8 : DownsampleDecimationInputPicture(
1995 : picture_control_set_ptr_central,
1996 : padded_pic_ptr,
1997 : src_object->quarter_decimated_picture_ptr,
1998 : src_object->sixteenth_decimated_picture_ptr);
1999 :
2000 : // 1/4 & 1/16 input picture downsampling through filtering
2001 8 : SequenceControlSet *sequence_control_set_ptr = (SequenceControlSet*)picture_control_set_ptr_central->sequence_control_set_wrapper_ptr->object_ptr;
2002 8 : if (sequence_control_set_ptr->down_sampling_method_me_search == ME_FILTERED_DOWNSAMPLED)
2003 4 : DownsampleFilteringInputPicture(
2004 : picture_control_set_ptr_central,
2005 : padded_pic_ptr,
2006 : src_object->quarter_filtered_picture_ptr,
2007 : src_object->sixteenth_filtered_picture_ptr);
2008 8 : }
2009 :
2010 : // save original enchanced_picture_ptr buffer in a separate buffer (to be replaced by the temporally filtered pic)
2011 0 : static EbErrorType save_src_pic_buffers(PictureParentControlSet *picture_control_set_ptr_central,
2012 : uint32_t ss_y,
2013 : EbBool is_highbd){
2014 :
2015 : // allocate memory for the copy of the original enhanced buffer
2016 0 : EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_ptr[C_Y],
2017 : picture_control_set_ptr_central->enhanced_picture_ptr->luma_size);
2018 0 : EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_ptr[C_U],
2019 : picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
2020 0 : EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_ptr[C_V],
2021 : picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
2022 :
2023 : // if highbd, allocate memory for the copy of the original enhanced buffer - bit inc
2024 0 : if(is_highbd){
2025 0 : EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_Y],
2026 : picture_control_set_ptr_central->enhanced_picture_ptr->luma_size);
2027 0 : EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_U],
2028 : picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
2029 0 : EB_MALLOC_ARRAY(picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_V],
2030 : picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
2031 : }
2032 :
2033 : // copy buffers
2034 : // Y
2035 0 : uint32_t height_y = (uint32_t)(picture_control_set_ptr_central->enhanced_picture_ptr->height +
2036 0 : picture_control_set_ptr_central->enhanced_picture_ptr->origin_y + picture_control_set_ptr_central->enhanced_picture_ptr->origin_bot_y);
2037 0 : uint32_t height_uv = (uint32_t)((picture_control_set_ptr_central->enhanced_picture_ptr->height +
2038 0 : picture_control_set_ptr_central->enhanced_picture_ptr->origin_y + picture_control_set_ptr_central->enhanced_picture_ptr->origin_bot_y) >> ss_y);
2039 :
2040 0 : assert(height_y * picture_control_set_ptr_central->enhanced_picture_ptr->stride_y == picture_control_set_ptr_central->enhanced_picture_ptr->luma_size);
2041 0 : assert(height_uv * picture_control_set_ptr_central->enhanced_picture_ptr->stride_cb == picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
2042 0 : assert(height_uv * picture_control_set_ptr_central->enhanced_picture_ptr->stride_cr == picture_control_set_ptr_central->enhanced_picture_ptr->chroma_size);
2043 :
2044 0 : pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_y,
2045 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_y,
2046 : picture_control_set_ptr_central->save_enhanced_picture_ptr[C_Y],
2047 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_y,
2048 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_y,
2049 : height_y);
2050 :
2051 0 : pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_cb,
2052 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_cb,
2053 : picture_control_set_ptr_central->save_enhanced_picture_ptr[C_U],
2054 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_cb,
2055 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_cb,
2056 : height_uv);
2057 :
2058 0 : pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_cr,
2059 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_cr,
2060 : picture_control_set_ptr_central->save_enhanced_picture_ptr[C_V],
2061 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_cr,
2062 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_cr,
2063 : height_uv);
2064 :
2065 0 : if(is_highbd){
2066 : // if highbd, copy bit inc buffers
2067 : // Y
2068 0 : pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_bit_inc_y,
2069 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_y,
2070 : picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_Y],
2071 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_y,
2072 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_y,
2073 : height_y);
2074 : // U
2075 0 : pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_bit_inc_cb,
2076 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cb,
2077 : picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_U],
2078 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cb,
2079 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cb,
2080 : height_uv);
2081 : // V
2082 0 : pic_copy_kernel_8bit(picture_control_set_ptr_central->enhanced_picture_ptr->buffer_bit_inc_cr,
2083 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cr,
2084 : picture_control_set_ptr_central->save_enhanced_picture_bit_inc_ptr[C_V],
2085 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cr,
2086 0 : picture_control_set_ptr_central->enhanced_picture_ptr->stride_bit_inc_cr,
2087 : height_uv);
2088 : }
2089 :
2090 0 : return EB_ErrorNone;
2091 :
2092 : }
2093 :
2094 477 : EbErrorType svt_av1_init_temporal_filtering(PictureParentControlSet **list_picture_control_set_ptr,
2095 : PictureParentControlSet *picture_control_set_ptr_central,
2096 : MotionEstimationContext_t *me_context_ptr,
2097 : int32_t segment_index) {
2098 : uint8_t *altref_strength_ptr, index_center;
2099 : EbPictureBufferDesc *central_picture_ptr;
2100 :
2101 477 : altref_strength_ptr = &(picture_control_set_ptr_central->altref_strength);
2102 :
2103 : // index of the central source frame
2104 477 : index_center = picture_control_set_ptr_central->past_altref_nframes;
2105 :
2106 : // if this assertion does not fail (as I think it should not, then remove picture_control_set_ptr_central from the input parameters of init_temporal_filtering())
2107 477 : assert(list_picture_control_set_ptr[index_center] == picture_control_set_ptr_central);
2108 :
2109 : // source central frame picture buffer
2110 477 : central_picture_ptr = picture_control_set_ptr_central->enhanced_picture_ptr;
2111 :
2112 477 : uint32_t encoder_bit_depth = picture_control_set_ptr_central->sequence_control_set_ptr->static_config.encoder_bit_depth;
2113 477 : EbBool is_highbd = (encoder_bit_depth == 8) ? (uint8_t)EB_FALSE : (uint8_t)EB_TRUE;
2114 :
2115 : // chroma subsampling
2116 477 : uint32_t ss_x = picture_control_set_ptr_central->sequence_control_set_ptr->subsampling_x;
2117 477 : uint32_t ss_y = picture_control_set_ptr_central->sequence_control_set_ptr->subsampling_y;
2118 :
2119 : //only one performs any picture based prep
2120 477 : eb_block_on_mutex(picture_control_set_ptr_central->temp_filt_mutex);
2121 480 : if (picture_control_set_ptr_central->temp_filt_prep_done == 0){
2122 :
2123 8 : picture_control_set_ptr_central->temp_filt_prep_done = 1;
2124 :
2125 : // allocate 16 bit buffer
2126 8 : if (is_highbd) {
2127 0 : EB_MALLOC_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_Y], central_picture_ptr->luma_size);
2128 0 : EB_MALLOC_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_U], central_picture_ptr->chroma_size);
2129 0 : EB_MALLOC_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_V], central_picture_ptr->chroma_size);
2130 :
2131 : // pack byte buffers to 16 bit buffer
2132 0 : pack_highbd_pic(central_picture_ptr, picture_control_set_ptr_central->altref_buffer_highbd, ss_x, ss_y, EB_TRUE);
2133 : }
2134 :
2135 : // Estimate source noise level
2136 : double noise_level;
2137 8 : if(is_highbd){
2138 0 : noise_level = estimate_noise_highbd(picture_control_set_ptr_central->altref_buffer_highbd[C_Y], // Y only
2139 0 : central_picture_ptr->width,
2140 0 : central_picture_ptr->height,
2141 0 : central_picture_ptr->stride_y,
2142 : encoder_bit_depth);
2143 : }
2144 : else{
2145 8 : EbByte buffer_y = central_picture_ptr->buffer_y + central_picture_ptr->origin_y*central_picture_ptr->stride_y + central_picture_ptr->origin_x;
2146 8 : noise_level = estimate_noise(buffer_y, // Y only
2147 8 : central_picture_ptr->width,
2148 8 : central_picture_ptr->height,
2149 8 : central_picture_ptr->stride_y);
2150 : }
2151 :
2152 : // adjust filter parameter based on the estimated noise of the picture
2153 : #if TWO_PASS
2154 8 : adjust_filter_strength( picture_control_set_ptr_central,
2155 : noise_level,
2156 : altref_strength_ptr,
2157 : is_highbd,
2158 : encoder_bit_depth);
2159 : #else
2160 : adjust_filter_strength(noise_level, altref_strength_ptr, is_highbd, encoder_bit_depth);
2161 : #endif
2162 :
2163 : // Pad chroma reference samples - once only per picture
2164 64 : for (int i = 0; i < (picture_control_set_ptr_central->past_altref_nframes + picture_control_set_ptr_central->future_altref_nframes + 1); i++) {
2165 56 : EbPictureBufferDesc *pic_ptr_ref = list_picture_control_set_ptr[i]->enhanced_picture_ptr;
2166 : #if FIX_ALTREF
2167 56 : if (i != picture_control_set_ptr_central->past_altref_nframes)
2168 : #endif
2169 48 : generate_padding_pic(pic_ptr_ref,
2170 : ss_x,
2171 : ss_y,
2172 : is_highbd);
2173 : }
2174 :
2175 8 : picture_control_set_ptr_central->temporal_filtering_on = EB_TRUE; // set temporal filtering flag ON for current picture
2176 :
2177 : // save original source picture (to be replaced by the temporally filtered pic)
2178 : // if stat_report is enabled for PSNR computation
2179 8 : if(picture_control_set_ptr_central->sequence_control_set_ptr->static_config.stat_report){
2180 0 : save_src_pic_buffers(picture_control_set_ptr_central,
2181 : ss_y,
2182 : is_highbd);
2183 : }
2184 :
2185 : }
2186 480 : eb_release_mutex(picture_control_set_ptr_central->temp_filt_mutex);
2187 :
2188 : // populate source frames picture buffer list
2189 480 : EbPictureBufferDesc *list_input_picture_ptr[ALTREF_MAX_NFRAMES] = { NULL };
2190 3840 : for (int i = 0; i < (picture_control_set_ptr_central->past_altref_nframes + picture_control_set_ptr_central->future_altref_nframes + 1); i++)
2191 3360 : list_input_picture_ptr[i] = list_picture_control_set_ptr[i]->enhanced_picture_ptr;
2192 :
2193 : uint64_t filtered_sse, filtered_sse_uv;
2194 :
2195 480 : produce_temporally_filtered_pic(list_picture_control_set_ptr,
2196 : list_input_picture_ptr,
2197 480 : *altref_strength_ptr,
2198 : index_center,
2199 : &filtered_sse,
2200 : &filtered_sse_uv,
2201 : me_context_ptr,
2202 : segment_index,
2203 : is_highbd);
2204 :
2205 480 : eb_block_on_mutex(picture_control_set_ptr_central->temp_filt_mutex);
2206 480 : picture_control_set_ptr_central->temp_filt_seg_acc++;
2207 :
2208 480 : if(!is_highbd){
2209 480 : picture_control_set_ptr_central->filtered_sse += filtered_sse;
2210 480 : picture_control_set_ptr_central->filtered_sse_uv += filtered_sse_uv;
2211 : }else{
2212 0 : picture_control_set_ptr_central->filtered_sse += filtered_sse >> 4;
2213 0 : picture_control_set_ptr_central->filtered_sse_uv += filtered_sse_uv >> 4;
2214 : }
2215 :
2216 480 : if (picture_control_set_ptr_central->temp_filt_seg_acc == picture_control_set_ptr_central->tf_segments_total_count){
2217 :
2218 : #if DEBUG_TF
2219 : if(!is_highbd)
2220 : save_YUV_to_file("filtered_picture.yuv",
2221 : central_picture_ptr->buffer_y,
2222 : central_picture_ptr->buffer_cb,
2223 : central_picture_ptr->buffer_cr,
2224 : central_picture_ptr->width,
2225 : central_picture_ptr->height,
2226 : central_picture_ptr->stride_y,
2227 : central_picture_ptr->stride_cb,
2228 : central_picture_ptr->stride_cr,
2229 : central_picture_ptr->origin_y,
2230 : central_picture_ptr->origin_x,
2231 : ss_x,
2232 : ss_y);
2233 : else
2234 : save_YUV_to_file_highbd("filtered_picture.yuv",
2235 : picture_control_set_ptr_central->altref_buffer_highbd[C_Y],
2236 : picture_control_set_ptr_central->altref_buffer_highbd[C_U],
2237 : picture_control_set_ptr_central->altref_buffer_highbd[C_V],
2238 : central_picture_ptr->width,
2239 : central_picture_ptr->height,
2240 : central_picture_ptr->stride_y,
2241 : central_picture_ptr->stride_cb,
2242 : central_picture_ptr->stride_cb,
2243 : central_picture_ptr->origin_y,
2244 : central_picture_ptr->origin_x,
2245 : ss_x,
2246 : ss_y);
2247 : #endif
2248 :
2249 8 : if(is_highbd) {
2250 0 : unpack_highbd_pic(picture_control_set_ptr_central->altref_buffer_highbd,
2251 : central_picture_ptr,
2252 : ss_x,
2253 : ss_y,
2254 : EB_TRUE);
2255 :
2256 0 : EB_FREE_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_Y]);
2257 0 : EB_FREE_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_U]);
2258 0 : EB_FREE_ARRAY(picture_control_set_ptr_central->altref_buffer_highbd[C_V]);
2259 : }
2260 :
2261 : // padding + decimation: even if highbd src, this is only performed on the 8 bit buffer (excluding the LSBs)
2262 8 : pad_and_decimate_filtered_pic(picture_control_set_ptr_central);
2263 :
2264 : // Normalize the filtered SSE. Add 8 bit precision.
2265 8 : picture_control_set_ptr_central->filtered_sse = (picture_control_set_ptr_central->filtered_sse << 8) / central_picture_ptr->width / central_picture_ptr->height;
2266 8 : picture_control_set_ptr_central->filtered_sse_uv = ((picture_control_set_ptr_central->filtered_sse_uv << 8) / (central_picture_ptr->width >> ss_x) / (central_picture_ptr->height >> ss_y)) / 2;
2267 :
2268 : // signal that temp filt is done
2269 8 : eb_post_semaphore(picture_control_set_ptr_central->temp_filt_done_semaphore);
2270 : }
2271 :
2272 480 : eb_release_mutex(picture_control_set_ptr_central->temp_filt_mutex);
2273 :
2274 480 : return EB_ErrorNone;
2275 :
2276 : }
|