LCOV - coverage.info - Codec/EbDeblockingFilter.c

LCOV - code coverage report

Current view:	top level - Codec - EbDeblockingFilter.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	495	949	52.2 %
Date:	2019-11-25 17:38:06	Functions:	19	53	35.8 %

          Line data    Source code

       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX-License-Identifier: BSD-2-Clause-Patent
       4             : */
       5             : 
       6             : /*
       7             : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       8             : *
       9             : * This source code is subject to the terms of the BSD 2 Clause License and
      10             : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      11             : * was not distributed with this source code in the LICENSE file, you can
      12             : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      13             : * Media Patent License 1.0 was not distributed with this source code in the
      14             : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      15             : */
      16             : 
      17             : #include <string.h>
      18             : 
      19             : #include "EbDefinitions.h"
      20             : #include "EbUtility.h"
      21             : #include "EbPictureControlSet.h"
      22             : #include "EbCodingUnit.h"
      23             : #include "EbSequenceControlSet.h"
      24             : #include "EbReferenceObject.h"
      25             : #include "EbDeblockingFilter.h"
      26             : 
      27             : #include "EbCommonUtils.h"
      28             : 
      29             : #define   convertToChromaQp(iQpY)  ( ((iQpY) < 0) ? (iQpY) : (((iQpY) > 57) ? ((iQpY)-6) : (int32_t)(map_chroma_qp((uint32_t)iQpY))) )
      30             : 
      31             : static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 },
      32             :                                                       { 2, 2 },
      33             :                                                       { 3, 3 } };
      34             : 
      35             : static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {
      36             :   { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
      37             :   { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U },
      38             :   { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
      39             : };
      40             : 
      41             : /** set_qp_array_based_on_cu()
      42             : sets the QP in qp_array for every minimum-size block covered by a CU.
      43             : */
      44           0 : void set_qp_array_based_on_cu(
      45             :     PictureControlSet *picture_control_set_ptr,          //input parameter
      46             :     uint32_t               cuPos_x,                       //input parameter, sample-based horizontal picture-wise location of the CU
      47             :     uint32_t               cuPos_y,                       //input parameter, sample-based vertical picture-wise location of the CU
      48             :     uint32_t               cu_size_in_min_cu_size,             //input parameter
      49             :     uint32_t               cu_qp)                          //input parameter, Qp of the CU
      50             : {
      51             :     uint32_t verticalIdx;
      52           0 :     uint32_t qpArrayIdx = (cuPos_y / MIN_BLOCK_SIZE) * picture_control_set_ptr->qp_array_stride + (cuPos_x / MIN_BLOCK_SIZE);
      53             : 
      54           0 :     for (verticalIdx = 0; verticalIdx < cu_size_in_min_cu_size; ++verticalIdx) {
      55           0 :         EB_MEMSET(picture_control_set_ptr->qp_array + qpArrayIdx + verticalIdx * picture_control_set_ptr->qp_array_stride,
      56             :             cu_qp, sizeof(uint8_t)*cu_size_in_min_cu_size);
      57             :     }
      58             : 
      59           0 :     return;
      60             : }
      61             : 
      62           0 : static INLINE int8_t signed_char_clamp(int32_t t) {
      63           0 :     return (int8_t)clamp(t, -128, 127);
      64             : }
      65             : 
      66           0 : static INLINE int16_t signed_char_clamp_high(int32_t t, int32_t bd) {
      67           0 :     switch (bd) {
      68           0 :     case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
      69           0 :     case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
      70           0 :     case 8:
      71           0 :     default: return (int16_t)clamp(t, -128, 128 - 1);
      72             :     }
      73             : }
      74             : 
      75             : // should we apply any filter at all: 11111111 yes, 00000000 no
      76           0 : static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
      77             :     uint8_t p0, uint8_t q0, uint8_t q1) {
      78           0 :     int8_t mask = 0;
      79           0 :     mask |= (abs(p1 - p0) > limit) * -1;
      80           0 :     mask |= (abs(q1 - q0) > limit) * -1;
      81           0 :     mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
      82           0 :     return ~mask;
      83             : }
      84             : 
      85           0 : static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
      86             :     uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
      87             :     uint8_t q1, uint8_t q2, uint8_t q3) {
      88           0 :     int8_t mask = 0;
      89           0 :     mask |= (abs(p3 - p2) > limit) * -1;
      90           0 :     mask |= (abs(p2 - p1) > limit) * -1;
      91           0 :     mask |= (abs(p1 - p0) > limit) * -1;
      92           0 :     mask |= (abs(q1 - q0) > limit) * -1;
      93           0 :     mask |= (abs(q2 - q1) > limit) * -1;
      94           0 :     mask |= (abs(q3 - q2) > limit) * -1;
      95           0 :     mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
      96           0 :     return ~mask;
      97             : }
      98             : 
      99           0 : static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
     100             :     uint8_t p2, uint8_t p1, uint8_t p0,
     101             :     uint8_t q0, uint8_t q1, uint8_t q2) {
     102           0 :     int8_t mask = 0;
     103           0 :     mask |= (abs(p2 - p1) > limit) * -1;
     104           0 :     mask |= (abs(p1 - p0) > limit) * -1;
     105           0 :     mask |= (abs(q1 - q0) > limit) * -1;
     106           0 :     mask |= (abs(q2 - q1) > limit) * -1;
     107           0 :     mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
     108           0 :     return ~mask;
     109             : }
     110             : 
     111           0 : static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
     112             :     uint8_t p0, uint8_t q0, uint8_t q1,
     113             :     uint8_t q2) {
     114           0 :     int8_t mask = 0;
     115           0 :     mask |= (abs(p1 - p0) > thresh) * -1;
     116           0 :     mask |= (abs(q1 - q0) > thresh) * -1;
     117           0 :     mask |= (abs(p2 - p0) > thresh) * -1;
     118           0 :     mask |= (abs(q2 - q0) > thresh) * -1;
     119           0 :     return ~mask;
     120             : }
     121             : 
     122           0 : static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
     123             :     uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
     124             :     uint8_t q2, uint8_t q3) {
     125           0 :     int8_t mask = 0;
     126           0 :     mask |= (abs(p1 - p0) > thresh) * -1;
     127           0 :     mask |= (abs(q1 - q0) > thresh) * -1;
     128           0 :     mask |= (abs(p2 - p0) > thresh) * -1;
     129           0 :     mask |= (abs(q2 - q0) > thresh) * -1;
     130           0 :     mask |= (abs(p3 - p0) > thresh) * -1;
     131           0 :     mask |= (abs(q3 - q0) > thresh) * -1;
     132           0 :     return ~mask;
     133             : }
     134             : 
     135             : // is there high edge variance (|p1 - p0| or |q1 - q0| above thresh): 11111111 yes, 00000000 no
     136           0 : static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
     137             :     uint8_t q0, uint8_t q1) {
     138           0 :     int8_t hev = 0;
     139           0 :     hev |= (abs(p1 - p0) > thresh) * -1;
     140           0 :     hev |= (abs(q1 - q0) > thresh) * -1;
     141           0 :     return hev;
     142             : }
     143             : 
     144           0 : static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
     145             :     uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
     146             :     int8_t filter1, filter2;
     147             : 
     148           0 :     const int8_t ps1 = (int8_t)*op1 ^ 0x80;
     149           0 :     const int8_t ps0 = (int8_t)*op0 ^ 0x80;
     150           0 :     const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
     151           0 :     const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
     152           0 :     const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
     153             : 
     154             :     // add outer taps if we have high edge variance
     155           0 :     int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
     156             : 
     157             :     // inner taps
     158           0 :     filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
     159             : 
     160             :     // save bottom 3 bits so that we round one side +4 and the other +3
     161             :     // if it equals 4 we'll set to adjust by -1 to account for the fact
     162             :     // we'd round 3 the other way
     163           0 :     filter1 = signed_char_clamp(filter + 4) >> 3;
     164           0 :     filter2 = signed_char_clamp(filter + 3) >> 3;
     165             : 
     166           0 :     *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
     167           0 :     *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
     168             : 
     169             :     // outer tap adjustments
     170           0 :     filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
     171             : 
     172           0 :     *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
     173           0 :     *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
     174           0 : }
     175             : 
     176           0 : void aom_lpf_horizontal_4_c(uint8_t *s, int32_t p /* pitch */,
     177             :     const uint8_t *blimit, const uint8_t *limit,
     178             :     const uint8_t *thresh) {
     179             :     int32_t i;
     180           0 :     int32_t count = 4;
     181             : 
     182             :     // loop filter designed to work using chars so that we can make maximum use
     183             :     // of 8 bit simd instructions.
     184           0 :     for (i = 0; i < count; ++i) {
     185           0 :         const uint8_t p1 = s[-2 * p], p0 = s[-p];
     186           0 :         const uint8_t q0 = s[0 * p], q1 = s[1 * p];
     187           0 :         const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
     188           0 :         filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
     189           0 :         ++s;
     190             :     }
     191           0 : }
     192             : 
     193           0 : void aom_lpf_vertical_4_c(uint8_t *s, int32_t pitch, const uint8_t *blimit,
     194             :     const uint8_t *limit, const uint8_t *thresh) {
     195             :     int32_t i;
     196           0 :     int32_t count = 4;
     197             : 
     198             :     // loop filter designed to work using chars so that we can make maximum use
     199             :     // of 8 bit simd instructions.
     200           0 :     for (i = 0; i < count; ++i) {
     201           0 :         const uint8_t p1 = s[-2], p0 = s[-1];
     202           0 :         const uint8_t q0 = s[0], q1 = s[1];
     203           0 :         const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
     204           0 :         filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
     205           0 :         s += pitch;
     206             :     }
     207           0 : }
     208             : 
     209           0 : void aom_lpf_vertical_4_dual_c(uint8_t *s, int32_t pitch, const uint8_t *blimit0,
     210             :     const uint8_t *limit0, const uint8_t *thresh0,
     211             :     const uint8_t *blimit1, const uint8_t *limit1,
     212             :     const uint8_t *thresh1) {
     213           0 :     aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
     214           0 :     aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
     215           0 : }
     216             : 
     217           0 : static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
     218             :     uint8_t *op2, uint8_t *op1, uint8_t *op0,
     219             :     uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
     220           0 :     if (flat && mask) {
     221           0 :         const uint8_t p2 = *op2, p1 = *op1, p0 = *op0;
     222           0 :         const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
     223             : 
     224             :         // 5-tap filter [1, 2, 2, 2, 1]
     225           0 :         *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
     226           0 :         *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
     227           0 :         *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
     228           0 :         *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
     229             :     }
     230             :     else
     231           0 :         filter4(mask, thresh, op1, op0, oq0, oq1);
     232           0 : }
     233             : 
     234           0 : static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
     235             :     uint8_t *op3, uint8_t *op2, uint8_t *op1,
     236             :     uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
     237             :     uint8_t *oq2, uint8_t *oq3) {
     238           0 :     if (flat && mask) {
     239           0 :         const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
     240           0 :         const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
     241             : 
     242             :         // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
     243           0 :         *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
     244           0 :         *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
     245           0 :         *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
     246           0 :         *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
     247           0 :         *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
     248           0 :         *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
     249             :     }
     250             :     else
     251           0 :         filter4(mask, thresh, op1, op0, oq0, oq1);
     252           0 : }
     253             : 
     254           0 : void aom_lpf_horizontal_6_c(uint8_t *s, int32_t p, const uint8_t *blimit,
     255             :     const uint8_t *limit, const uint8_t *thresh) {
     256             :     int32_t i;
     257           0 :     int32_t count = 4;
     258             : 
     259             :     // loop filter designed to work using chars so that we can make maximum use
     260             :     // of 8 bit simd instructions.
     261           0 :     for (i = 0; i < count; ++i) {
     262           0 :         const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     263           0 :         const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
     264             : 
     265             :         const int8_t mask =
     266           0 :             filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
     267           0 :         const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
     268           0 :         filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
     269           0 :             s + 2 * p);
     270           0 :         ++s;
     271             :     }
     272           0 : }
     273             : 
     274           0 : void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
     275             :     const uint8_t *limit, const uint8_t *thresh)
     276             : {
     277             :     int i;
     278           0 :     int count = 4;
     279             : 
     280           0 :     for (i = 0; i < count; ++i) {
     281           0 :         const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
     282           0 :         const uint8_t q0 = s[0], q1 = s[1], q2 = s[2];
     283             :         const int8_t mask =
     284           0 :             filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
     285           0 :         const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
     286           0 :         filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
     287           0 :         s += pitch;
     288             :     }
     289           0 : }
     290             : 
     291           0 : void aom_lpf_horizontal_8_c(uint8_t *s, int32_t p, const uint8_t *blimit,
     292             :     const uint8_t *limit, const uint8_t *thresh) {
     293             :     int32_t i;
     294           0 :     int32_t count = 4;
     295             : 
     296             :     // loop filter designed to work using chars so that we can make maximum use
     297             :     // of 8 bit simd instructions.
     298           0 :     for (i = 0; i < count; ++i) {
     299           0 :         const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     300           0 :         const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
     301             : 
     302             :         const int8_t mask =
     303           0 :             filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     304           0 :         const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     305           0 :         filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
     306           0 :             s + 1 * p, s + 2 * p, s + 3 * p);
     307           0 :         ++s;
     308             :     }
     309           0 : }
     310             : 
     311           0 : void aom_lpf_horizontal_8_dual_c(uint8_t *s, int32_t p, const uint8_t *blimit0,
     312             :     const uint8_t *limit0, const uint8_t *thresh0,
     313             :     const uint8_t *blimit1, const uint8_t *limit1,
     314             :     const uint8_t *thresh1) {
     315           0 :     aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
     316           0 :     aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1);
     317           0 : }
     318             : 
     319           0 : void aom_lpf_vertical_8_c(uint8_t *s, int32_t pitch, const uint8_t *blimit,
     320             :     const uint8_t *limit, const uint8_t *thresh) {
     321             :     int32_t i;
     322           0 :     int32_t count = 4;
     323             : 
     324           0 :     for (i = 0; i < count; ++i) {
     325           0 :         const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     326           0 :         const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     327             :         const int8_t mask =
     328           0 :             filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     329           0 :         const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     330           0 :         filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
     331             :             s + 3);
     332           0 :         s += pitch;
     333             :     }
     334           0 : }
     335             : 
     336           0 : void aom_lpf_vertical_8_dual_c(uint8_t *s, int32_t pitch, const uint8_t *blimit0,
     337             :     const uint8_t *limit0, const uint8_t *thresh0,
     338             :     const uint8_t *blimit1, const uint8_t *limit1,
     339             :     const uint8_t *thresh1) {
     340           0 :     aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
     341           0 :     aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
     342           0 : }
     343             : 
     344           0 : static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
     345             :     int8_t flat2, uint8_t *op6, uint8_t *op5,
     346             :     uint8_t *op4, uint8_t *op3, uint8_t *op2,
     347             :     uint8_t *op1, uint8_t *op0, uint8_t *oq0,
     348             :     uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
     349             :     uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) {
     350           0 :     if (flat2 && flat && mask) {
     351           0 :         const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2,
     352           0 :             p1 = *op1, p0 = *op0;
     353           0 :         const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
     354           0 :             q5 = *oq5, q6 = *oq6;
     355             : 
     356             :         // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
     357           0 :         *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
     358             :             4);
     359           0 :         *op4 = ROUND_POWER_OF_TWO(
     360             :             p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
     361           0 :         *op3 = ROUND_POWER_OF_TWO(
     362             :             p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
     363           0 :         *op2 = ROUND_POWER_OF_TWO(
     364             :             p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
     365             :             4);
     366           0 :         *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
     367             :             q0 + q1 + q2 + q3 + q4,
     368             :             4);
     369           0 :         *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
     370             :             q0 * 2 + q1 + q2 + q3 + q4 + q5,
     371             :             4);
     372           0 :         *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
     373             :             q1 * 2 + q2 + q3 + q4 + q5 + q6,
     374             :             4);
     375           0 :         *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
     376             :             q2 * 2 + q3 + q4 + q5 + q6 * 2,
     377             :             4);
     378           0 :         *oq2 = ROUND_POWER_OF_TWO(
     379             :             p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
     380             :             4);
     381           0 :         *oq3 = ROUND_POWER_OF_TWO(
     382             :             p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
     383           0 :         *oq4 = ROUND_POWER_OF_TWO(
     384             :             p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
     385           0 :         *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
     386             :             4);
     387             :     }
     388             :     else
     389           0 :         filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
     390           0 : }
     391             : 
     392           0 : static void mb_lpf_vertical_edge_w(uint8_t *s, int32_t p, const uint8_t *blimit,
     393             :     const uint8_t *limit, const uint8_t *thresh,
     394             :     int32_t count) {
     395             :     int32_t i;
     396             : 
     397           0 :     for (i = 0; i < count; ++i) {
     398           0 :         const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3],
     399           0 :             p1 = s[-2], p0 = s[-1];
     400           0 :         const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4],
     401           0 :             q5 = s[5], q6 = s[6];
     402             :         const int8_t mask =
     403           0 :             filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     404           0 :         const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     405           0 :         const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
     406             : 
     407           0 :         filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
     408             :             s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
     409           0 :         s += p;
     410             :     }
     411           0 : }
     412             : 
     413           0 : void aom_lpf_vertical_14_dual_c(uint8_t *s, int32_t p, const uint8_t *blimit,
     414             :     const uint8_t *limit, const uint8_t *thresh) {
     415           0 :     mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
     416           0 : }
     417             : 
     418             : // Should we apply any filter at all: 11111111 yes, 00000000 no ?
     419           0 : static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
     420             :     uint16_t p1, uint16_t p0, uint16_t q0,
     421             :     uint16_t q1, int32_t bd) {
     422           0 :     int8_t mask = 0;
     423           0 :     int16_t limit16 = (uint16_t)limit << (bd - 8);
     424           0 :     int16_t blimit16 = (uint16_t)blimit << (bd - 8);
     425           0 :     mask |= (abs(p1 - p0) > limit16) * -1;
     426           0 :     mask |= (abs(q1 - q0) > limit16) * -1;
     427           0 :     mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
     428           0 :     return ~mask;
     429             : }
     430             : 
     431             : // Should we apply any filter at all: 11111111 yes, 00000000 no ?
     432           0 : static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
     433             :     uint16_t p3, uint16_t p2, uint16_t p1,
     434             :     uint16_t p0, uint16_t q0, uint16_t q1,
     435             :     uint16_t q2, uint16_t q3, int32_t bd) {
     436           0 :     int8_t mask = 0;
     437           0 :     int16_t limit16 = (uint16_t)limit << (bd - 8);
     438           0 :     int16_t blimit16 = (uint16_t)blimit << (bd - 8);
     439           0 :     mask |= (abs(p3 - p2) > limit16) * -1;
     440           0 :     mask |= (abs(p2 - p1) > limit16) * -1;
     441           0 :     mask |= (abs(p1 - p0) > limit16) * -1;
     442           0 :     mask |= (abs(q1 - q0) > limit16) * -1;
     443           0 :     mask |= (abs(q2 - q1) > limit16) * -1;
     444           0 :     mask |= (abs(q3 - q2) > limit16) * -1;
     445           0 :     mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
     446           0 :     return ~mask;
     447             : }
     448             : 
     449           0 : static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
     450             :     uint16_t p1, uint16_t p0, uint16_t q0,
     451             :     uint16_t q1, uint16_t q2, uint16_t q3,
     452             :     int32_t bd) {
     453           0 :     int8_t mask = 0;
     454           0 :     int16_t thresh16 = (uint16_t)thresh << (bd - 8);
     455           0 :     mask |= (abs(p1 - p0) > thresh16) * -1;
     456           0 :     mask |= (abs(q1 - q0) > thresh16) * -1;
     457           0 :     mask |= (abs(p2 - p0) > thresh16) * -1;
     458           0 :     mask |= (abs(q2 - q0) > thresh16) * -1;
     459           0 :     mask |= (abs(p3 - p0) > thresh16) * -1;
     460           0 :     mask |= (abs(q3 - q0) > thresh16) * -1;
     461           0 :     return ~mask;
     462             : }
     463             : 
     464             : // Is there high edge variance (|p1 - p0| or |q1 - q0| above the scaled thresh):
     465             : // 11111111_11111111 yes, 00000000_00000000 no ?
     466           0 : static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
     467             :     uint16_t q0, uint16_t q1, int32_t bd) {
     468           0 :     int16_t hev = 0;
     469           0 :     int16_t thresh16 = (uint16_t)thresh << (bd - 8);
     470           0 :     hev |= (abs(p1 - p0) > thresh16) * -1;
     471           0 :     hev |= (abs(q1 - q0) > thresh16) * -1;
     472           0 :     return hev;
     473             : }
     474             : 
     475           0 : static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
     476             :     uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
     477             :     int32_t bd) {
     478             :     int16_t filter1, filter2;
     479             :     // Subtracting (0x80 << shift) is the high-bit-depth counterpart of the
     480             :     // 8-bit ^0x80: it recentres the unsigned samples around zero.
     481           0 :     int32_t shift = bd - 8;
     482           0 :     const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
     483           0 :     const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
     484           0 :     const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
     485           0 :     const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
     486           0 :     const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
     487             : 
     488             :     // Add outer taps if we have high edge variance.
     489           0 :     int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
     490             : 
     491             :     // Inner taps.
     492           0 :     filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
     493             : 
     494             :     // Save bottom 3 bits so that we round one side +4 and the other +3
     495             :     // if it equals 4 we'll set to adjust by -1 to account for the fact
     496             :     // we'd round 3 the other way.
     497           0 :     filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
     498           0 :     filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
     499             : 
     500           0 :     *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
     501           0 :     *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
     502             : 
     503             :     // Outer tap adjustments.
     504           0 :     filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
     505             : 
     506           0 :     *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
     507           0 :     *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
     508           0 : }
     509             : 
     510           0 : void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int32_t p /* pitch */,
     511             :     const uint8_t *blimit, const uint8_t *limit,
     512             :     const uint8_t *thresh, int32_t bd) {
                           // Filters 4 neighbouring columns in place across the boundary between
                           // row s[-p] (p0) and row s[0] (q0), using two samples on each side.
                           //   s       points at the q0 sample of the first column; it is advanced
                           //           by one sample after each column.
                           //   p       pitch: offset in samples between consecutive rows.
                           //   blimit, limit, thresh
                           //           only the first byte of each array is read.
                           //   bd      forwarded unchanged to highbd_filter_mask2() and
                           //           highbd_filter4().
     513             :     int32_t i;
     514           0 :     int32_t count = 4;
     515             : 
     516             :     // loop filter designed to work using chars so that we can make maximum use
     517             :     // of 8 bit simd instructions.
     518           0 :     for (i = 0; i < count; ++i) {
     519           0 :         const uint16_t p1 = s[-2 * p];
     520           0 :         const uint16_t p0 = s[-p];
     521           0 :         const uint16_t q0 = s[0 * p];
     522           0 :         const uint16_t q1 = s[1 * p];
                               // The mask is built from the unfiltered samples read above;
                               // highbd_filter4() then updates the same four positions
                               // through the pointers it is given.
     523             :         const int8_t mask =
     524           0 :             highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
     525           0 :         highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
     526           0 :         ++s;
     527             :     }
     528           0 : }
     529             : 
     530           0 : void aom_highbd_lpf_vertical_4_c(uint16_t *s, int32_t pitch, const uint8_t *blimit,
     531             :     const uint8_t *limit, const uint8_t *thresh,
     532             :     int32_t bd) {
                           // Filters 4 consecutive rows in place across the boundary between
                           // column s[-1] (p0) and column s[0] (q0), using two samples on each
                           // side.
                           //   s       points at the q0 sample of the first row; it is advanced
                           //           by pitch samples after each row.
                           //   pitch   offset in samples between consecutive rows.
                           //   blimit, limit, thresh
                           //           only the first byte of each array is read.
                           //   bd      forwarded unchanged to highbd_filter_mask2() and
                           //           highbd_filter4().
     533             :     int32_t i;
     534           0 :     int32_t count = 4;
     535             : 
     536             :     // loop filter designed to work using chars so that we can make maximum use
     537             :     // of 8 bit simd instructions.
     538           0 :     for (i = 0; i < count; ++i) {
     539           0 :         const uint16_t p1 = s[-2], p0 = s[-1];
     540           0 :         const uint16_t q0 = s[0], q1 = s[1];
                               // Mask from the unfiltered samples, then filter those four
                               // positions of the current row in place.
     541             :         const int8_t mask =
     542           0 :             highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
     543           0 :         highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
     544           0 :         s += pitch;
     545             :     }
     546           0 : }
     547             : 
     548           0 : static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
     549             :     uint16_t *op3, uint16_t *op2, uint16_t *op1,
     550             :     uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
     551             :     uint16_t *oq2, uint16_t *oq3, int32_t bd) {
                           // Filters the samples around one edge position.
                           //   mask, flat  when both are non-zero, *op2 .. *oq2 are replaced by
                           //               the weighted averages below; otherwise the call
                           //               falls back to highbd_filter4() on the inner four
                           //               samples (op1, op0, oq0, oq1).
                           //   thresh, bd  only used by the highbd_filter4() fallback.
                           //   op3, oq3    are read but never written.
     552           0 :     if (flat && mask) {
                               // Copy all eight inputs first so that every output below is
                               // computed from the unfiltered values.
     553           0 :         const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
     554           0 :         const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
     555             : 
     556             :         // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
                               // Each sum below has weights adding up to 8, matching the
                               // second argument (3) given to ROUND_POWER_OF_TWO. Near the
                               // ends of the window the outermost sample (p3 or q3) is
                               // repeated to fill the missing taps.
     557           0 :         *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
     558           0 :         *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
     559           0 :         *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
     560           0 :         *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
     561           0 :         *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
     562           0 :         *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
     563             :     }
     564             :     else
     565           0 :         highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
     566           0 : }
     567             : 
     568           0 : void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int32_t p, const uint8_t *blimit,
     569             :     const uint8_t *limit, const uint8_t *thresh,
     570             :     int32_t bd) {
                           // Filters 4 neighbouring columns in place across the boundary between
                           // row s[-p] (p0) and row s[0] (q0). Four samples on each side
                           // (s[-4*p] .. s[3*p]) are read per column.
                           //   s       points at the q0 sample of the first column; it is advanced
                           //           by one sample after each column.
                           //   p       pitch: offset in samples between consecutive rows.
                           //   blimit, limit, thresh
                           //           only the first byte of each array is read.
                           //   bd      forwarded unchanged to the mask helpers and to
                           //           highbd_filter8().
     571             :     int32_t i;
     572           0 :     int32_t count = 4;
     573             : 
     574             :     // loop filter designed to work using chars so that we can make maximum use
     575             :     // of 8 bit simd instructions.
     576           0 :     for (i = 0; i < count; ++i) {
     577           0 :         const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     578           0 :         const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
     579             : 
                               // mask and flat are both derived from the unfiltered samples;
                               // highbd_filter8() uses them to choose between its two paths.
     580             :         const int8_t mask =
     581           0 :             highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     582             :         const int8_t flat =
     583           0 :             highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     584           0 :         highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
     585           0 :             s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
     586           0 :         ++s;
     587             :     }
     588           0 : }
     589             : 
     590           0 : void aom_highbd_lpf_vertical_8_c(uint16_t *s, int32_t pitch, const uint8_t *blimit,
     591             :     const uint8_t *limit, const uint8_t *thresh,
     592             :     int32_t bd) {
                           // Filters 4 consecutive rows in place across the boundary between
                           // column s[-1] (p0) and column s[0] (q0). Four samples on each side
                           // (s[-4] .. s[3]) are read per row.
                           //   s       points at the q0 sample of the first row; it is advanced
                           //           by pitch samples after each row.
                           //   pitch   offset in samples between consecutive rows.
                           //   blimit, limit, thresh
                           //           only the first byte of each array is read.
                           //   bd      forwarded unchanged to the mask helpers and to
                           //           highbd_filter8().
     593             :     int32_t i;
     594           0 :     int32_t count = 4;
     595             : 
     596           0 :     for (i = 0; i < count; ++i) {
     597           0 :         const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     598           0 :         const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
                               // mask and flat are both derived from the unfiltered samples;
                               // highbd_filter8() uses them to choose between its two paths.
     599             :         const int8_t mask =
     600           0 :             highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     601             :         const int8_t flat =
     602           0 :             highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     603           0 :         highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
     604             :             s + 2, s + 3, bd);
     605           0 :         s += pitch;
     606             :     }
     607           0 : }
     608             : 
     609             : //**********************************************************************************************************************//
     610             : 
     611             : //static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {
     612             : //    { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
     613             : //    { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U },
     614             : //    { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
     615             : //};
     616             : 
     617             : const int32_t mode_lf_lut[] = {
                           // Indexed by prediction mode (get_filter_level() uses
                           // mode_lf_lut[mode]). The 0/1 result selects the mode_deltas entry
                           // and the last index of the lvl table. 25 entries in total, grouped
                           // as noted on each row.
     618             :     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // INTRA_MODES
     619             :     1, 1, 0, 1,                             // INTER_MODES (GLOBALMV == 0)
     620             :     1, 1, 1, 1, 1, 1, 0, 1  // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0)
     621             : };
     622             : 
     623        1228 : void update_sharpness(LoopFilterInfoN *lfi, int32_t sharpness_lvl) {
                           // Fills lfi->lfthr[lvl].lim and lfi->lfthr[lvl].mblim for every level
                           // 0..MAX_LOOP_FILTER from sharpness_lvl. Each array receives
                           // SIMD_WIDTH copies of one byte value (memset stores its value
                           // argument converted to unsigned char).
     624             :     int32_t lvl;
     625             : 
     626             :     // For each possible value for the loop filter fill out limits
     627       79820 :     for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
     628             :         // Set loop filter parameters that control sharpness.
                               // The level is shifted right by 0, 1 or 2 bits: one bit when
                               // sharpness_lvl > 0 and one more when sharpness_lvl > 4.
     629       78592 :         int32_t block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
     630             : 
                               // With a non-zero sharpness the limit is capped at
                               // 9 - sharpness_lvl.
     631       78592 :         if (sharpness_lvl > 0) {
     632           0 :             if (block_inside_limit > (9 - sharpness_lvl))
     633           0 :                 block_inside_limit = (9 - sharpness_lvl);
     634             :         }
     635             : 
                               // The limit is never allowed to drop below 1.
     636       78592 :         if (block_inside_limit < 1) block_inside_limit = 1;
     637             : 
                               // lim   = block_inside_limit
                               // mblim = 2 * (lvl + 2) + block_inside_limit
     638       78592 :         memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
     639       78592 :         memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
     640             :             SIMD_WIDTH);
     641             :     }
     642        1228 : }
     643             : 
     644       18336 : static int seg_feature_active(SegmentationParams *seg, int segment_id,
     645             :     SEG_LVL_FEATURES feature_id)
     646             : {
                           // Returns non-zero only when seg->segmentation_enabled is set and
                           // seg->feature_enabled[segment_id][feature_id] is non-zero.
     647       18336 :     return seg->segmentation_enabled && seg->feature_enabled[segment_id][feature_id];
     648             : }
     649             : 
     650     5231240 : uint8_t get_filter_level(
     651             :     FrameHeader* frm_hdr,
     652             :     const LoopFilterInfoN *lfi_n,
     653             :     const int32_t dir_idx, int32_t plane,
     654             :     int32_t *sb_delta_lf, uint8_t seg_id,
     655             :     PredictionMode pred_mode, MvReferenceFrame ref_frame_0)
     656             : {
                           // Returns the loop-filter level for one block edge.
                           //   frm_hdr      frame header; its delta_lf_params, loop_filter_params
                           //                and segmentation_params are read.
                           //   lfi_n        level table, used only when delta_lf_present is 0.
                           //   dir_idx      direction index (set_lpf_parameters() in this file
                           //                passes its edge_dir here).
                           //   plane        plane index; the assert below expects 0..2.
                           //   sb_delta_lf  delta values, read only when delta_lf_present is
                           //                set.
                           //   seg_id       segment id used for the segmentation lookup and
                           //                the table index.
                           //   pred_mode    prediction mode; INTRA_MODE_4x4 is treated as
                           //                DC_PRED.
                           //   ref_frame_0  reference frame used to index ref_deltas and the
                           //                level table.
     657     5231240 :     const int32_t segment_id = seg_id; /* const int32_t segment_id =  0; might cause encoder problem */
     658             :     PredictionMode mode; // Added to address 4x4 problem
     659     5231240 :     mode = (pred_mode == INTRA_MODE_4x4) ? DC_PRED : pred_mode;
     660     5231240 :     if (frm_hdr->delta_lf_params.delta_lf_present) {
                               // NOTE(review): the message says this path is unsupported, yet
                               // execution continues and a level is still computed and
                               // returned below. Confirm that this is intended.
     661           0 :         printf("ERROR[AN]: delta_lf_present not supported yet\n");
                               // The initial -1 is overwritten on both branches below.
     662           0 :         int32_t delta_lf = -1;
     663           0 :         if (frm_hdr->delta_lf_params.delta_lf_multi) {
     664           0 :             const int32_t delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
     665           0 :             delta_lf = sb_delta_lf[delta_lf_idx];
     666             :         }
     667             :         else {
     668           0 :             delta_lf = sb_delta_lf[0];
     669             :         }
                               // Base level: per-direction for plane 0, a single value each
                               // for planes 1 and 2.
     670             :         int32_t base_level;
     671           0 :         if (plane == 0)
     672           0 :             base_level = frm_hdr->loop_filter_params.filter_level[dir_idx];
     673           0 :         else if (plane == 1)
     674           0 :             base_level = frm_hdr->loop_filter_params.filter_level_u;
     675             :         else
     676           0 :             base_level = frm_hdr->loop_filter_params.filter_level_v;
     677           0 :         int32_t lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
                               // NOTE(review): this check runs after plane may already have
                               // been used to index delta_lf_id_lut above.
     678           0 :         assert(plane >= 0 && plane <= 2);
     679           0 :         const int32_t seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx];
                               // Add the per-segment adjustment when that feature is active.
     680           0 :         if (seg_feature_active(&frm_hdr->segmentation_params, segment_id,
     681             :             seg_lf_feature_id))
     682             :         {
     683           0 :             const int32_t data = get_segdata(&frm_hdr->segmentation_params,
     684             :                                              segment_id, seg_lf_feature_id);
     685           0 :             lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
     686             :         }
     687             : 
     688           0 :         if (frm_hdr->loop_filter_params.mode_ref_delta_enabled) {
                                   // scale is 1 while lvl_seg < 32 and 2 for 32 <= lvl_seg < 64.
                                   // The reference delta is always added; the mode delta only
                                   // when ref_frame_0 is above INTRA_FRAME.
     689           0 :             const int32_t scale = 1 << (lvl_seg >> 5);
     690           0 :             lvl_seg += frm_hdr->loop_filter_params.ref_deltas[ref_frame_0] * scale;
     691           0 :             if (ref_frame_0 > INTRA_FRAME)
     692           0 :                 lvl_seg += frm_hdr->loop_filter_params.
     693           0 :                     mode_deltas[mode_lf_lut[mode]] * scale;
     694           0 :             lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER);
     695             :         }
     696           0 :         return lvl_seg;
     697             :     }
     698             :     else {
                               // No per-superblock deltas: read the level straight from the
                               // table (presumably the one filled by
                               // eb_av1_loop_filter_frame_init() - confirm against callers).
     699             :         ASSERT(mode < MB_MODE_COUNT);
     700             :         return lfi_n->lvl[plane][segment_id][dir_idx][ref_frame_0]
     701     5231240 :             [mode_lf_lut[mode]];
     702             :     }
     703             : }
     704             : 
     705          90 : void eb_av1_loop_filter_init(PictureControlSet *pcs_ptr) {
                           // Initialises the loop-filter state held in pcs_ptr->parent_pcs_ptr:
                           // sets combine_vert_horz_lf to 1, builds the sharpness-dependent
                           // limits via update_sharpness(), and fills every hev_thr array.
     706             :     //assert(MB_MODE_COUNT == n_elements(mode_lf_lut));
     707          90 :     LoopFilterInfoN *lfi = &pcs_ptr->parent_pcs_ptr->lf_info;
     708          90 :     struct LoopFilter *lf = &pcs_ptr->parent_pcs_ptr->frm_hdr.loop_filter_params;
     709             :     int32_t lvl;
     710             : 
     711          90 :     lf->combine_vert_horz_lf = 1;
     712             : 
     713             :     // init limits for given sharpness
     714          90 :     update_sharpness(lfi, lf->sharpness_level);
     715             : 
     716             :     // init hev threshold const vectors
                           // Each level's hev_thr gets SIMD_WIDTH copies of lvl >> 4.
     717        5850 :     for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
     718        5760 :         memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
     719          90 : }
     720             : 
     721             : // Update the loop filter for the current frame.
     722             : // This should be called before loop_filter_rows(),
     723             : // eb_av1_loop_filter_frame() calls this function directly.
     724        1138 : void eb_av1_loop_filter_frame_init(FrameHeader *frm_hdr,
     725             :     LoopFilterInfoN *lfi, int32_t plane_start, int32_t plane_end)
     726             : {
                           // Recomputes the sharpness limits and fills
                           // lfi->lvl[plane][seg_id][dir][ref][mode] for the planes
                           // plane_start .. plane_end - 1, using the filter levels, segmentation
                           // data and ref/mode deltas found in frm_hdr.
     727             :     int32_t filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE];
     728             :     int32_t plane;
     729             :     int32_t seg_id;
     730             :     // the multiplier for the ref/mode deltas (computed below as `scale`)
     731             :     // is 1 when the filter level is between 0 and 31;
     732             :     // 2 when the filter level is between 32 and 63
     733             : 
     734        1138 :     struct LoopFilter *const lf = &frm_hdr->loop_filter_params;
     735             :     // const struct segmentation *const seg = &pcs_ptr->parent_pcs_ptr->seg;
     736             : 
     737             :      // update sharpness limits
     738        1138 :     update_sharpness(lfi, lf->sharpness_level);
     739             : 
                           // filt_lvl holds the levels used for dir == 0, filt_lvl_r those for
                           // dir == 1. Only plane 0 has distinct values per direction.
     740        1138 :     filt_lvl[0] = frm_hdr->loop_filter_params.filter_level[0];
     741        1138 :     filt_lvl[1] = frm_hdr->loop_filter_params.filter_level_u;
     742        1138 :     filt_lvl[2] = frm_hdr->loop_filter_params.filter_level_v;
     743             : 
     744        1138 :     filt_lvl_r[0] = frm_hdr->loop_filter_params.filter_level[1];
     745        1138 :     filt_lvl_r[1] = frm_hdr->loop_filter_params.filter_level_u;
     746        1138 :     filt_lvl_r[2] = frm_hdr->loop_filter_params.filter_level_v;
     747             : 
     748        2506 :     for (plane = plane_start; plane < plane_end; plane++) {
                               // Plane 0 with both levels zero ends the whole loop, so later
                               // planes are left untouched as well. A zero level on plane 1
                               // or 2 skips only that plane.
     749        1430 :         if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
     750             :             break;
     751        1368 :         else if (plane == 1 && !filt_lvl[1])
     752          92 :             continue;
     753        1276 :         else if (plane == 2 && !filt_lvl[2])
     754         130 :             continue;
     755             : 
     756       10314 :         for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
     757       27504 :             for (int32_t dir = 0; dir < 2; ++dir) {
     758       18336 :                 int32_t lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
     759       18336 :                 assert(plane >= 0 && plane <= 2);
     760       18336 :                 const int32_t seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
                                       // Add the per-segment adjustment when that feature is
                                       // active for this segment.
     761       18336 :                 if (seg_feature_active(&frm_hdr->segmentation_params, seg_id,
     762             :                     seg_lf_feature_id))
     763             :                 {
     764           0 :                     const int32_t data = get_segdata(&frm_hdr->segmentation_params,
     765             :                                                      seg_id, seg_lf_feature_id);
     766           0 :                     lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
     767             :                 }
     768             : 
     769       18336 :                 if (!lf->mode_ref_delta_enabled) {
     770             :                     // we could get rid of this if we assume that deltas are set to
     771             :                     // zero when not in use; encoder always uses deltas
                                           // Every [ref][mode] entry for this
                                           // plane/segment/direction receives the byte lvl_seg.
     772       18336 :                     memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
     773             :                         sizeof(lfi->lvl[plane][seg_id][dir]));
     774             :                 }
     775             :                 else {
     776             :                     int32_t ref, mode;
                                           // scale is 1 while lvl_seg < 32 and 2 for
                                           // 32 <= lvl_seg < 64.
     777           0 :                     const int32_t scale = 1 << (lvl_seg >> 5);
                                           // INTRA_FRAME: only mode index 0 is written, using the
                                           // reference delta alone.
     778           0 :                     const int32_t intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
     779           0 :                     lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] =
     780           0 :                         (uint8_t)clamp(intra_lvl, 0, MAX_LOOP_FILTER);
     781             : 
                                           // Other references: reference delta plus mode delta,
                                           // both scaled, then clamped.
     782           0 :                     for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) {
     783           0 :                         for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
     784           0 :                             const int32_t inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
     785           0 :                                 lf->mode_deltas[mode] * scale;
     786           0 :                             lfi->lvl[plane][seg_id][dir][ref][mode] =
     787           0 :                                 (uint8_t)clamp(inter_lvl, 0, MAX_LOOP_FILTER);
     788             :                         }
     789             :                     }
     790             :                 }
     791             :             }
     792             :         }
     793             :     }
     794        1138 : }
     795             : //***************************************************************************************************//
     796             : 
     797      122755 : static INLINE int32_t scaled_buffer_offset(int32_t x_offset, int32_t y_offset, int32_t stride/*,
     798             :     const struct scale_factors *sf*/) {
                           // Returns y_offset * stride + x_offset. The scale-factor handling is
                           // commented out, so both offsets are used unscaled.
     799      122755 :     const int32_t x =
     800             :         /*sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS :*/ x_offset;
     801      122755 :     const int32_t y =
     802             :         /*sf ? sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS :*/ y_offset;
     803      122755 :     return y * stride + x;
     804             : }
     805      122755 : static INLINE void setup_pred_plane(struct Buf2d *dst, BlockSize bsize,
     806             :     uint8_t *src, int32_t width, int32_t height,
     807             :     int32_t stride, int32_t mi_row, int32_t mi_col,
     808             :     /*const struct scale_factors *scale,*/
     809             :     int32_t subsampling_x, int32_t subsampling_y,
     810             :     int32_t is16Bit) {
                           // Fills dst so that dst->buf addresses the block at (mi_row, mi_col)
                           // inside the plane starting at src. dst->buf0 is set to src, and
                           // width, height and stride are copied into dst unchanged.
     811             :     // Offset the buffer pointer
                           // In a subsampled direction, an odd mi_row / mi_col of a block that
                           // is one mi unit high / wide is moved back by one to the even
                           // position before it.
     812      122755 :     if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
     813           0 :         mi_row -= 1;
     814      122755 :     if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
     815           0 :         mi_col -= 1;
     816             : 
                           // Convert mi units to sample coordinates of this plane.
     817      122755 :     const int32_t x = (MI_SIZE * mi_col) >> subsampling_x;
     818      122755 :     const int32_t y = (MI_SIZE * mi_row) >> subsampling_y;
                           // src is a byte pointer: the sample offset is shifted left by is16Bit
                           // (doubled when is16Bit is 1) before being added.
     819      122755 :     dst->buf = src + (scaled_buffer_offset(x, y, stride/*, scale*/) << is16Bit);
     820      122756 :     dst->buf0 = src;
     821      122756 :     dst->width = width;
     822      122756 :     dst->height = height;
     823      122756 :     dst->stride = stride;
     824      122756 : }
     825      122757 : void eb_av1_setup_dst_planes(struct MacroblockdPlane *planes, BlockSize bsize,
     826             :     //const Yv12BufferConfig *src,
     827             :     const EbPictureBufferDesc *src,
     828             :     int32_t mi_row, int32_t mi_col,
     829             :     const int32_t plane_start, const int32_t plane_end) {
                           // Calls setup_pred_plane() on planes[i].dst for every plane index i
                           // in plane_start .. min(plane_end, 3) - 1, pointing it into the
                           // matching buffer of src (buffer_y, buffer_cb or buffer_cr).
                           //
                           // The base address passed for each plane already skips the picture
                           // origin (origin_x, origin_y). The offset is shifted left by
                           // pd->is16Bit because the buffers are indexed in bytes.
                           //
                           // NOTE(review): for planes 1 and 2 the origin offset, width and
                           // height are halved unconditionally instead of being derived from
                           // pd->subsampling_x / pd->subsampling_y. This looks like it assumes
                           // 2x subsampling in both directions and even origins - confirm.
     830             :     // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
     831             :     // the static analysis warnings.
     832             :     //for (int32_t i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) {
     833             :     //    struct MacroblockdPlane *const pd = &planes[i];
     834             :     //    const int32_t is_uv = i > 0;
     835             :     //    setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv],
     836             :     //        src->crop_heights[is_uv], src->strides[is_uv], mi_row,
     837             :     //        mi_col, NULL, pd->subsampling_x, pd->subsampling_y);
     838             :     //}
     839      245512 :     for (int32_t i = plane_start; i < AOMMIN(plane_end, 3); ++i) {
     840      122759 :         if (i == 0) {
                                   // Plane 0: full width and height, stride_y.
     841       56399 :             struct MacroblockdPlane *const pd = &planes[0];
     842       56399 :             setup_pred_plane(&pd->dst, bsize, &src->buffer_y[(src->origin_x + src->origin_y*src->stride_y) << pd->is16Bit], src->width,
     843       56399 :                 src->height, src->stride_y, mi_row,
     844             :                 mi_col, /*NULL,*/ pd->subsampling_x, pd->subsampling_y, pd->is16Bit); //AMIR: Updated to point to the right location
     845             :         }
     846       66360 :         else if (i == 1) {
                                   // Plane 1: buffer_cb with halved offset, width and height.
     847       35040 :             struct MacroblockdPlane *const pd = &planes[1];
     848       35040 :             setup_pred_plane(&pd->dst, bsize, &src->buffer_cb[((src->origin_x + src->origin_y*src->stride_cb) << pd->is16Bit) / 2], src->width / 2,
     849       35040 :                 src->height / 2, src->stride_cb, mi_row,
     850             :                 mi_col, /*NULL,*/ pd->subsampling_x, pd->subsampling_y, pd->is16Bit);
     851             :         }
     852       31320 :         else if (i == 2) {
                                   // Plane 2: buffer_cr with halved offset, width and height.
     853       31320 :             struct MacroblockdPlane *const pd = &planes[2];
     854       31320 :             setup_pred_plane(&pd->dst, bsize, &src->buffer_cr[((src->origin_x + src->origin_y*src->stride_cr) << pd->is16Bit) / 2], src->width / 2,
     855       31320 :                 src->height / 2, src->stride_cr, mi_row,
     856             :                 mi_col,/* NULL,*/ pd->subsampling_x, pd->subsampling_y, pd->is16Bit);
     857             :         }
     858             :     }
     859      122753 : }
     860             : 
     861             : #define INTER_TX_SIZE_BUF_LEN 16 // NOTE(review): no use of this constant is visible in this part of the file - confirm it is still needed
     862             : 
     863             : //***************************************************************************************************//
     864             : 
     865     4699940 : static TxSize get_transform_size(const MacroBlockD *const xd,
     866             :     const MbModeInfo *const mbmi,
     867             :     const EDGE_DIR edge_dir, const int32_t mi_row,
     868             :     const int32_t mi_col, const int32_t plane,
     869             :     const struct MacroblockdPlane *plane_ptr) {
                           // Returns the transform size of the block described by mbmi for the
                           // given plane, mapped through txsize_horz_map (for VERT_EDGE) or
                           // txsize_vert_map (otherwise).
                           //   xd, mi_row, mi_col  unused (explicitly cast to void below).
                           //   mbmi                must not be NULL (asserted).
                           //   plane_ptr           supplies the subsampling used for the
                           //                       non-luma lookup.
     870     4699940 :     assert(mbmi != NULL);
     871             :     (void)mi_row;
     872             :     (void)mi_col;
     873             :     (void)xd;
     874             :     //if (xd->lossless[mbmi->segment_id]) return TX_4X4;
     875             : 
                           // Initial size:
                           //   luma, inter (no intrabc): tx_depth_to_tx_size at depth 0
                           //   luma, otherwise:          tx_depth_to_tx_size at mbmi->tx_depth
                           //   other planes:             av1_get_max_uv_txsize()
     876     4700400 :     TxSize tx_size = (plane == COMPONENT_LUMA)
     877     3009750 :         ? (is_inter_block_no_intrabc(mbmi->block_mi.ref_frame[0])
     878     2467260 :         ? tx_depth_to_tx_size[0][mbmi->block_mi.sb_type]
     879      542551 :         : tx_depth_to_tx_size[mbmi->tx_depth][mbmi->block_mi.sb_type]) // use max_tx_size
     880     7709750 :         : av1_get_max_uv_txsize(mbmi->block_mi.sb_type,
     881             :             plane_ptr->subsampling_x, plane_ptr->subsampling_y);
     882     4700400 :     assert(tx_size < TX_SIZES_ALL);
                           // A luma inter block that is not skipped uses the size at its own
                           // tx_depth instead of the depth-0 size chosen above.
     883     7710380 :     if (((plane == COMPONENT_LUMA) &&
     884     3009990 :         is_inter_block_no_intrabc(mbmi->block_mi.ref_frame[0]) &&
     885     2467300 :         !mbmi->block_mi.skip)) {  // if split tx is used
     886             : 
     887      350152 :         const TxSize mb_tx_size =
     888      350152 :             tx_depth_to_tx_size[mbmi->tx_depth][mbmi->block_mi.sb_type]; // tx_size
     889      350152 :         assert(mb_tx_size < TX_SIZES_ALL);
     890      350152 :         tx_size = mb_tx_size;
     891             :     }
     892             :     // For chroma or a non-square transform, the transform size must be
     893             :     // converted into the transform size in one particular direction.
     894             :     // for vertical edge, filter direction is horizontal, for horizontal
     895             :     // edge, filter direction is vertical.
     896     4700390 :     tx_size = (VERT_EDGE == edge_dir) ? txsize_horz_map[tx_size]
     897     4700390 :         : txsize_vert_map[tx_size];
     898     4700390 :     return tx_size;
     899             : }
     900             : 
     901             : // Return TxSize from get_transform_size(), so it is plane and direction
     902             : // aware.
     903     3192120 : static TxSize set_lpf_parameters(
     904             :     AV1_DEBLOCKING_PARAMETERS *const params, const uint64_t mode_step,
     905             :     const PictureControlSet *const  pcs_ptr, const MacroBlockD *const xd,
     906             :     const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y,
     907             :     const int32_t plane, const struct MacroblockdPlane *const plane_ptr) {
     908             :     // reset to initial values
     909     3192120 :     params->filter_length = 0;
     910             : 
     911             :     // no deblocking is required
     912     3192120 :     const uint32_t width = plane_ptr->dst.width;
     913     3192120 :     const uint32_t height = plane_ptr->dst.height;
     914     3192120 :     if ((width <= x) || (height <= y)) {
     915             :         // just return the smallest transform unit size
     916      745879 :         return TX_4X4;
     917             :     }
     918             : 
     919     2446240 :     const uint32_t scale_horz = plane_ptr->subsampling_x;
     920     2446240 :     const uint32_t scale_vert = plane_ptr->subsampling_y;
     921             :     // for sub8x8 block, chroma prediction mode is obtained from the bottom/right
     922             :     // mi structure of the co-located 8x8 luma block. so for chroma plane, mi_row
     923             :     // and mi_col should map to the bottom/right mi structure, i.e, both mi_row
     924             :     // and mi_col should be odd number for chroma plane.
     925             : 
     926     2446240 :     const int32_t mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2);
     927     2446240 :     const int32_t mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2);
     928     2446240 :     uint32_t mi_stride = pcs_ptr->mi_stride;
     929     2446240 :     const int32_t offset = mi_row * mi_stride + mi_col;
     930     2446240 :     ModeInfo **mi = (pcs_ptr->mi_grid_base + offset);
     931             :     //MbModeInfo **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
     932     2446240 :     const MbModeInfo *mbmi = &mi[0]->mbmi;
     933             : 
     934             :     // If current mbmi is not correctly setup, return an invalid value to stop
     935             :     // filtering. One example is that if this tile is not coded, then its mbmi
     936             :     // is not set up.
     937     2446240 :     if (mbmi == NULL) return TX_INVALID;
     938             : 
     939             :     const TxSize ts =
     940     2446240 :         get_transform_size(xd, mbmi/*mi[0]*/, edge_dir, mi_row, mi_col, plane, plane_ptr);
     941     2445860 :     assert(ts < TX_SIZES_ALL);
     942             : 
     943             :     {
     944     2447120 :         const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y);
     945     2447120 :         const uint32_t transform_masks =
     946     2447120 :             edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
     947     2447120 :         const int32_t tu_edge = (coord & transform_masks) ? (0) : (1);
     948             : 
     949     2447120 :         if (!tu_edge) return ts;
     950             : 
     951             :         // prepare outer edge parameters. deblock the edge if it's an edge of a TU
     952             :         {
     953     2445860 :             const uint32_t curr_level =
     954     2447120 :                 get_filter_level(&pcs_ptr->parent_pcs_ptr->frm_hdr,
     955     2447120 :                     &pcs_ptr->parent_pcs_ptr->lf_info, edge_dir, plane,
     956     2447120 :                     pcs_ptr->parent_pcs_ptr->curr_delta_lf, 0 /*segment_id*/,
     957     2447120 :                     mbmi->block_mi.mode, mbmi->block_mi.ref_frame[0]);
     958             : 
     959     4248760 :             const int32_t curr_skipped = mbmi->block_mi.skip &&
     960     1800970 :                 is_inter_block_no_intrabc(mbmi->block_mi.ref_frame[0]);
     961     2447790 :             uint32_t level = curr_level;
     962     2447790 :             if (coord) {
     963             :                 {
     964             :                     //const ModeInfo *const mi_prev = *(mi - mode_step);
     965     2258660 :                     const ModeInfo *const mi_prevTemp = *(mi - mode_step);
     966     2258660 :                     const MbModeInfo *const mi_prev = &mi_prevTemp[0].mbmi;
     967             :                     //
     968     2258660 :                     if (mi_prev == NULL) return TX_INVALID;
     969     2258660 :                     const int32_t pv_row =
     970     2258660 :                         (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert));
     971     2258660 :                     const int32_t pv_col =
     972     2258660 :                         (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col);
     973     2258660 :                     const TxSize pv_ts = get_transform_size(
     974             :                         xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr);
     975     2261140 :                     const uint32_t pv_lvl =
     976     2257780 :                         get_filter_level(&pcs_ptr->parent_pcs_ptr->frm_hdr,
     977     2257780 :                             &pcs_ptr->parent_pcs_ptr->lf_info, edge_dir, plane,
     978     2257780 :                             pcs_ptr->parent_pcs_ptr->curr_delta_lf, 0 /*segment_id*/,
     979     2257780 :                             mi_prev->block_mi.mode, mi_prev->block_mi.ref_frame[0]);
     980             : 
     981     3905680 :                     const int32_t pv_skip = mi_prev->block_mi.skip &&
     982     1644610 :                         is_inter_block_no_intrabc(mi_prev->block_mi.ref_frame[0]);
     983             : 
     984             :                     const BlockSize bsize =
     985     2261070 :                         get_plane_block_size(mbmi->block_mi.sb_type, plane_ptr->subsampling_x, plane_ptr->subsampling_y);
     986     2257310 :                     assert(bsize < BlockSizeS_ALL);
     987     2259600 :                     const int32_t prediction_masks = edge_dir == VERT_EDGE
     988     1166790 :                         ? block_size_wide[bsize] - 1
     989     2259600 :                         : block_size_high[bsize] - 1;
     990     2259600 :                     const int32_t pu_edge = !(coord & prediction_masks);
     991             :                     // if the current and the previous blocks are skipped,
     992             :                     // deblock the edge if the edge belongs to a PU's edge only.
     993     2259600 :                     if ((curr_level || pv_lvl) &&
     994     1587780 :                         (!pv_skip || !curr_skipped || pu_edge)) {
     995     2259200 :                         const TxSize min_ts = AOMMIN(ts, pv_ts);
     996     2259200 :                         if (TX_4X4 >= min_ts)
     997      374528 :                             params->filter_length = 4;
     998     1884670 :                         else if (TX_8X8 == min_ts) {
     999      456621 :                             if (plane != 0)
    1000      173027 :                                 params->filter_length = 6;
    1001             :                             else
    1002      283594 :                                 params->filter_length = 8;
    1003             :                         }
    1004             :                         else {
    1005     1428050 :                             params->filter_length = 14;
    1006             :                             // No wide filtering for chroma plane
    1007     1428050 :                             if (plane != 0)
    1008      426416 :                                 params->filter_length = 6;
    1009             :                         }
    1010             : 
    1011             :                         // update the level if the current block is skipped,
    1012             :                         // but the previous one is not
    1013     2259200 :                         level = (curr_level) ? (curr_level) : (pv_lvl);
    1014             :                     }
    1015             :                 }
    1016             :             }
    1017             :             // prepare common parameters
    1018     2448730 :             if (params->filter_length) {
    1019     2257250 :                 const LoopFilterThresh *const limits = pcs_ptr->parent_pcs_ptr->lf_info.lfthr + level;
    1020     2257250 :                 params->lim = limits->lim;
    1021     2257250 :                 params->mblim = limits->mblim;
    1022     2257250 :                 params->hev_thr = limits->hev_thr;
    1023             :             }
    1024             :         }
    1025             :     }
    1026             : 
    1027     2448730 :     return ts;
    1028             : }
    1029             : 
    1030       61377 : void eb_av1_filter_block_plane_vert(
    1031             :     const PictureControlSet *const  pcs_ptr,
    1032             :     const MacroBlockD *const xd, const int32_t plane,
    1033             :     const MacroblockdPlane *const plane_ptr,
    1034             :     const uint32_t mi_row, const uint32_t mi_col) {
    1035       61377 :     SequenceControlSet *scs_ptr = (SequenceControlSet*)pcs_ptr->parent_pcs_ptr->sequence_control_set_wrapper_ptr->object_ptr;
    1036       61377 :     EbBool is16bit = scs_ptr->static_config.encoder_bit_depth > 8;
    1037       61377 :     const int32_t row_step = MI_SIZE >> MI_SIZE_LOG2;
    1038       61377 :     const uint32_t scale_horz = plane_ptr->subsampling_x;
    1039       61377 :     const uint32_t scale_vert = plane_ptr->subsampling_y;
    1040       61377 :     uint8_t *const dst_ptr = plane_ptr->dst.buf;
    1041       61377 :     const int32_t dst_stride = plane_ptr->dst.stride;
    1042       61377 :     const int32_t y_range = scs_ptr->seq_header.sb_size == BLOCK_128X128 ? (MAX_MIB_SIZE >> scale_vert) : (SB64_MIB_SIZE >> scale_vert);
    1043       61377 :     const int32_t x_range = scs_ptr->seq_header.sb_size == BLOCK_128X128 ? (MAX_MIB_SIZE >> scale_horz) : (SB64_MIB_SIZE >> scale_horz);
    1044      777348 :     for (int32_t y = 0; y < y_range; y += row_step) {
    1045      716566 :         uint8_t *p = dst_ptr + ((y * MI_SIZE * dst_stride) << plane_ptr->is16Bit);
    1046     2534320 :         for (int32_t x = 0; x < x_range;) {
    1047             :             // inner loop always filters vertical edges in a MI block. If MI size
    1048             :             // is 8x8, it will filter the vertical edge aligned with an 8x8 block.
    1049             :             // If a 4x4 transform is used, it will then filter the internal edge
    1050             :             // aligned with a 4x4 block
    1051     1818340 :             const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
    1052     1818340 :             const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
    1053             :             uint32_t advance_units;
    1054             :             TxSize tx_size;
    1055             :             AV1_DEBLOCKING_PARAMETERS params;
    1056     1818340 :             memset(&params, 0, sizeof(params));
    1057             : 
    1058             :             tx_size =
    1059     1818340 :                 set_lpf_parameters(&params, ((uint64_t)1 << scale_horz), pcs_ptr, xd,
    1060             :                     VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
    1061     1817650 :             if (tx_size == TX_INVALID) {
    1062           0 :                 params.filter_length = 0;
    1063           0 :                 tx_size = TX_4X4;
    1064             :             }
    1065             : 
    1066     1817650 :             switch (params.filter_length) {
    1067             :                 // apply 4-tap filtering
    1068      176995 :             case 4:
    1069      176995 :                 if (is16bit)
    1070           0 :                     aom_highbd_lpf_vertical_4(
    1071             :                     (uint16_t*)(p),//CONVERT_TO_SHORTPTR(p),
    1072             :                         dst_stride,
    1073             :                         params.mblim,
    1074             :                         params.lim,
    1075             :                         params.hev_thr,
    1076           0 :                         scs_ptr->static_config.encoder_bit_depth);
    1077             :                 else
    1078      176995 :                     aom_lpf_vertical_4(
    1079             :                         p,
    1080             :                         dst_stride,
    1081             :                         params.mblim,
    1082             :                         params.lim,
    1083             :                         params.hev_thr);
    1084      177015 :                 break;
    1085      316184 :             case 6:  // apply 6-tap filter for chroma plane only
    1086      316184 :                 assert(plane != 0);
    1087      316184 :                 if (is16bit)
    1088           0 :                     aom_highbd_lpf_vertical_6(
    1089             :                     (uint16_t*)(p),//CONVERT_TO_SHORTPTR(p),
    1090             :                         dst_stride,
    1091             :                         params.mblim,
    1092             :                         params.lim,
    1093             :                         params.hev_thr,
    1094           0 :                         scs_ptr->static_config.encoder_bit_depth);
    1095             :                 else
    1096      316184 :                     aom_lpf_vertical_6(
    1097             :                         p,
    1098             :                         dst_stride,
    1099             :                         params.mblim,
    1100             :                         params.lim,
    1101             :                         params.hev_thr);
    1102      316209 :                 break;
    1103             :                 // apply 8-tap filtering
    1104      145391 :             case 8:
    1105      145391 :                 if (is16bit)
    1106           0 :                     aom_highbd_lpf_vertical_8(
    1107             :                     (uint16_t*)(p),//CONVERT_TO_SHORTPTR(p),
    1108             :                         dst_stride,
    1109             :                         params.mblim,
    1110             :                         params.lim,
    1111             :                         params.hev_thr,
    1112           0 :                         scs_ptr->static_config.encoder_bit_depth);
    1113             :                 else
    1114      145391 :                     aom_lpf_vertical_8(
    1115             :                         p,
    1116             :                         dst_stride,
    1117             :                         params.mblim,
    1118             :                         params.lim,
    1119             :                         params.hev_thr);
    1120      145407 :                 break;
    1121             :                 // apply 14-tap filtering
    1122      529157 :             case 14:
    1123      529157 :                 if (is16bit)
    1124           0 :                     aom_highbd_lpf_vertical_14(
    1125             :                     (uint16_t*)(p),//CONVERT_TO_SHORTPTR(p),
    1126             :                         dst_stride,
    1127             :                         params.mblim,
    1128             :                         params.lim,
    1129             :                         params.hev_thr,
    1130           0 :                         scs_ptr->static_config.encoder_bit_depth);
    1131             :                 else
    1132      529157 :                     aom_lpf_vertical_14(
    1133             :                         p,
    1134             :                         dst_stride,
    1135             :                         params.mblim,
    1136             :                         params.lim,
    1137             :                         params.hev_thr);
    1138      529195 :                 break;
    1139             :                 // no filtering
    1140      649923 :             default: break;
    1141             :             }
    1142             :             // advance the destination pointer
    1143     1817750 :             assert(tx_size < TX_SIZES_ALL);
    1144     1817750 :             advance_units = tx_size_wide_unit[tx_size];
    1145     1817750 :             x += advance_units;
    1146     1817750 :             p += ((advance_units * MI_SIZE) << plane_ptr->is16Bit);
    1147             :         }
    1148             :     }
    1149       60782 : }
    1150             : 
    1151       61380 : void eb_av1_filter_block_plane_horz(
    1152             :     const PictureControlSet *const  pcs_ptr,
    1153             :     const MacroBlockD *const xd, const int32_t plane,
    1154             :     const MacroblockdPlane *const plane_ptr,
    1155             :     const uint32_t mi_row, const uint32_t mi_col) {
    1156       61380 :     SequenceControlSet *scs_ptr = (SequenceControlSet*)pcs_ptr->parent_pcs_ptr->sequence_control_set_wrapper_ptr->object_ptr;
    1157       61380 :     EbBool is16bit = scs_ptr->static_config.encoder_bit_depth > 8;
    1158       61380 :     const int32_t col_step = MI_SIZE >> MI_SIZE_LOG2;
    1159       61380 :     const uint32_t scale_horz = plane_ptr->subsampling_x;
    1160       61380 :     const uint32_t scale_vert = plane_ptr->subsampling_y;
    1161       61380 :     uint8_t *const dst_ptr = plane_ptr->dst.buf;
    1162       61380 :     const int32_t dst_stride = plane_ptr->dst.stride;
    1163       61380 :     const int32_t y_range = scs_ptr->seq_header.sb_size == BLOCK_128X128 ? (MAX_MIB_SIZE >> scale_vert) : (SB64_MIB_SIZE >> scale_vert);
    1164       61380 :     const int32_t x_range = scs_ptr->seq_header.sb_size == BLOCK_128X128 ? (MAX_MIB_SIZE >> scale_horz) : (SB64_MIB_SIZE >> scale_horz);
    1165       61380 :     uint32_t mi_stride = pcs_ptr->mi_stride;
    1166      777460 :     for (int32_t x = 0; x < x_range; x += col_step) {
    1167      716545 :         uint8_t *p = dst_ptr + ((x * MI_SIZE) << plane_ptr->is16Bit);
    1168     2091240 :         for (int32_t y = 0; y < y_range;) {
    1169             :             // inner loop always filters horizontal edges in a MI block. If MI size
    1170             :             // is 8x8, it will first filter the horizontal edge aligned with an 8x8
    1171             :             // block. If a 4x4 transform is used, it will then filter the internal
    1172             :             // edge aligned with a 4x4 block
    1173     1375160 :             const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
    1174     1375160 :             const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
    1175             :             uint32_t advance_units;
    1176             :             TxSize tx_size;
    1177             :             AV1_DEBLOCKING_PARAMETERS params;
    1178     1375160 :             memset(&params, 0, sizeof(params));
    1179             : 
    1180             :             tx_size =
    1181     1375160 :                 set_lpf_parameters(
    1182             :                     &params,
    1183             :                     //(pcs_ptr->parent_pcs_ptr->av1_cm->mi_stride << scale_vert),
    1184     1375160 :                     (mi_stride << scale_vert),
    1185             :                     pcs_ptr,
    1186             :                     xd,
    1187             :                     HORZ_EDGE,
    1188             :                     curr_x,
    1189             :                     curr_y,
    1190             :                     plane,
    1191             :                     plane_ptr);
    1192     1374680 :             if (tx_size == TX_INVALID) {
    1193           0 :                 params.filter_length = 0;
    1194           0 :                 tx_size = TX_4X4;
    1195             :             }
    1196             : 
    1197     1374680 :             switch (params.filter_length) {
    1198             :                 // apply 4-tap filtering
    1199      197647 :             case 4:
    1200      197647 :                 if (is16bit)
    1201           0 :                     aom_highbd_lpf_horizontal_4(
    1202             :                     (uint16_t*)(p),//CONVERT_TO_SHORTPTR(p),
    1203             :                         dst_stride,
    1204             :                         params.mblim,
    1205             :                         params.lim,
    1206             :                         params.hev_thr,
    1207           0 :                         scs_ptr->static_config.encoder_bit_depth);
    1208             :                 else
    1209      197647 :                     aom_lpf_horizontal_4(
    1210             :                         p,
    1211             :                         dst_stride,
    1212             :                         params.mblim,
    1213             :                         params.lim,
    1214             :                         params.hev_thr);
    1215      197682 :                 break;
    1216             :                 // apply 6-tap filtering
    1217      283243 :             case 6:
    1218      283243 :                 assert(plane != 0);
    1219      283243 :                 if (is16bit)
    1220           0 :                     aom_highbd_lpf_horizontal_6(
    1221             :                     (uint16_t*)(p),//CONVERT_TO_SHORTPTR(p),
    1222             :                         dst_stride,
    1223             :                         params.mblim,
    1224             :                         params.lim,
    1225             :                         params.hev_thr,
    1226           0 :                         scs_ptr->static_config.encoder_bit_depth);
    1227             :                 else
    1228      283243 :                     aom_lpf_horizontal_6(
    1229             :                         p,
    1230             :                         dst_stride,
    1231             :                         params.mblim,
    1232             :                         params.lim,
    1233             :                         params.hev_thr);
    1234      283244 :                 break;
    1235             :                 // apply 8-tap filtering
    1236      138627 :             case 8:
    1237      138627 :                 if (is16bit)
    1238           0 :                     aom_highbd_lpf_horizontal_8(
    1239             :                     (uint16_t*)(p),//CONVERT_TO_SHORTPTR(p),
    1240             :                         dst_stride,
    1241             :                         params.mblim,
    1242             :                         params.lim,
    1243             :                         params.hev_thr,
    1244           0 :                         scs_ptr->static_config.encoder_bit_depth);
    1245             :                 else
    1246      138627 :                     aom_lpf_horizontal_8(
    1247             :                         p,
    1248             :                         dst_stride,
    1249             :                         params.mblim,
    1250             :                         params.lim,
    1251             :                         params.hev_thr);
    1252      138632 :                 break;
    1253             :                 // apply 14-tap filtering
    1254      474165 :             case 14:
    1255      474165 :                 if (is16bit)
    1256           0 :                     aom_highbd_lpf_horizontal_14(
    1257             :                     (uint16_t*)(p),//CONVERT_TO_SHORTPTR(p),
    1258             :                         dst_stride,
    1259             :                         params.mblim,
    1260             :                         params.lim,
    1261             :                         params.hev_thr,
    1262           0 :                         scs_ptr->static_config.encoder_bit_depth);
    1263             :                 else
    1264      474165 :                     aom_lpf_horizontal_14(
    1265             :                         p,
    1266             :                         dst_stride,
    1267             :                         params.mblim,
    1268             :                         params.lim,
    1269             :                         params.hev_thr);
    1270      474141 :                 break;
    1271             :                 // no filtering
    1272      280997 :             default: break;
    1273             :             }
    1274             : 
    1275             :             // advance the destination pointer
    1276     1374700 :             assert(tx_size < TX_SIZES_ALL);
    1277     1374700 :             advance_units = tx_size_high_unit[tx_size];
    1278     1374700 :             y += advance_units;
    1279     1374700 :             p += ((advance_units * dst_stride * MI_SIZE) << plane_ptr->is16Bit);
    1280             :         }
    1281             :     }
    1282       60915 : }
    1283             : 
    1284             : // New function to filter each superblock (64x64 or 128x128, per seq_header.sb_size)
    1285       64799 : void loop_filter_sb(
    1286             :     EbPictureBufferDesc *frame_buffer,//reconpicture,
    1287             :     //Yv12BufferConfig *frame_buffer,
    1288             :     PictureControlSet *pcs_ptr,
    1289             :     MacroBlockD *xd, int32_t mi_row, int32_t mi_col,
    1290             :     int32_t plane_start, int32_t plane_end,
    1291             :     uint8_t LastCol) {
    1292       64799 :     FrameHeader *frm_hdr = &pcs_ptr->parent_pcs_ptr->frm_hdr;
    1293             :     struct MacroblockdPlane pd[3];
    1294             :     int32_t plane;
    1295             : 
    1296       64799 :     pd[0].subsampling_x = 0;
    1297       64799 :     pd[0].subsampling_y = 0;
    1298       64799 :     pd[0].plane_type = PLANE_TYPE_Y;
    1299       64799 :     pd[0].is16Bit = frame_buffer->bit_depth > 8;
    1300       64799 :     pd[1].subsampling_x = 1;
    1301       64799 :     pd[1].subsampling_y = 1;
    1302       64799 :     pd[1].plane_type = PLANE_TYPE_UV;
    1303       64799 :     pd[1].is16Bit = frame_buffer->bit_depth > 8;
    1304       64799 :     pd[2].subsampling_x = 1;
    1305       64799 :     pd[2].subsampling_y = 1;
    1306       64799 :     pd[2].plane_type = PLANE_TYPE_UV;
    1307       64799 :     pd[2].is16Bit = frame_buffer->bit_depth > 8;
    1308             : 
    1309      136438 :     for (plane = plane_start; plane < plane_end; plane++) {
    1310       75359 :         if (plane == 0 && !(frm_hdr->loop_filter_params.filter_level[0]) && !(frm_hdr->loop_filter_params.filter_level[1]))
    1311             :             break;
    1312       71639 :         else if (plane == 1 && !(frm_hdr->loop_filter_params.filter_level_u))
    1313        4560 :             continue;
    1314       67079 :         else if (plane == 2 && !(frm_hdr->loop_filter_params.filter_level_v))
    1315        5700 :             continue;
    1316             : 
    1317       61379 :         if (frm_hdr->loop_filter_params.combine_vert_horz_lf) {
    1318             :             // filter all vertical and horizontal edges in every 64x64 super block
    1319             :             // filter vertical edges
    1320       61379 :             eb_av1_setup_dst_planes(pd, pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header.sb_size, frame_buffer, mi_row,
    1321             :                 mi_col, plane, plane + 1);
    1322       61377 :             eb_av1_filter_block_plane_vert(pcs_ptr, xd, plane, &pd[plane], mi_row,
    1323             :                 mi_col);
    1324             :             // filter horizontal edges
    1325       61380 :             int32_t max_mib_size = pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header.sb_size == BLOCK_128X128 ? MAX_MIB_SIZE : SB64_MIB_SIZE;
    1326             : 
    1327       61380 :             if (mi_col - max_mib_size >= 0) {
    1328       55242 :                 eb_av1_setup_dst_planes(pd, pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header.sb_size, frame_buffer,
    1329             :                     mi_row, mi_col - max_mib_size, plane,
    1330             :                     plane + 1);
    1331       55242 :                 eb_av1_filter_block_plane_horz(pcs_ptr, xd, plane, &pd[plane], mi_row,
    1332       55242 :                     mi_col - max_mib_size);
    1333             :             }
    1334             :             // Filter the horizontal edges of the last lcu in each row
    1335       61380 :             if (LastCol) {
    1336        6138 :                 eb_av1_setup_dst_planes(pd, pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header.sb_size, frame_buffer,
    1337             :                     mi_row, mi_col, plane,
    1338             :                     plane + 1);
    1339        6138 :                 eb_av1_filter_block_plane_horz(pcs_ptr, xd, plane, &pd[plane], mi_row,
    1340             :                     mi_col);
    1341             :             }
    1342             :         }
    1343             :         else {
    1344             :             // filter all vertical edges in every 64x64 super block
    1345           0 :             eb_av1_setup_dst_planes(pd, pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header.sb_size, frame_buffer, mi_row,
    1346             :                 mi_col, plane, plane + 1);
    1347             : 
    1348           0 :             eb_av1_filter_block_plane_vert(pcs_ptr, xd, plane, &pd[plane], mi_row,
    1349             :                 mi_col);
    1350             : 
    1351             :             // filter all horizontal edges in every 64x64 super block
    1352           0 :             eb_av1_setup_dst_planes(pd, pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->seq_header.sb_size, frame_buffer, mi_row,
    1353             :                 mi_col, plane, plane + 1);
    1354           0 :             eb_av1_filter_block_plane_horz(pcs_ptr, xd, plane, &pd[plane], mi_row,
    1355             :                 mi_col);
    1356             :         }
    1357             :     }
    1358       64799 : }
    1359             : 
    1360        1050 : void eb_av1_loop_filter_frame(
    1361             :     EbPictureBufferDesc *frame_buffer,
    1362             :     PictureControlSet *picture_control_set_ptr,
    1363             :     int32_t plane_start, int32_t plane_end) {
    1364        1050 :     SequenceControlSet *scs_ptr = (SequenceControlSet*)picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_wrapper_ptr->object_ptr;
    1365             :     //LargestCodingUnit                     *sb_ptr;
    1366             :     //uint16_t                                   sb_index;
    1367        1050 :     uint8_t                                   sb_size_Log2 = (uint8_t)Log2f(scs_ptr->sb_size_pix);
    1368             :     uint32_t                                   x_lcu_index;
    1369             :     uint32_t                                   y_lcu_index;
    1370             :     uint32_t                                   sb_origin_x;
    1371             :     uint32_t                                   sb_origin_y;
    1372             :     EbBool                                  endOfRowFlag;
    1373             : 
    1374        1050 :     uint32_t picture_width_in_sb = (scs_ptr->seq_header.max_frame_width + scs_ptr->sb_size_pix - 1) / scs_ptr->sb_size_pix;
    1375        1050 :     uint32_t picture_height_in_sb = (scs_ptr->seq_header.max_frame_height + scs_ptr->sb_size_pix - 1) / scs_ptr->sb_size_pix;
    1376             : 
    1377        1050 :     eb_av1_loop_filter_frame_init(&picture_control_set_ptr->parent_pcs_ptr->frm_hdr,
    1378        1050 :         &picture_control_set_ptr->parent_pcs_ptr->lf_info, plane_start, plane_end);
    1379             : 
    1380        7350 :     for (y_lcu_index = 0; y_lcu_index < picture_height_in_sb; ++y_lcu_index) {
    1381       69299 :         for (x_lcu_index = 0; x_lcu_index < picture_width_in_sb; ++x_lcu_index) {
    1382             :             //sb_index        = (uint16_t)(y_lcu_index * picture_width_in_sb + x_lcu_index);
    1383             :             //sb_ptr          = picture_control_set_ptr->sb_ptr_array[sb_index];
    1384       62999 :             sb_origin_x = x_lcu_index << sb_size_Log2;
    1385       62999 :             sb_origin_y = y_lcu_index << sb_size_Log2;
    1386       62999 :             endOfRowFlag = (x_lcu_index == picture_width_in_sb - 1) ? EB_TRUE : EB_FALSE;
    1387             : 
    1388       62999 :             loop_filter_sb(
    1389             :                 frame_buffer,
    1390             :                 picture_control_set_ptr,
    1391             :                 NULL,
    1392       62999 :                 sb_origin_y >> 2,
    1393       62999 :                 sb_origin_x >> 2,
    1394             :                 plane_start,
    1395             :                 plane_end,
    1396             :                 endOfRowFlag);
    1397             :         }
    1398             :     }
    1399        1050 : }
    1400             : extern int16_t eb_av1_ac_quant_Q3(int32_t qindex, int32_t delta, AomBitDepth bit_depth);
    1401             : 
    1402        1170 : void EbCopyBuffer(
    1403             :     EbPictureBufferDesc  *srcBuffer,
    1404             :     EbPictureBufferDesc  *dstBuffer,
    1405             :     PictureControlSet    *pcs_ptr,
    1406             :     uint8_t                   plane) {
    1407        1170 :     EbBool is16bit = (EbBool)(pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->static_config.encoder_bit_depth > EB_8BIT);
    1408        1170 :     dstBuffer->origin_x = srcBuffer->origin_x;
    1409        1170 :     dstBuffer->origin_y = srcBuffer->origin_y;
    1410        1170 :     dstBuffer->width = srcBuffer->width;
    1411        1170 :     dstBuffer->height = srcBuffer->height;
    1412        1170 :     dstBuffer->max_width = srcBuffer->max_width;
    1413        1170 :     dstBuffer->max_height = srcBuffer->max_height;
    1414        1170 :     dstBuffer->bit_depth = srcBuffer->bit_depth;
    1415        1170 :     dstBuffer->luma_size = srcBuffer->luma_size;
    1416        1170 :     dstBuffer->chroma_size = srcBuffer->chroma_size;
    1417        1170 :     dstBuffer->packedFlag = srcBuffer->packedFlag;
    1418             : 
    1419        1170 :     uint32_t   lumaBufferOffset = (srcBuffer->origin_x + srcBuffer->origin_y*srcBuffer->stride_y) << is16bit;
    1420        1170 :     uint16_t   luma_width = (uint16_t)(srcBuffer->width - pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->pad_right) << is16bit;
    1421        1170 :     uint16_t   luma_height = (uint16_t)(srcBuffer->height - pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->pad_bottom);
    1422        1170 :     uint16_t   chroma_width = (luma_width >> 1);
    1423        1170 :     if (plane == 0) {
    1424         502 :         uint16_t stride_y = srcBuffer->stride_y << is16bit;
    1425             : 
    1426         502 :         dstBuffer->stride_y = srcBuffer->stride_y;
    1427         502 :         dstBuffer->stride_bit_inc_y = srcBuffer->stride_bit_inc_y;
    1428             : 
    1429      181222 :         for (int32_t inputRowIndex = 0; inputRowIndex < luma_height; inputRowIndex++) {
    1430      180720 :             EB_MEMCPY((dstBuffer->buffer_y + lumaBufferOffset + stride_y * inputRowIndex),
    1431             :                 (srcBuffer->buffer_y + lumaBufferOffset + stride_y * inputRowIndex),
    1432             :                 luma_width);
    1433             :         }
    1434             :     }
    1435         668 :     else if (plane == 1) {
    1436         340 :         uint16_t stride_cb = srcBuffer->stride_cb << is16bit;
    1437         340 :         dstBuffer->stride_cb = srcBuffer->stride_cb;
    1438         340 :         dstBuffer->stride_bit_inc_cb = srcBuffer->stride_bit_inc_cb;
    1439             : 
    1440         340 :         uint32_t   chromaBufferOffset = (srcBuffer->origin_x / 2 + srcBuffer->origin_y / 2 * srcBuffer->stride_cb) << is16bit;
    1441             : 
    1442       61540 :         for (int32_t inputRowIndex = 0; inputRowIndex < luma_height >> 1; inputRowIndex++) {
    1443       61200 :             EB_MEMCPY((dstBuffer->buffer_cb + chromaBufferOffset + stride_cb * inputRowIndex),
    1444             :                 (srcBuffer->buffer_cb + chromaBufferOffset + stride_cb * inputRowIndex),
    1445             :                 chroma_width);
    1446             :         }
    1447             :     }
    1448         328 :     else if (plane == 2) {
    1449         328 :         uint16_t stride_cr = srcBuffer->stride_cr << is16bit;
    1450             : 
    1451         328 :         dstBuffer->stride_cr = srcBuffer->stride_cr;
    1452         328 :         dstBuffer->stride_bit_inc_cr = srcBuffer->stride_bit_inc_cr;
    1453             : 
    1454         328 :         uint32_t   chromaBufferOffset = (srcBuffer->origin_x / 2 + srcBuffer->origin_y / 2 * srcBuffer->stride_cr) << is16bit;
    1455             : 
    1456       59368 :         for (int32_t inputRowIndex = 0; inputRowIndex < luma_height >> 1; inputRowIndex++) {
    1457       59040 :             EB_MEMCPY((dstBuffer->buffer_cr + chromaBufferOffset + stride_cr * inputRowIndex),
    1458             :                 (srcBuffer->buffer_cr + chromaBufferOffset + stride_cr * inputRowIndex),
    1459             :                 chroma_width);
    1460             :         }
    1461             :     }
    1462        1170 : }
    1463             : 
    1464             : //int32_t av1_get_max_filter_level(const Av1Comp *cpi) {
    1465             : //    if (cpi->oxcf.pass == 2) {
    1466             : //        return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
    1467             : //            : MAX_LOOP_FILTER;
    1468             : //    }
    1469             : //    else {
    1470             : //        return MAX_LOOP_FILTER;
    1471             : //    }
    1472             : //}
    1473             : 
    1474         990 : uint64_t PictureSseCalculations(
    1475             :     PictureControlSet    *picture_control_set_ptr,
    1476             :     EbPictureBufferDesc *recon_ptr,
    1477             :     int32_t plane)
    1478             : 
    1479             : {
    1480         990 :     SequenceControlSet   *sequence_control_set_ptr = picture_control_set_ptr->parent_pcs_ptr->sequence_control_set_ptr;
    1481         990 :     EbBool is16bit = (sequence_control_set_ptr->static_config.encoder_bit_depth > EB_8BIT);
    1482             : 
    1483         990 :     if (!is16bit) {
    1484         990 :         EbPictureBufferDesc *input_picture_ptr = (EbPictureBufferDesc*)picture_control_set_ptr->parent_pcs_ptr->enhanced_picture_ptr;
    1485             : 
    1486             :         uint32_t   columnIndex;
    1487         990 :         uint32_t   row_index = 0;
    1488         990 :         uint64_t   residualDistortion = 0;
    1489             :         EbByte  inputBuffer;
    1490             :         EbByte  reconCoeffBuffer;
    1491         990 :         if (plane == 0) {
    1492         442 :             reconCoeffBuffer = &((recon_ptr->buffer_y)[recon_ptr->origin_x + recon_ptr->origin_y * recon_ptr->stride_y]);
    1493         442 :             inputBuffer = &((input_picture_ptr->buffer_y)[input_picture_ptr->origin_x + input_picture_ptr->origin_y * input_picture_ptr->stride_y]);
    1494             : 
    1495         442 :             residualDistortion = 0;
    1496             : 
    1497      159562 :             while (row_index < sequence_control_set_ptr->seq_header.max_frame_height) {
    1498      159120 :                 columnIndex = 0;
    1499   101915000 :                 while (columnIndex < sequence_control_set_ptr->seq_header.max_frame_width) {
    1500   101755000 :                     residualDistortion += (int64_t)SQR((int64_t)(inputBuffer[columnIndex]) - (reconCoeffBuffer[columnIndex]));
    1501   101755000 :                     ++columnIndex;
    1502             :                 }
    1503      159120 :                 inputBuffer += input_picture_ptr->stride_y;
    1504      159120 :                 reconCoeffBuffer += recon_ptr->stride_y;
    1505      159120 :                 ++row_index;
    1506             :             }
    1507             : 
    1508         442 :             return residualDistortion;
    1509             :         }
    1510             : 
    1511         548 :         else if (plane == 1) {
    1512         280 :             reconCoeffBuffer = &((recon_ptr->buffer_cb)[recon_ptr->origin_x / 2 + recon_ptr->origin_y / 2 * recon_ptr->stride_cb]);
    1513         280 :             inputBuffer = &((input_picture_ptr->buffer_cb)[input_picture_ptr->origin_x / 2 + input_picture_ptr->origin_y / 2 * input_picture_ptr->stride_cb]);
    1514             : 
    1515         280 :             residualDistortion = 0;
    1516         280 :             row_index = 0;
    1517       50680 :             while (row_index < sequence_control_set_ptr->chroma_height) {
    1518       50400 :                 columnIndex = 0;
    1519    16178400 :                 while (columnIndex < sequence_control_set_ptr->chroma_width) {
    1520    16128000 :                     residualDistortion += (int64_t)SQR((int64_t)(inputBuffer[columnIndex]) - (reconCoeffBuffer[columnIndex]));
    1521    16128000 :                     ++columnIndex;
    1522             :                 }
    1523             : 
    1524       50400 :                 inputBuffer += input_picture_ptr->stride_cb;
    1525       50400 :                 reconCoeffBuffer += recon_ptr->stride_cb;
    1526       50400 :                 ++row_index;
    1527             :             }
    1528             : 
    1529         280 :             return residualDistortion;
    1530             :         }
    1531         268 :         else if (plane == 2) {
    1532         268 :             reconCoeffBuffer = &((recon_ptr->buffer_cr)[recon_ptr->origin_x / 2 + recon_ptr->origin_y / 2 * recon_ptr->stride_cr]);
    1533         268 :             inputBuffer = &((input_picture_ptr->buffer_cr)[input_picture_ptr->origin_x / 2 + input_picture_ptr->origin_y / 2 * input_picture_ptr->stride_cr]);
    1534         268 :             residualDistortion = 0;
    1535         268 :             row_index = 0;
    1536             : 
    1537       48508 :             while (row_index < sequence_control_set_ptr->chroma_height) {
    1538       48240 :                 columnIndex = 0;
    1539    15485000 :                 while (columnIndex < sequence_control_set_ptr->chroma_width) {
    1540    15436800 :                     residualDistortion += (int64_t)SQR((int64_t)(inputBuffer[columnIndex]) - (reconCoeffBuffer[columnIndex]));
    1541    15436800 :                     ++columnIndex;
    1542             :                 }
    1543             : 
    1544       48240 :                 inputBuffer += input_picture_ptr->stride_cr;
    1545       48240 :                 reconCoeffBuffer += recon_ptr->stride_cr;
    1546       48240 :                 ++row_index;
    1547             :             }
    1548             : 
    1549         268 :             return residualDistortion;
    1550             :         }
    1551           0 :         return 0;
    1552             :     }
    1553             :     else {
    1554           0 :         EbPictureBufferDesc *input_picture_ptr = (EbPictureBufferDesc*)picture_control_set_ptr->input_frame16bit;
    1555             : 
    1556             :         uint32_t   columnIndex;
    1557           0 :         uint32_t   row_index = 0;
    1558           0 :         uint64_t   residualDistortion = 0;
    1559             :         uint16_t*  inputBuffer;
    1560             :         uint16_t*  reconCoeffBuffer;
    1561           0 :         if (plane == 0) {
    1562           0 :             reconCoeffBuffer = (uint16_t*)&((recon_ptr->buffer_y)[(recon_ptr->origin_x + recon_ptr->origin_y * recon_ptr->stride_y) << is16bit]);
    1563           0 :             inputBuffer = (uint16_t*)&((input_picture_ptr->buffer_y)[(input_picture_ptr->origin_x + input_picture_ptr->origin_y * input_picture_ptr->stride_y) << is16bit]);
    1564             : 
    1565           0 :             residualDistortion = 0;
    1566             : 
    1567           0 :             while (row_index < sequence_control_set_ptr->seq_header.max_frame_height) {
    1568           0 :                 columnIndex = 0;
    1569           0 :                 while (columnIndex < sequence_control_set_ptr->seq_header.max_frame_width) {
    1570           0 :                     residualDistortion += (int64_t)SQR(((int64_t)inputBuffer[columnIndex]) - (int64_t)(reconCoeffBuffer[columnIndex]));
    1571           0 :                     ++columnIndex;
    1572             :                 }
    1573             : 
    1574           0 :                 inputBuffer += input_picture_ptr->stride_y;
    1575           0 :                 reconCoeffBuffer += recon_ptr->stride_y;
    1576           0 :                 ++row_index;
    1577             :             }
    1578             : 
    1579           0 :             return residualDistortion;
    1580             :         }
    1581             : 
    1582           0 :         else if (plane == 1) {
    1583           0 :             reconCoeffBuffer = (uint16_t*)&((recon_ptr->buffer_cb)[(recon_ptr->origin_x / 2 + recon_ptr->origin_y / 2 * recon_ptr->stride_cb) << is16bit]);
    1584           0 :             inputBuffer = (uint16_t*)&((input_picture_ptr->buffer_cb)[(input_picture_ptr->origin_x / 2 + input_picture_ptr->origin_y / 2 * input_picture_ptr->stride_cb) << is16bit]);
    1585             : 
    1586           0 :             residualDistortion = 0;
    1587           0 :             row_index = 0;
    1588           0 :             while (row_index < sequence_control_set_ptr->chroma_height) {
    1589           0 :                 columnIndex = 0;
    1590           0 :                 while (columnIndex < sequence_control_set_ptr->chroma_width) {
    1591           0 :                     residualDistortion += (int64_t)SQR(((int64_t)inputBuffer[columnIndex]) - (int64_t)(reconCoeffBuffer[columnIndex]));
    1592           0 :                     ++columnIndex;
    1593             :                 }
    1594             : 
    1595           0 :                 inputBuffer += input_picture_ptr->stride_cb;
    1596           0 :                 reconCoeffBuffer += recon_ptr->stride_cb;
    1597           0 :                 ++row_index;
    1598             :             }
    1599             : 
    1600           0 :             return residualDistortion;
    1601             :         }
    1602           0 :         else if (plane == 2) {
    1603           0 :             reconCoeffBuffer = (uint16_t*)&((recon_ptr->buffer_cr)[(recon_ptr->origin_x / 2 + recon_ptr->origin_y / 2 * recon_ptr->stride_cr) << is16bit]);
    1604           0 :             inputBuffer = (uint16_t*)&((input_picture_ptr->buffer_cr)[(input_picture_ptr->origin_x / 2 + input_picture_ptr->origin_y / 2 * input_picture_ptr->stride_cr) << is16bit]);
    1605           0 :             residualDistortion = 0;
    1606           0 :             row_index = 0;
    1607             : 
    1608           0 :             while (row_index < sequence_control_set_ptr->chroma_height) {
    1609           0 :                 columnIndex = 0;
    1610           0 :                 while (columnIndex < sequence_control_set_ptr->chroma_width) {
    1611           0 :                     residualDistortion += (int64_t)SQR(((int64_t)inputBuffer[columnIndex]) - (int64_t)(reconCoeffBuffer[columnIndex]));
    1612           0 :                     ++columnIndex;
    1613             :                 }
    1614             : 
    1615           0 :                 inputBuffer += input_picture_ptr->stride_cr;
    1616           0 :                 reconCoeffBuffer += recon_ptr->stride_cr;
    1617           0 :                 ++row_index;
    1618             :             }
    1619             : 
    1620           0 :             return residualDistortion;
    1621             :         }
    1622             : 
    1623           0 :         return 0;
    1624             :     }
    1625             : }
    1626             : 
    1627         990 : static int64_t try_filter_frame(
    1628             :     //const Yv12BufferConfig *sd,
    1629             :     //Av1Comp *const cpi,
    1630             :     const EbPictureBufferDesc *sd,
    1631             :     EbPictureBufferDesc  *tempLfReconBuffer,
    1632             :     PictureControlSet *pcs_ptr,
    1633             :     int32_t filt_level,
    1634             :     int32_t partial_frame, int32_t plane, int32_t dir) {
    1635             :     (void)sd;
    1636             :     (void)partial_frame;
    1637             :     (void)sd;
    1638             :     int64_t filt_err;
    1639         990 :     FrameHeader *frm_hdr = &pcs_ptr->parent_pcs_ptr->frm_hdr;
    1640         990 :     assert(plane >= 0 && plane <= 2);
    1641         990 :     int32_t filter_level[2] = { filt_level, filt_level };
    1642         990 :     if (plane == 0 && dir == 0) filter_level[1] = frm_hdr->loop_filter_params.filter_level[1];
    1643         990 :     if (plane == 0 && dir == 1) filter_level[0] = frm_hdr->loop_filter_params.filter_level[0];
    1644             : 
    1645         990 :     EbBool is16bit = (EbBool)(pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->static_config.encoder_bit_depth > EB_8BIT);
    1646         990 :     EbPictureBufferDesc  *recon_buffer = is16bit ? pcs_ptr->recon_picture16bit_ptr : pcs_ptr->recon_picture_ptr;
    1647         990 :     if (pcs_ptr->parent_pcs_ptr->is_used_as_reference_flag == EB_TRUE) {
    1648             :         //get the 16bit form of the input LCU
    1649         657 :         if (is16bit)
    1650           0 :             recon_buffer = ((EbReferenceObject*)pcs_ptr->parent_pcs_ptr->reference_picture_wrapper_ptr->object_ptr)->reference_picture16bit;
    1651             :         else
    1652         657 :             recon_buffer = ((EbReferenceObject*)pcs_ptr->parent_pcs_ptr->reference_picture_wrapper_ptr->object_ptr)->reference_picture;
    1653             :     }
    1654             :     else { // non ref pictures
    1655         333 :         recon_buffer = is16bit ? pcs_ptr->recon_picture16bit_ptr : pcs_ptr->recon_picture_ptr;
    1656             :     }
    1657             : 
    1658             :     // set base filters for use of get_filter_level when in DELTA_Q_LF mode
    1659         990 :     switch (plane) {
    1660         442 :     case 0:
    1661         442 :         frm_hdr->loop_filter_params.filter_level[0] = filter_level[0];
    1662         442 :         frm_hdr->loop_filter_params.filter_level[1] = filter_level[1];
    1663         442 :         break;
    1664         280 :     case 1: frm_hdr->loop_filter_params.filter_level_u = filter_level[0]; break;
    1665         268 :     case 2: frm_hdr->loop_filter_params.filter_level_v = filter_level[0]; break;
    1666             :     }
    1667             : 
    1668         990 :     eb_av1_loop_filter_frame(recon_buffer, pcs_ptr, plane, plane + 1);
    1669             : 
    1670         990 :     filt_err = PictureSseCalculations(pcs_ptr, recon_buffer, plane);
    1671             : 
    1672             :     // Re-instate the unfiltered frame
    1673         990 :     EbCopyBuffer(tempLfReconBuffer/*cpi->last_frame_uf*/, recon_buffer /*cm->frame_to_show*/, pcs_ptr, (uint8_t)plane);
    1674             : 
    1675         990 :     return filt_err;
    1676             : }
    1677         180 : static int32_t search_filter_level(
    1678             :     //const Yv12BufferConfig *sd, Av1Comp *cpi,
    1679             :     EbPictureBufferDesc *sd, // source
    1680             :     EbPictureBufferDesc  *tempLfReconBuffer,
    1681             :     PictureControlSet *pcs_ptr,
    1682             :     int32_t partial_frame,
    1683             :     const int32_t *last_frame_filter_level,
    1684             :     double *best_cost_ret, int32_t plane, int32_t dir) {
    1685         180 :     const int32_t min_filter_level = 0;
    1686         180 :     const int32_t max_filter_level = MAX_LOOP_FILTER;// av1_get_max_filter_level(cpi);
    1687         180 :     int32_t filt_direction = 0;
    1688             :     int64_t best_err;
    1689             :     int32_t filt_best;
    1690         180 :     FrameHeader *frm_hdr = &pcs_ptr->parent_pcs_ptr->frm_hdr;
    1691             :     //Macroblock *x = &cpi->td.mb;
    1692             : 
    1693             :     // Start the search at the previous frame filter level unless it is now out of
    1694             :     // range.
    1695             :     int32_t lvl;
    1696         180 :     switch (plane) {
    1697          60 :     case 0: lvl = last_frame_filter_level[dir]; break;
    1698          60 :     case 1: lvl = last_frame_filter_level[2]; break;
    1699          60 :     case 2: lvl = last_frame_filter_level[3]; break;
    1700           0 :     default: assert(plane >= 0 && plane <= 2); return 0;
    1701             :     }
    1702         180 :     int32_t filt_mid = clamp(lvl, min_filter_level, max_filter_level);
    1703         180 :     int32_t filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
    1704             : 
    1705         180 :     EbBool is16bit = (EbBool)(pcs_ptr->parent_pcs_ptr->sequence_control_set_ptr->static_config.encoder_bit_depth > EB_8BIT);
    1706         180 :     EbPictureBufferDesc  *recon_buffer = is16bit ? pcs_ptr->recon_picture16bit_ptr : pcs_ptr->recon_picture_ptr;
    1707             : 
    1708         180 :     if (pcs_ptr->parent_pcs_ptr->is_used_as_reference_flag == EB_TRUE) {
    1709             :         //get the 16bit form of the input LCU
    1710         114 :         if (is16bit)
    1711           0 :             recon_buffer = ((EbReferenceObject*)pcs_ptr->parent_pcs_ptr->reference_picture_wrapper_ptr->object_ptr)->reference_picture16bit;
    1712             :         else
    1713         114 :             recon_buffer = ((EbReferenceObject*)pcs_ptr->parent_pcs_ptr->reference_picture_wrapper_ptr->object_ptr)->reference_picture;
    1714             :     }
    1715             :     else { // non ref pictures
    1716          66 :         recon_buffer = is16bit ? pcs_ptr->recon_picture16bit_ptr : pcs_ptr->recon_picture_ptr;
    1717             :     }
    1718             :     // Sum squared error at each filter level
    1719             :     int64_t ss_err[MAX_LOOP_FILTER + 1];
    1720             : 
    1721             :     // Set each entry to -1
    1722         180 :     memset(ss_err, 0xFF, sizeof(ss_err));
    1723             :     // make a copy of recon_buffer
    1724         180 :     EbCopyBuffer(recon_buffer/*cm->frame_to_show*/, tempLfReconBuffer/*&cpi->last_frame_uf*/, pcs_ptr, (uint8_t)plane);
    1725             : 
    1726         180 :     best_err = try_filter_frame(sd, tempLfReconBuffer, pcs_ptr, filt_mid, partial_frame, plane, dir);
    1727         180 :     filt_best = filt_mid;
    1728         180 :     ss_err[filt_mid] = best_err;
    1729             : 
    1730         180 :     if (pcs_ptr->parent_pcs_ptr->loop_filter_mode <= 2) {
    1731           0 :         filter_step = 2;
    1732           0 :         const int32_t filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
    1733           0 :         const int32_t filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
    1734             : 
    1735             :         // Bias against raising loop filter in favor of lowering it.
    1736           0 :         int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
    1737             : 
    1738             :         //if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
    1739             :         //    bias = (bias * cpi->twopass.section_intra_rating) / 20;
    1740             : 
    1741             :         // yx, bias less for large block size
    1742           0 :         if (frm_hdr->tx_mode != ONLY_4X4) bias >>= 1;
    1743             : 
    1744           0 :         if (filt_direction <= 0 && filt_low != filt_mid) {
    1745             :             // Get Low filter error score
    1746           0 :             if (ss_err[filt_low] < 0) {
    1747           0 :                 ss_err[filt_low] =
    1748           0 :                     try_filter_frame(sd, tempLfReconBuffer, pcs_ptr, filt_low, partial_frame, plane, dir);
    1749             :             }
    1750             :             // If value is close to the best so far then bias towards a lower loop
    1751             :             // filter value.
    1752           0 :             if (ss_err[filt_low] < (best_err + bias)) {
    1753             :                 // Was it actually better than the previous best?
    1754           0 :                 if (ss_err[filt_low] < best_err)
    1755           0 :                     best_err = ss_err[filt_low];
    1756           0 :                 filt_best = filt_low;
    1757             :             }
    1758             :         }
    1759             : 
    1760             :         // Now look at filt_high
    1761           0 :         if (filt_direction >= 0 && filt_high != filt_mid) {
    1762           0 :             if (ss_err[filt_high] < 0) {
    1763           0 :                 ss_err[filt_high] =
    1764           0 :                     try_filter_frame(sd, tempLfReconBuffer, pcs_ptr, filt_high, partial_frame, plane, dir);
    1765             :             }
    1766             :             // If value is significantly better than previous best, bias added against
    1767             :             // raising filter value
    1768           0 :             if (ss_err[filt_high] < (best_err - bias)) {
    1769           0 :                 best_err = ss_err[filt_high];
    1770           0 :                 filt_best = filt_high;
    1771             :             }
    1772             :         }
    1773             :     }
    1774             :     else {
    1775         983 :         while (filter_step > 0) {
    1776         803 :             const int32_t filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
    1777         803 :             const int32_t filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
    1778             : 
    1779             :             // Bias against raising loop filter in favor of lowering it.
    1780         803 :             int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
    1781             : 
    1782             :             //if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
    1783             :             //    bias = (bias * cpi->twopass.section_intra_rating) / 20;
    1784             : 
    1785             :             // yx, bias less for large block size
    1786         803 :             if (frm_hdr->tx_mode != ONLY_4X4) bias >>= 1;
    1787             : 
    1788         803 :             if (filt_direction <= 0 && filt_low != filt_mid) {
    1789             :                 // Get Low filter error score
    1790         236 :                 if (ss_err[filt_low] < 0) {
    1791         162 :                     ss_err[filt_low] =
    1792         162 :                         try_filter_frame(sd, tempLfReconBuffer, pcs_ptr, filt_low, partial_frame, plane, dir);
    1793             :                 }
    1794             :                 // If value is close to the best so far then bias towards a lower loop
    1795             :                 // filter value.
    1796         236 :                 if (ss_err[filt_low] < (best_err + bias)) {
    1797             :                     // Was it actually better than the previous best?
    1798          84 :                     if (ss_err[filt_low] < best_err)
    1799          40 :                         best_err = ss_err[filt_low];
    1800          84 :                     filt_best = filt_low;
    1801             :                 }
    1802             :             }
    1803             : 
    1804             :             // Now look at filt_high
    1805         803 :             if (filt_direction >= 0 && filt_high != filt_mid) {
    1806         729 :                 if (ss_err[filt_high] < 0) {
    1807         648 :                     ss_err[filt_high] =
    1808         648 :                         try_filter_frame(sd, tempLfReconBuffer, pcs_ptr, filt_high, partial_frame, plane, dir);
    1809             :                 }
    1810             :                 // If value is significantly better than previous best, bias added against
    1811             :                 // raising filter value
    1812         729 :                 if (ss_err[filt_high] < (best_err - bias)) {
    1813         189 :                     best_err = ss_err[filt_high];
    1814         189 :                     filt_best = filt_high;
    1815             :                 }
    1816             :             }
    1817             : 
    1818             :             // Half the step distance if the best filter value was the same as last time
    1819         803 :             if (filt_best == filt_mid) {
    1820         540 :                 filter_step /= 2;
    1821         540 :                 filt_direction = 0;
    1822             :             }
    1823             :             else {
    1824         263 :                 filt_direction = (filt_best < filt_mid) ? -1 : 1;
    1825         263 :                 filt_mid = filt_best;
    1826             :             }
    1827             :         }
    1828             :     }
    1829             :     // Update best error
    1830         180 :     best_err = ss_err[filt_best];
    1831             : 
    1832         180 :     if (best_cost_ret) *best_cost_ret = (double)best_err;//RDCOST_DBL(x->rdmult, 0, best_err);
    1833         180 :     return filt_best;
    1834             : }
    1835             : 
    1836          90 : void eb_av1_pick_filter_level(
    1837             :     DlfContext            *context_ptr,
    1838             :     EbPictureBufferDesc   *srcBuffer, // source input
    1839             :     PictureControlSet     *pcs_ptr,
    1840             :     LpfPickMethod          method) {
    1841          90 :     SequenceControlSet *scs_ptr = (SequenceControlSet*)pcs_ptr->parent_pcs_ptr->sequence_control_set_wrapper_ptr->object_ptr;
    1842          90 :     FrameHeader *frm_hdr = &pcs_ptr->parent_pcs_ptr->frm_hdr;
    1843             : 
    1844          90 :     const int32_t num_planes = 3;
    1845             :     (void)srcBuffer;
    1846          90 :     struct LoopFilter *const lf = &frm_hdr->loop_filter_params;
    1847          90 :     lf->sharpness_level = frm_hdr->frame_type == KEY_FRAME ? 0 : LF_SHARPNESS;
    1848             : 
    1849          90 :     if (method == LPF_PICK_MINIMAL_LPF) {
    1850           0 :         lf->filter_level[0] = 0;
    1851           0 :         lf->filter_level[1] = 0;
    1852             :     }
    1853          90 :     else if (method >= LPF_PICK_FROM_Q) {
    1854          30 :         const int32_t min_filter_level = 0;
    1855          30 :         const int32_t max_filter_level = MAX_LOOP_FILTER;// av1_get_max_filter_level(cpi);
    1856          30 :         const int32_t q = eb_av1_ac_quant_Q3(frm_hdr->quantization_params.base_q_idx, 0, (AomBitDepth)scs_ptr->static_config.encoder_bit_depth);
    1857             :         // These values were determined by linear fitting the result of the
    1858             :         // searched level for 8 bit depth:
    1859             :         // Keyframes: filt_guess = q * 0.06699 - 1.60817
    1860             :         // Other frames: filt_guess = q * 0.02295 + 2.48225
    1861             :         //
    1862             :         // And high bit depth separately:
    1863             :         // filt_guess = q * 0.316206 + 3.87252
    1864             :         int32_t filt_guess;
    1865          30 :         switch (scs_ptr->static_config.encoder_bit_depth) {
    1866          30 :         case EB_8BIT:
    1867          60 :             filt_guess = (frm_hdr->frame_type == KEY_FRAME)
    1868           1 :                 ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
    1869          30 :                 : ROUND_POWER_OF_TWO(q * 6017 + 650707, 18);
    1870          30 :             break;
    1871           0 :         case EB_10BIT:
    1872           0 :             filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
    1873           0 :             break;
    1874           0 :         case EB_12BIT:
    1875           0 :             filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
    1876           0 :             break;
    1877           0 :         default:
    1878           0 :             assert(0 &&
    1879             :                 "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
    1880             :                 "or AOM_BITS_12");
    1881             :             return;
    1882             :         }
    1883          30 :         if (scs_ptr->static_config.encoder_bit_depth != EB_8BIT && frm_hdr->frame_type == KEY_FRAME)
    1884           0 :             filt_guess -= 4;
    1885             : 
    1886          30 :         filt_guess = filt_guess > 2 ? filt_guess - 2 : filt_guess > 1 ? filt_guess - 1 : filt_guess;
    1887          30 :         int32_t filt_guess_chroma = filt_guess > 1 ? filt_guess / 2 : filt_guess;
    1888             : 
    1889             :         // TODO(chengchen): retrain the model for Y, U, V filter levels
    1890          30 :         lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
    1891          30 :         lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level);
    1892          30 :         lf->filter_level_u = clamp(filt_guess_chroma, min_filter_level, max_filter_level);
    1893          30 :         lf->filter_level_v = clamp(filt_guess_chroma, min_filter_level, max_filter_level);
    1894             :     }
    1895             :     else {
    1896          60 :         const int32_t last_frame_filter_level[4] = { lf->filter_level[0],
    1897          60 :             lf->filter_level[1],
    1898          60 :             lf->filter_level_u,
    1899          60 :             lf->filter_level_v };
    1900          60 :         EbPictureBufferDesc  *tempLfReconBuffer = (scs_ptr->static_config.encoder_bit_depth != EB_8BIT) ? context_ptr->temp_lf_recon_picture16bit_ptr : context_ptr->temp_lf_recon_picture_ptr;
    1901             : 
    1902          60 :         lf->filter_level[0] = lf->filter_level[1] =
    1903          60 :             search_filter_level(srcBuffer, tempLfReconBuffer, pcs_ptr, method == LPF_PICK_FROM_SUBIMAGE,
    1904             :                 last_frame_filter_level, NULL, 0, 2);
    1905             : 
    1906          60 :         if (num_planes > 1) {
    1907          60 :             lf->filter_level_u =
    1908          60 :                 search_filter_level(srcBuffer, tempLfReconBuffer, pcs_ptr, method == LPF_PICK_FROM_SUBIMAGE,
    1909             :                     last_frame_filter_level, NULL, 1, 0);
    1910          60 :             lf->filter_level_v =
    1911          60 :                 search_filter_level(srcBuffer, tempLfReconBuffer, pcs_ptr, method == LPF_PICK_FROM_SUBIMAGE,
    1912             :                     last_frame_filter_level, NULL, 2, 0);
    1913             :         }
    1914             :     }
    1915             : }

Generated by: LCOV version 1.14