LCOV - code coverage report
Current view: top level - ASM_AVX2 - EbNoiseExtractAVX2.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 493 0.0 %
Date: 2019-11-25 17:38:06 Functions: 0 8 0.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : 
       6             : #include "EbNoiseExtractAVX2.h"
       7             : #include "EbDefinitions.h"
       8             : #include "immintrin.h"
       9             : #include "EbUtility.h"
      10             : 
      11             : EB_EXTERN EB_ALIGN(16) const uint8_t filterType[] = {
      12             :     1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4
      13             : };
      14             : 
      15             : EB_EXTERN EB_ALIGN(16) const uint8_t WeakChromafilter[2][32] = {
      16             :         { 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4 },
      17             :         { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 },
      18             : };
      19             : 
      20           0 : inline void luma_weak_filter_avx2_intrin(
      21             :     __m256i                        top,
      22             :     __m256i                        curr,
      23             :     __m256i                        bottom,
      24             :     __m256i                        curr_prev,
      25             :     __m256i                        curr_next,
      26             :     uint8_t                       *ptr_denoised,
      27             :     uint8_t                        *ptr_noise
      28             : )
      29             : {
      30             :     __m256i  topFirstHalf, bottomFirstHalf,
      31             :         filterFirstHalf, filterSecondHalf,
      32             :         currNextFirstHalf, currNextSecondHalf,
      33             :         weights, currLeftMidFirstHalfWeight,
      34             :         currLeftMidFirstHalflo, currLeftMidFirstHalfhi, currPrevPermutation, currPermutation, currNextPermutation,
      35             :         topPermutation, bottomPermutation;
      36             : 
      37           0 :     currPrevPermutation = _mm256_permute4x64_epi64(curr_prev, 216);
      38           0 :     currPermutation = _mm256_permute4x64_epi64(curr, 216);
      39           0 :     currLeftMidFirstHalflo = _mm256_unpacklo_epi8(currPrevPermutation, currPermutation);
      40           0 :     weights = _mm256_loadu_si256((__m256i*)filterType);
      41           0 :     currLeftMidFirstHalfWeight = _mm256_maddubs_epi16(currLeftMidFirstHalflo, weights);
      42           0 :     currNextPermutation = _mm256_permute4x64_epi64(curr_next, 88);
      43           0 :     currNextFirstHalf = _mm256_unpacklo_epi8(currNextPermutation, _mm256_setzero_si256());
      44           0 :     currLeftMidFirstHalflo = _mm256_add_epi16(currNextFirstHalf, currLeftMidFirstHalfWeight);
      45             : 
      46           0 :     currLeftMidFirstHalfhi = _mm256_unpackhi_epi8(currPrevPermutation, currPermutation);
      47           0 :     currLeftMidFirstHalfWeight = _mm256_maddubs_epi16(currLeftMidFirstHalfhi, weights);
      48           0 :     currNextPermutation = _mm256_permute4x64_epi64(curr_next, 216);
      49           0 :     currNextSecondHalf = _mm256_unpackhi_epi8(currNextPermutation, _mm256_setzero_si256());
      50           0 :     currLeftMidFirstHalfhi = _mm256_add_epi16(currNextSecondHalf, currLeftMidFirstHalfWeight);
      51             : 
      52           0 :     topPermutation = _mm256_permute4x64_epi64(top, 216);
      53           0 :     topFirstHalf = _mm256_unpacklo_epi8(topPermutation, _mm256_setzero_si256());
      54           0 :     bottomPermutation = _mm256_permute4x64_epi64(bottom, 216);
      55           0 :     bottomFirstHalf = _mm256_unpacklo_epi8(bottomPermutation, _mm256_setzero_si256());
      56           0 :     filterFirstHalf = _mm256_adds_epi16(_mm256_adds_epi16(bottomFirstHalf, topFirstHalf), currLeftMidFirstHalflo);
      57           0 :     filterFirstHalf = _mm256_srli_epi16(filterFirstHalf, 3);
      58             : 
      59           0 :     topFirstHalf = _mm256_unpackhi_epi8(topPermutation, _mm256_setzero_si256());
      60           0 :     bottomFirstHalf = _mm256_unpackhi_epi8(bottomPermutation, _mm256_setzero_si256());
      61           0 :     filterSecondHalf = _mm256_adds_epi16(_mm256_adds_epi16(bottomFirstHalf, topFirstHalf), currLeftMidFirstHalfhi);
      62           0 :     filterSecondHalf = _mm256_srli_epi16(filterSecondHalf, 3);
      63             : 
      64           0 :     filterFirstHalf = _mm256_permute4x64_epi64(_mm256_packus_epi16(filterFirstHalf, filterSecondHalf), 216);
      65             :     _mm256_storeu_si256((__m256i *)(ptr_denoised), filterFirstHalf);
      66             : 
      67           0 :     _mm256_storeu_si256((__m256i *)(ptr_noise), _mm256_subs_epu8(curr, filterFirstHalf));
      68           0 : }
      69           0 : inline void chroma_weak_luma_strong_filter_avx2_intrin(
      70             :     __m256i                        top,
      71             :     __m256i                        curr,
      72             :     __m256i                        bottom,
      73             :     __m256i                        curr_prev,
      74             :     __m256i                        curr_next,
      75             :     __m256i                        top_prev,
      76             :     __m256i                        top_next,
      77             :     __m256i                        bottom_prev,
      78             :     __m256i                        bottom_next,
      79             :     uint8_t                       *ptr_denoised
      80             : )
      81             : {
      82             :     __m256i filterFirstHalf, filterSecondHalf,
      83             :         currNextFirstHalf, currNextSecondHalf,
      84             :         weights, currLeftMidFirstHalfWeight,
      85             :         currLeftMidFirstHalflo, currLeftMidFirstHalfhi, currPrevPermutation, currPermutation, currNextPermutation,
      86             :         topPermutation, bottomPermutation,
      87             :         topPrevPermutation, topLeftMidFirstHalflo, topLeftMidFirstHalfWeight, topNextFirstHalf,
      88             :         topNextPermutation, topLeftMidFirstHalfhi, topNextSecondHalf,
      89             :         bottomPrevPermutation, bottomLeftMidFirstHalflo, bottomLeftMidFirstHalfWeight, bottomNextPermutation,
      90             :         bottomNextFirstHalf, bottomLeftMidFirstHalfhi, bottomNextSecondHalf;
      91             : 
      92             :     //  Curr
      93           0 :     currPrevPermutation = _mm256_permute4x64_epi64(curr_prev, 216);
      94           0 :     currPermutation = _mm256_permute4x64_epi64(curr, 216);
      95           0 :     currLeftMidFirstHalflo = _mm256_unpacklo_epi8(currPrevPermutation, currPermutation);
      96           0 :     weights = _mm256_loadu_si256((__m256i*)WeakChromafilter[0]);
      97           0 :     currLeftMidFirstHalfWeight = _mm256_maddubs_epi16(currLeftMidFirstHalflo, weights);
      98           0 :     currNextPermutation = _mm256_permute4x64_epi64(curr_next, 88);
      99           0 :     currNextFirstHalf = _mm256_unpacklo_epi8(currNextPermutation, _mm256_setzero_si256());
     100           0 :     currNextFirstHalf = _mm256_slli_epi16(currNextFirstHalf, 1);
     101           0 :     currLeftMidFirstHalflo = _mm256_add_epi16(currNextFirstHalf, currLeftMidFirstHalfWeight);
     102             : 
     103           0 :     currLeftMidFirstHalfhi = _mm256_unpackhi_epi8(currPrevPermutation, currPermutation);
     104           0 :     currLeftMidFirstHalfWeight = _mm256_maddubs_epi16(currLeftMidFirstHalfhi, weights);
     105           0 :     currNextPermutation = _mm256_permute4x64_epi64(curr_next, 216);
     106           0 :     currNextSecondHalf = _mm256_unpackhi_epi8(currNextPermutation, _mm256_setzero_si256());
     107           0 :     currNextSecondHalf = _mm256_slli_epi16(currNextSecondHalf, 1);
     108           0 :     currLeftMidFirstHalfhi = _mm256_add_epi16(currNextSecondHalf, currLeftMidFirstHalfWeight);
     109             : 
     110             :     // Top
     111           0 :     topPrevPermutation = _mm256_permute4x64_epi64(top_prev, 216);
     112           0 :     topPermutation = _mm256_permute4x64_epi64(top, 216);
     113           0 :     topLeftMidFirstHalflo = _mm256_unpacklo_epi8(topPrevPermutation, topPermutation);
     114           0 :     weights = _mm256_loadu_si256((__m256i*)WeakChromafilter[1]);
     115           0 :     topLeftMidFirstHalfWeight = _mm256_maddubs_epi16(topLeftMidFirstHalflo, weights);
     116           0 :     topNextPermutation = _mm256_permute4x64_epi64(top_next, 88);
     117           0 :     topNextFirstHalf = _mm256_unpacklo_epi8(topNextPermutation, _mm256_setzero_si256());
     118           0 :     topLeftMidFirstHalflo = _mm256_add_epi16(topNextFirstHalf, topLeftMidFirstHalfWeight);
     119             : 
     120           0 :     topLeftMidFirstHalfhi = _mm256_unpackhi_epi8(topPrevPermutation, topPermutation);
     121           0 :     topLeftMidFirstHalfWeight = _mm256_maddubs_epi16(topLeftMidFirstHalfhi, weights);
     122           0 :     topNextPermutation = _mm256_permute4x64_epi64(top_next, 216);
     123           0 :     topNextSecondHalf = _mm256_unpackhi_epi8(topNextPermutation, _mm256_setzero_si256());
     124           0 :     topLeftMidFirstHalfhi = _mm256_add_epi16(topNextSecondHalf, topLeftMidFirstHalfWeight);
     125             : 
     126             :     // Bottom
     127           0 :     bottomPrevPermutation = _mm256_permute4x64_epi64(bottom_prev, 216);
     128           0 :     bottomPermutation = _mm256_permute4x64_epi64(bottom, 216);
     129           0 :     bottomLeftMidFirstHalflo = _mm256_unpacklo_epi8(bottomPrevPermutation, bottomPermutation);
     130           0 :     weights = _mm256_loadu_si256((__m256i*)WeakChromafilter[1]);
     131           0 :     bottomLeftMidFirstHalfWeight = _mm256_maddubs_epi16(bottomLeftMidFirstHalflo, weights);
     132           0 :     bottomNextPermutation = _mm256_permute4x64_epi64(bottom_next, 88);
     133           0 :     bottomNextFirstHalf = _mm256_unpacklo_epi8(bottomNextPermutation, _mm256_setzero_si256());
     134           0 :     bottomLeftMidFirstHalflo = _mm256_add_epi16(bottomNextFirstHalf, bottomLeftMidFirstHalfWeight);
     135             : 
     136           0 :     bottomLeftMidFirstHalfhi = _mm256_unpackhi_epi8(bottomPrevPermutation, bottomPermutation);
     137           0 :     bottomLeftMidFirstHalfWeight = _mm256_maddubs_epi16(bottomLeftMidFirstHalfhi, weights);
     138           0 :     bottomNextPermutation = _mm256_permute4x64_epi64(bottom_next, 216);
     139           0 :     bottomNextSecondHalf = _mm256_unpackhi_epi8(bottomNextPermutation, _mm256_setzero_si256());
     140           0 :     bottomLeftMidFirstHalfhi = _mm256_add_epi16(bottomNextSecondHalf, bottomLeftMidFirstHalfWeight);
     141             : 
     142           0 :     filterFirstHalf = _mm256_adds_epi16(_mm256_adds_epi16(bottomLeftMidFirstHalflo, topLeftMidFirstHalflo), currLeftMidFirstHalflo);
     143           0 :     filterFirstHalf = _mm256_srli_epi16(filterFirstHalf, 4);
     144           0 :     filterSecondHalf = _mm256_adds_epi16(_mm256_adds_epi16(bottomLeftMidFirstHalfhi, topLeftMidFirstHalfhi), currLeftMidFirstHalfhi);
     145           0 :     filterSecondHalf = _mm256_srli_epi16(filterSecondHalf, 4);
     146             : 
     147           0 :     filterFirstHalf = _mm256_permute4x64_epi64(_mm256_packus_epi16(filterFirstHalf, filterSecondHalf), 216);
     148             :     _mm256_storeu_si256((__m256i *)(ptr_denoised), filterFirstHalf);
     149           0 : }
     150             : 
     151           0 : inline void chroma_strong_avx2_intrin(
     152             :     __m256i                        top,
     153             :     __m256i                        curr,
     154             :     __m256i                        bottom,
     155             :     __m256i                        curr_prev,
     156             :     __m256i                        curr_next,
     157             :     __m256i                        top_prev,
     158             :     __m256i                        top_next,
     159             :     __m256i                        bottom_prev,
     160             :     __m256i                        bottom_next,
     161             :     uint8_t                       *ptr_denoised
     162             : )
     163             : {
     164             :     __m256i   currLeftMidFirstHalflo, currLeftMidFirstHalfhi, currPrevPermutation, currPermutation, currNextPermutation,
     165             :         topPermutation, topPrevPermutation, topLeftMidFirstHalflo, topNextPermutation, topLeftMidFirstHalfhi,
     166             :         bottomPermutation, bottomPrevPermutation, bottomLeftMidFirstHalflo, bottomNextPermutation, bottomLeftMidFirstHalfhi;
     167             : 
     168           0 :     currPrevPermutation = _mm256_permute4x64_epi64(curr_prev, 216);
     169           0 :     currPermutation = _mm256_permute4x64_epi64(curr, 216);
     170           0 :     currNextPermutation = _mm256_permute4x64_epi64(curr_next, 216);
     171             : 
     172           0 :     currLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(currPermutation, _mm256_setzero_si256()),
     173             :         _mm256_unpacklo_epi8(currPrevPermutation, _mm256_setzero_si256()));
     174           0 :     currLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(currNextPermutation, _mm256_setzero_si256()), currLeftMidFirstHalflo);
     175             : 
     176           0 :     currLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(currPermutation, _mm256_setzero_si256()),
     177             :         _mm256_unpackhi_epi8(currPrevPermutation, _mm256_setzero_si256()));
     178           0 :     currLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(currNextPermutation, _mm256_setzero_si256()), currLeftMidFirstHalfhi);
     179             : 
     180           0 :     topPrevPermutation = _mm256_permute4x64_epi64(top_prev, 216);
     181           0 :     topPermutation = _mm256_permute4x64_epi64(top, 216);
     182           0 :     topNextPermutation = _mm256_permute4x64_epi64(top_next, 216);
     183             : 
     184           0 :     topLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(topPermutation, _mm256_setzero_si256()),
     185             :         _mm256_unpacklo_epi8(topPrevPermutation, _mm256_setzero_si256()));
     186           0 :     topLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(topNextPermutation, _mm256_setzero_si256()), topLeftMidFirstHalflo);
     187             : 
     188           0 :     topLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(topPermutation, _mm256_setzero_si256()),
     189             :         _mm256_unpackhi_epi8(topPrevPermutation, _mm256_setzero_si256()));
     190           0 :     topLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(topNextPermutation, _mm256_setzero_si256()), topLeftMidFirstHalfhi);
     191             : 
     192           0 :     bottomPrevPermutation = _mm256_permute4x64_epi64(bottom_prev, 216);
     193           0 :     bottomPermutation = _mm256_permute4x64_epi64(bottom, 216);
     194           0 :     bottomNextPermutation = _mm256_permute4x64_epi64(bottom_next, 216);
     195             : 
     196           0 :     bottomLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(bottomPermutation, _mm256_setzero_si256()),
     197             :         _mm256_unpacklo_epi8(bottomPrevPermutation, _mm256_setzero_si256()));
     198           0 :     bottomLeftMidFirstHalflo = _mm256_add_epi16(_mm256_unpacklo_epi8(bottomNextPermutation, _mm256_setzero_si256()), bottomLeftMidFirstHalflo);
     199             : 
     200           0 :     bottomLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(bottomPermutation, _mm256_setzero_si256()),
     201             :         _mm256_unpackhi_epi8(bottomPrevPermutation, _mm256_setzero_si256()));
     202           0 :     bottomLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_unpackhi_epi8(bottomNextPermutation, _mm256_setzero_si256()), bottomLeftMidFirstHalfhi);
     203             : 
     204           0 :     currLeftMidFirstHalflo = _mm256_add_epi16(_mm256_add_epi16(currLeftMidFirstHalflo, topLeftMidFirstHalflo), bottomLeftMidFirstHalflo);
     205           0 :     currLeftMidFirstHalfhi = _mm256_add_epi16(_mm256_add_epi16(currLeftMidFirstHalfhi, topLeftMidFirstHalfhi), bottomLeftMidFirstHalfhi);
     206             : 
     207           0 :     topLeftMidFirstHalflo = _mm256_unpacklo_epi16(currLeftMidFirstHalflo, _mm256_setzero_si256());
     208           0 :     topLeftMidFirstHalflo = _mm256_mullo_epi32(topLeftMidFirstHalflo, _mm256_set1_epi32(7282));
     209           0 :     topLeftMidFirstHalflo = _mm256_srli_epi32(topLeftMidFirstHalflo, 16);
     210           0 :     bottomLeftMidFirstHalflo = _mm256_unpackhi_epi16(currLeftMidFirstHalflo, _mm256_setzero_si256());
     211           0 :     bottomLeftMidFirstHalflo = _mm256_mullo_epi32(bottomLeftMidFirstHalflo, _mm256_set1_epi32(7282));
     212           0 :     bottomLeftMidFirstHalflo = _mm256_srli_epi32(bottomLeftMidFirstHalflo, 16);
     213           0 :     currLeftMidFirstHalflo = _mm256_packus_epi32(topLeftMidFirstHalflo, bottomLeftMidFirstHalflo);
     214             : 
     215           0 :     currLeftMidFirstHalflo = _mm256_insertf128_si256(_mm256_setzero_si256(), _mm_packus_epi16(_mm256_extracti128_si256(currLeftMidFirstHalflo, 0), _mm256_extracti128_si256(currLeftMidFirstHalflo, 1)), 0);
     216             : 
     217           0 :     topLeftMidFirstHalfhi = _mm256_unpacklo_epi16(currLeftMidFirstHalfhi, _mm256_setzero_si256());
     218           0 :     topLeftMidFirstHalfhi = _mm256_mullo_epi32(topLeftMidFirstHalfhi, _mm256_set1_epi32(7282));
     219           0 :     topLeftMidFirstHalfhi = _mm256_srli_epi32(topLeftMidFirstHalfhi, 16);
     220             : 
     221           0 :     bottomLeftMidFirstHalfhi = _mm256_unpackhi_epi16(currLeftMidFirstHalfhi, _mm256_setzero_si256());
     222           0 :     bottomLeftMidFirstHalfhi = _mm256_mullo_epi32(bottomLeftMidFirstHalfhi, _mm256_set1_epi32(7282));
     223           0 :     bottomLeftMidFirstHalfhi = _mm256_srli_epi32(bottomLeftMidFirstHalfhi, 16);
     224           0 :     currLeftMidFirstHalfhi = _mm256_packus_epi32(topLeftMidFirstHalfhi, bottomLeftMidFirstHalfhi);
     225             : 
     226           0 :     currLeftMidFirstHalflo = _mm256_insertf128_si256(currLeftMidFirstHalflo, _mm_packus_epi16(_mm256_extracti128_si256(currLeftMidFirstHalfhi, 0), _mm256_extracti128_si256(currLeftMidFirstHalfhi, 1)), 1);
     227             :     _mm256_storeu_si256((__m256i *)(ptr_denoised), currLeftMidFirstHalflo);
     228           0 : }
     229             : /*******************************************
     230             : * noise_extract_luma_weak
     231             : *  weak filter Luma and store noise.
     232             : *******************************************/
     233           0 : void noise_extract_luma_weak_avx2_intrin(
     234             :     EbPictureBufferDesc       *input_picture_ptr,
     235             :     EbPictureBufferDesc       *denoised_picture_ptr,
     236             :     EbPictureBufferDesc       *noise_picture_ptr,
     237             :     uint32_t                       sb_origin_y,
     238             :     uint32_t                         sb_origin_x
     239             : )
     240             : {
     241             :     uint32_t  ii, jj, kk;
     242             :     uint32_t  picHeight, sb_height;
     243             :     uint32_t  picWidth;
     244             :     uint32_t  inputOriginIndex;
     245             :     uint32_t  inputOriginIndexPad;
     246             :     uint32_t  noiseOriginIndex;
     247             : 
     248             :     uint8_t *ptrIn;
     249             :     uint32_t stride_in;
     250             :     uint8_t *ptr_denoised, *ptrDenoisedInterm;
     251             : 
     252             :     uint8_t *ptr_noise, *ptrNoiseInterm;
     253             :     uint32_t strideOut;
     254             : 
     255             :     __m256i top, curr, bottom, curr_prev, curr_next,
     256             :         secondtop, secondcurr, secondbottom, secondcurrPrev, secondcurrNext;
     257             :     (void)sb_origin_x;
     258             : 
     259             :     //Luma
     260             :     {
     261           0 :         picHeight = input_picture_ptr->height;
     262           0 :         picWidth = input_picture_ptr->width;
     263           0 :         sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
     264           0 :         sb_height = ((sb_origin_y + BLOCK_SIZE_64 >= picHeight) || (sb_origin_y == 0)) ? sb_height - 1 : sb_height;
     265           0 :         stride_in = input_picture_ptr->stride_y;
     266           0 :         inputOriginIndex = input_picture_ptr->origin_x + (input_picture_ptr->origin_y + sb_origin_y) * input_picture_ptr->stride_y;
     267           0 :         ptrIn = &(input_picture_ptr->buffer_y[inputOriginIndex]);
     268             : 
     269           0 :         inputOriginIndexPad = denoised_picture_ptr->origin_x + (denoised_picture_ptr->origin_y + sb_origin_y) * denoised_picture_ptr->stride_y;
     270           0 :         strideOut = denoised_picture_ptr->stride_y;
     271           0 :         ptr_denoised = &(denoised_picture_ptr->buffer_y[inputOriginIndexPad]);
     272           0 :         ptrDenoisedInterm = ptr_denoised;
     273             : 
     274           0 :         noiseOriginIndex = noise_picture_ptr->origin_x + noise_picture_ptr->origin_y * noise_picture_ptr->stride_y;
     275           0 :         ptr_noise = &(noise_picture_ptr->buffer_y[noiseOriginIndex]);
     276           0 :         ptrNoiseInterm = ptr_noise;
     277             : 
     278             :         ////Luma
     279             :         //a = (p[1] +
     280             :         //    p[0 + stride] + 4 * p[1 + stride] + p[2 + stride] +
     281             :         //    p[1 + 2 * stride]) / 8;
     282             : 
     283           0 :         top = curr = secondtop = secondcurr = _mm256_setzero_si256();
     284             : 
     285           0 :         for (kk = 0; kk + BLOCK_SIZE_64 <= picWidth; kk += BLOCK_SIZE_64)
     286             :         {
     287           0 :             for (jj = 0; jj < sb_height; jj++)
     288             :             {
     289           0 :                 if (sb_origin_y == 0)
     290             :                 {
     291           0 :                     if (jj == 0)
     292             :                     {
     293           0 :                         top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in));
     294           0 :                         secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + kk + 32 + jj * stride_in));
     295           0 :                         curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in));
     296           0 :                         secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (1 + jj)*stride_in));
     297           0 :                         _mm256_storeu_si256((__m256i *)(ptr_denoised + kk), top);
     298           0 :                         _mm256_storeu_si256((__m256i *)(ptr_denoised + kk + 32), secondtop);
     299           0 :                         _mm256_storeu_si256((__m256i *)(ptr_noise + kk), _mm256_setzero_si256());
     300           0 :                         _mm256_storeu_si256((__m256i *)(ptr_noise + kk + 32), _mm256_setzero_si256());
     301             :                     }
     302           0 :                     curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in)));
     303           0 :                     curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in)));
     304           0 :                     secondcurrPrev = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) - 1 + ((1 + jj)*stride_in)));
     305           0 :                     secondcurrNext = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + 1 + ((1 + jj)*stride_in)));
     306           0 :                     bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in));
     307           0 :                     secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (2 + jj)* stride_in));
     308           0 :                     ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut);
     309           0 :                     ptrNoiseInterm = ptr_noise + kk + ((1 + jj)*strideOut);
     310             :                 }
     311             :                 else
     312             :                 {
     313           0 :                     if (jj == 0)
     314             :                     {
     315           0 :                         top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in - stride_in));
     316           0 :                         secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + kk + 32 + jj * stride_in - stride_in));
     317           0 :                         curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in - stride_in));
     318           0 :                         secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (1 + jj)*stride_in - stride_in));
     319             :                     }
     320           0 :                     curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in - stride_in)));
     321           0 :                     curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in - stride_in)));
     322           0 :                     secondcurrPrev = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) - 1 + ((1 + jj)*stride_in - stride_in)));
     323           0 :                     secondcurrNext = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + 1 + ((1 + jj)*stride_in - stride_in)));
     324           0 :                     bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in - stride_in));
     325           0 :                     secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (2 + jj)* stride_in - stride_in));
     326           0 :                     ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut - strideOut);
     327           0 :                     ptrNoiseInterm = ptr_noise + kk + jj * strideOut;
     328             :                 }
     329             : 
     330           0 :                 luma_weak_filter_avx2_intrin(
     331             :                     top,
     332             :                     curr,
     333             :                     bottom,
     334             :                     curr_prev,
     335             :                     curr_next,
     336             :                     ptrDenoisedInterm,
     337             :                     ptrNoiseInterm);
     338             : 
     339           0 :                 luma_weak_filter_avx2_intrin(
     340             :                     secondtop,
     341             :                     secondcurr,
     342             :                     secondbottom,
     343             :                     secondcurrPrev,
     344             :                     secondcurrNext,
     345             :                     ptrDenoisedInterm + 32,
     346             :                     ptrNoiseInterm + 32);
     347             : 
     348           0 :                 top = curr;
     349           0 :                 curr = bottom;
     350           0 :                 secondtop = secondcurr;
     351           0 :                 secondcurr = secondbottom;
     352             :             }
     353             :         }
     354             : 
     355           0 :         sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
     356             : 
     357           0 :         for (jj = 0; jj < sb_height; jj++) {
     358           0 :             for (ii = 0; ii < picWidth; ii++) {
     359           0 :                 if (!((jj < sb_height - 1 || sb_origin_y + sb_height < picHeight) && ii > 0 && ii < picWidth - 1)) {
     360           0 :                     ptr_denoised[ii + jj * strideOut] = ptrIn[ii + jj * stride_in];
     361           0 :                     ptr_noise[ii + jj * strideOut] = 0;
     362             :                 }
     363             :             }
     364             :         }
     365             :     }
     366           0 : }
     367             : 
     368           0 : void noise_extract_luma_weak_lcu_avx2_intrin(
     369             :     EbPictureBufferDesc       *input_picture_ptr,
     370             :     EbPictureBufferDesc       *denoised_picture_ptr,
     371             :     EbPictureBufferDesc       *noise_picture_ptr,
     372             :     uint32_t                       sb_origin_y,
     373             :     uint32_t                         sb_origin_x
     374             : )
     375             : {
     376             :     uint32_t  ii, jj;
     377             :     uint32_t  picHeight, sb_height;
     378             :     uint32_t  picWidth, sb_width;
     379             :     uint32_t  inputOriginIndex;
     380             :     uint32_t  inputOriginIndexPad;
     381             :     uint32_t  noiseOriginIndex;
     382             : 
     383             :     uint8_t *ptrIn;
     384             :     uint32_t stride_in;
     385             :     uint8_t *ptr_denoised, *ptrDenoisedInterm;
     386             : 
     387             :     uint8_t *ptr_noise, *ptrNoiseInterm;
     388             :     uint32_t strideOut;
     389             : 
     390             :     __m256i top, curr, bottom, curr_prev, curr_next,
     391             :         secondtop, secondcurr, secondbottom, secondcurrPrev, secondcurrNext;
     392             :     (void)sb_origin_x;
     393             : 
     394             :     //Luma
     395             :     {
     396           0 :         picHeight = input_picture_ptr->height;
     397           0 :         picWidth = input_picture_ptr->width;
     398           0 :         sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
     399           0 :         sb_width = MIN(BLOCK_SIZE_64, picWidth - sb_origin_x);
     400           0 :         sb_height = ((sb_origin_y + BLOCK_SIZE_64 >= picHeight) || (sb_origin_y == 0)) ? sb_height - 1 : sb_height;
     401           0 :         stride_in = input_picture_ptr->stride_y;
     402           0 :         inputOriginIndex = input_picture_ptr->origin_x + sb_origin_x + (input_picture_ptr->origin_y + sb_origin_y) * input_picture_ptr->stride_y;
     403           0 :         ptrIn = &(input_picture_ptr->buffer_y[inputOriginIndex]);
     404             : 
     405           0 :         inputOriginIndexPad = denoised_picture_ptr->origin_x + sb_origin_x + (denoised_picture_ptr->origin_y + sb_origin_y) * denoised_picture_ptr->stride_y;
     406           0 :         strideOut = denoised_picture_ptr->stride_y;
     407           0 :         ptr_denoised = &(denoised_picture_ptr->buffer_y[inputOriginIndexPad]);
     408           0 :         ptrDenoisedInterm = ptr_denoised;
     409             : 
     410           0 :         noiseOriginIndex = noise_picture_ptr->origin_x + sb_origin_x + noise_picture_ptr->origin_y * noise_picture_ptr->stride_y;
     411           0 :         ptr_noise = &(noise_picture_ptr->buffer_y[noiseOriginIndex]);
     412           0 :         ptrNoiseInterm = ptr_noise;
     413             : 
     414             :         ////Luma
     415             :         //a = (p[1] +
     416             :         //    p[0 + stride] + 4 * p[1 + stride] + p[2 + stride] +
     417             :         //    p[1 + 2 * stride]) / 8;
     418             : 
     419           0 :         top = curr = secondtop = secondcurr = _mm256_setzero_si256();
     420             : 
     421             :         //for (kk = 0; kk + BLOCK_SIZE_64 <= picWidth; kk += BLOCK_SIZE_64)
     422             :         {
     423           0 :             for (jj = 0; jj < sb_height; jj++)
     424             :             {
     425           0 :                 if (sb_origin_y == 0)
     426             :                 {
     427           0 :                     if (jj == 0)
     428             :                     {
     429           0 :                         top = _mm256_loadu_si256((__m256i*)(ptrIn + jj * stride_in));
     430           0 :                         secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + 32 + jj * stride_in));
     431           0 :                         curr = _mm256_loadu_si256((__m256i*)(ptrIn + (1 + jj)*stride_in));
     432           0 :                         secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + (1 + jj)*stride_in));
     433             :                         _mm256_storeu_si256((__m256i *)(ptr_denoised), top);
     434           0 :                         _mm256_storeu_si256((__m256i *)(ptr_denoised + 32), secondtop);
     435           0 :                         _mm256_storeu_si256((__m256i *)(ptr_noise), _mm256_setzero_si256());
     436           0 :                         _mm256_storeu_si256((__m256i *)(ptr_noise + 32), _mm256_setzero_si256());
     437             :                     }
     438           0 :                     curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + ((1 + jj)*stride_in)));
     439           0 :                     curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + ((1 + jj)*stride_in)));
     440           0 :                     secondcurrPrev = _mm256_loadu_si256((__m256i*)((ptrIn + 32) - 1 + ((1 + jj)*stride_in)));
     441           0 :                     secondcurrNext = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + 1 + ((1 + jj)*stride_in)));
     442           0 :                     bottom = _mm256_loadu_si256((__m256i*)((ptrIn)+(2 + jj)* stride_in));
     443           0 :                     secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + (2 + jj)* stride_in));
     444           0 :                     ptrDenoisedInterm = ptr_denoised + ((1 + jj)*strideOut);
     445           0 :                     ptrNoiseInterm = ptr_noise + ((1 + jj)*strideOut);
     446             :                 }
     447             :                 else
     448             :                 {
     449           0 :                     if (jj == 0)
     450             :                     {
     451           0 :                         top = _mm256_loadu_si256((__m256i*)(ptrIn + jj * stride_in - stride_in));
     452           0 :                         secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + 32 + jj * stride_in - stride_in));
     453           0 :                         curr = _mm256_loadu_si256((__m256i*)(ptrIn + (1 + jj)*stride_in - stride_in));
     454           0 :                         secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + (1 + jj)*stride_in - stride_in));
     455             :                     }
     456           0 :                     curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + ((1 + jj)*stride_in - stride_in)));
     457           0 :                     curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + ((1 + jj)*stride_in - stride_in)));
     458           0 :                     secondcurrPrev = _mm256_loadu_si256((__m256i*)((ptrIn + 32) - 1 + ((1 + jj)*stride_in - stride_in)));
     459           0 :                     secondcurrNext = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + 1 + ((1 + jj)*stride_in - stride_in)));
     460           0 :                     bottom = _mm256_loadu_si256((__m256i*)((ptrIn)+(2 + jj)* stride_in - stride_in));
     461           0 :                     secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + 32) + (2 + jj)* stride_in - stride_in));
     462           0 :                     ptrDenoisedInterm = ptr_denoised + ((1 + jj)*strideOut - strideOut);
     463           0 :                     ptrNoiseInterm = ptr_noise + jj * strideOut;
     464             :                 }
     465             : 
     466           0 :                 luma_weak_filter_avx2_intrin(
     467             :                     top,
     468             :                     curr,
     469             :                     bottom,
     470             :                     curr_prev,
     471             :                     curr_next,
     472             :                     ptrDenoisedInterm,
     473             :                     ptrNoiseInterm);
     474             : 
     475           0 :                 luma_weak_filter_avx2_intrin(
     476             :                     secondtop,
     477             :                     secondcurr,
     478             :                     secondbottom,
     479             :                     secondcurrPrev,
     480             :                     secondcurrNext,
     481             :                     ptrDenoisedInterm + 32,
     482             :                     ptrNoiseInterm + 32);
     483             : 
     484           0 :                 top = curr;
     485           0 :                 curr = bottom;
     486           0 :                 secondtop = secondcurr;
     487           0 :                 secondcurr = secondbottom;
     488             :             }
     489             :         }
     490             : 
     491           0 :         sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
     492             : 
     493           0 :         for (jj = 0; jj < sb_height; jj++) {
     494           0 :             for (ii = 0; ii < sb_width; ii++) {
     495           0 :                 if (!((jj > 0 || sb_origin_y > 0) && (jj < sb_height - 1 || sb_origin_y + sb_height < picHeight) && (ii > 0 || sb_origin_x > 0) && (ii + sb_origin_x) < picWidth - 1)) {
     496           0 :                     ptr_denoised[ii + jj * strideOut] = ptrIn[ii + jj * stride_in];
     497           0 :                     ptr_noise[ii + jj * strideOut] = 0;
     498             :                 }
     499             :             }
     500             :         }
     501             :     }
     502           0 : }
     503             : /*******************************************
     504             : * noise_extract_luma_strong
     505             : *  strong filter Luma.
     506             : *******************************************/
     507           0 : void noise_extract_luma_strong_avx2_intrin(
     508             :     EbPictureBufferDesc       *input_picture_ptr,
     509             :     EbPictureBufferDesc       *denoised_picture_ptr,
     510             :     uint32_t                       sb_origin_y,
     511             :     uint32_t                       sb_origin_x
     512             : )
     513             : {
                           /*
                            * Strong luma filter over one band of picture rows, using AVX2 loads
                            * and stores.
                            *
                            * input_picture_ptr    - source picture; buffer_y, stride_y, origin_x,
                            *                        origin_y, width and height are read.
                            * denoised_picture_ptr - destination picture; buffer_y is written.
                            * sb_origin_y          - first picture row of the band (added to origin_y).
                            * sb_origin_x          - not used by this function.
                            *
                            * The band is up to BLOCK_SIZE_64 rows tall and is walked across the
                            * picture in strips of BLOCK_SIZE_64 columns, each strip handled as two
                            * 32-byte halves (the "second*" registers hold the right half). For
                            * every output row, chroma_weak_luma_strong_filter_avx2_intrin() is
                            * given nine 32-byte vectors: the rows above, at and below the output
                            * row, each also loaded one byte to the left (-1) and right (+1).
                            * NOTE(review): that helper is defined outside this view; confirm the
                            * exact kernel it applies and that it writes 32 bytes to its last
                            * argument.
                            *
                            * Pixels on the first/last picture column, and on the band's last row
                            * when the band reaches the picture bottom, are copied from the input
                            * by the scalar loop at the end.
                            */
     514             :     uint32_t  ii, jj, kk;
     515             :     uint32_t  picHeight, sb_height;
     516             :     uint32_t  picWidth;
     517             :     uint32_t  inputOriginIndex;
     518             :     uint32_t  inputOriginIndexPad;
     519             : 
     520             :     uint8_t *ptrIn;
     521             :     uint32_t stride_in;
     522             :     uint8_t *ptr_denoised, *ptrDenoisedInterm;
     523             : 
     524             :     uint32_t strideOut;
     525             :     __m256i top, curr, bottom, curr_prev, curr_next, top_prev, top_next, bottom_prev, bottom_next,
     526             :         secondtop, secondcurr, secondcurrPrev, secondcurrNext, secondbottom, secondtopPrev, secondtopNext, secondbottomPrev, secondbottomNext;
     527             :     (void)sb_origin_x;
     528             :     //Luma
     529             :     {
     530           0 :         picHeight = input_picture_ptr->height;
     531           0 :         picWidth = input_picture_ptr->width;
     532           0 :         sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
     533             : 
                               /* One row fewer is filtered when the band is the first one
                                * (sb_origin_y == 0) or when it reaches the picture bottom. */
     534           0 :         sb_height = ((sb_origin_y + BLOCK_SIZE_64 >= picHeight) || (sb_origin_y == 0)) ? sb_height - 1 : sb_height;
     535           0 :         stride_in = input_picture_ptr->stride_y;
                               /* No sb_origin_x term: ptrIn addresses picture column 0 of row sb_origin_y. */
     536           0 :         inputOriginIndex = input_picture_ptr->origin_x + (input_picture_ptr->origin_y + sb_origin_y)* input_picture_ptr->stride_y;
     537           0 :         ptrIn = &(input_picture_ptr->buffer_y[inputOriginIndex]);
     538             : 
     539           0 :         inputOriginIndexPad = denoised_picture_ptr->origin_x + (denoised_picture_ptr->origin_y + sb_origin_y) * denoised_picture_ptr->stride_y;
     540           0 :         strideOut = denoised_picture_ptr->stride_y;
     541           0 :         ptr_denoised = &(denoised_picture_ptr->buffer_y[inputOriginIndexPad]);
     542           0 :         ptrDenoisedInterm = ptr_denoised;
     543             : 
                               /* The carried rows are zeroed here and reloaded at jj == 0 of every strip
                                * (presumably this only silences uninitialised-variable warnings). */
     544           0 :         top = curr = secondtop = secondcurr = top_next = top_prev = curr_next = curr_prev = secondcurrPrev = secondcurrNext = secondtopPrev = secondtopNext = _mm256_setzero_si256();
                               /* Only strips that fit entirely inside picWidth are visited. */
     545           0 :         for (kk = 0; kk + BLOCK_SIZE_64 <= picWidth; kk += BLOCK_SIZE_64)
     546             :         {
     547           0 :             for (jj = 0; jj < sb_height; jj++)
     548             :             {
     549           0 :                 if (sb_origin_y == 0)
     550             :                 {
                                           /* First band: there is no row above row 0, so row 0 is stored
                                            * unfiltered and iteration jj produces output row jj + 1
                                            * (top = row jj, curr = row jj + 1, bottom = row jj + 2). */
     551           0 :                     if (jj == 0)
     552             :                     {
     553           0 :                         top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in));
     554           0 :                         secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + kk + 32 + jj * stride_in));
     555             : 
     556           0 :                         curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in));
     557           0 :                         secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (1 + jj)*stride_in));
     558             : 
     559           0 :                         top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in)));
     560           0 :                         secondtopPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((jj)*stride_in)));
     561             : 
     562           0 :                         top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in)));
     563           0 :                         secondtopNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((jj)*stride_in)));
     564             : 
     565           0 :                         curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in)));
     566           0 :                         secondcurrPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((1 + jj)*stride_in)));
     567             : 
     568           0 :                         curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in)));
     569           0 :                         secondcurrNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((1 + jj)*stride_in)));
     570             : 
     571           0 :                         _mm256_storeu_si256((__m256i *)(ptr_denoised + kk), top);
     572           0 :                         _mm256_storeu_si256((__m256i *)(ptr_denoised + kk + 32), secondtop);
     573             :                     }
     574           0 :                     bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in)));
     575           0 :                     secondbottomPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((2 + jj)*stride_in)));
     576             : 
     577           0 :                     bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in)));
     578           0 :                     secondbottomNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((2 + jj)*stride_in)));
     579             : 
     580           0 :                     bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in));
     581           0 :                     secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (2 + jj)* stride_in));
     582           0 :                     ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut);
     583             :                 }
     584             :                 else
     585             :                 {
                                           /* Later bands: the window starts one row above the band, so
                                            * iteration jj produces output row jj (top = row jj - 1,
                                            * curr = row jj, bottom = row jj + 1). The "- stride_in" is applied
                                            * to the pointer, or to an offset that is already >= stride_in, so
                                            * the unsigned offsets never wrap. */
     586           0 :                     if (jj == 0)
     587             :                     {
     588           0 :                         top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in - stride_in));
     589           0 :                         secondtop = _mm256_loadu_si256((__m256i*)(ptrIn + kk + 32 + jj * stride_in - stride_in));
     590           0 :                         curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in - stride_in));
     591           0 :                         secondcurr = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (1 + jj)*stride_in - stride_in));
     592           0 :                         top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in) - stride_in));
     593           0 :                         secondtopPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((jj)*stride_in) - stride_in));
     594             : 
     595           0 :                         top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in) - stride_in));
     596           0 :                         secondtopNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((jj)*stride_in) - stride_in));
     597             : 
     598           0 :                         curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in - stride_in)));
     599           0 :                         secondcurrPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((1 + jj)*stride_in - stride_in)));
     600             : 
     601           0 :                         curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in - stride_in)));
     602           0 :                         secondcurrNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((1 + jj)*stride_in - stride_in)));
     603             :                     }
     604           0 :                     bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in) - stride_in));
     605           0 :                     secondbottomPrev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + 32 + ((2 + jj)*stride_in - stride_in)));
     606             : 
     607           0 :                     bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in) - stride_in));
     608           0 :                     secondbottomNext = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + 32 + ((2 + jj)*stride_in - stride_in)));
     609             : 
     610           0 :                     bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in - stride_in));
     611           0 :                     secondbottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk + 32) + (2 + jj)* stride_in - stride_in));
     612             : 
     613           0 :                     ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut - strideOut);
     614             :                 }
     615             : 
                                       /* Left 32 bytes of the strip. */
     616           0 :                 chroma_weak_luma_strong_filter_avx2_intrin(
     617             :                     top,
     618             :                     curr,
     619             :                     bottom,
     620             :                     curr_prev,
     621             :                     curr_next,
     622             :                     top_prev,
     623             :                     top_next,
     624             :                     bottom_prev,
     625             :                     bottom_next,
     626             :                     ptrDenoisedInterm);
     627             : 
                                       /* Right 32 bytes of the strip. */
     628           0 :                 chroma_weak_luma_strong_filter_avx2_intrin(
     629             :                     secondtop,
     630             :                     secondcurr,
     631             :                     secondbottom,
     632             :                     secondcurrPrev,
     633             :                     secondcurrNext,
     634             :                     secondtopPrev,
     635             :                     secondtopNext,
     636             :                     secondbottomPrev,
     637             :                     secondbottomNext,
     638             :                     ptrDenoisedInterm + 32);
     639             : 
                                       /* Slide the three-row window down one row; only the bottom row
                                        * (and its -1/+1 variants) is loaded on the next iteration. */
     640           0 :                 top = curr;
     641           0 :                 curr = bottom;
     642           0 :                 top_prev = curr_prev;
     643           0 :                 top_next = curr_next;
     644           0 :                 curr_prev = bottom_prev;
     645           0 :                 curr_next = bottom_next;
     646           0 :                 secondtop = secondcurr;
     647           0 :                 secondcurr = secondbottom;
     648           0 :                 secondtopPrev = secondcurrPrev;
     649           0 :                 secondtopNext = secondcurrNext;
     650           0 :                 secondcurrPrev = secondbottomPrev;
     651           0 :                 secondcurrNext = secondbottomNext;
     652             :             }
     653             :         }
     654             : 
                               /* Restore the full band height for the border copy below. */
     655           0 :         sb_height = MIN(BLOCK_SIZE_64, picHeight - sb_origin_y);
     656             : 
                               /* Copy input to output for column 0, column picWidth - 1, and the
                                * band's last row when no picture row follows it. */
     657           0 :         for (jj = 0; jj < sb_height; jj++) {
     658           0 :             for (ii = 0; ii < picWidth; ii++) {
     659           0 :                 if (!((jj < sb_height - 1 || sb_origin_y + sb_height < picHeight) && ii > 0 && ii < picWidth - 1))
     660           0 :                     ptr_denoised[ii + jj * strideOut] = ptrIn[ii + jj * stride_in];
     661             :             }
     662             :         }
     663             :     }
     664           0 : }
     665             : 
     666             : /*******************************************
     667             : * noise_extract_chroma_strong
     668             : *  strong filter chroma.
     669             : *******************************************/
     670           0 : void noise_extract_chroma_strong_avx2_intrin(
     671             :     EbPictureBufferDesc       *input_picture_ptr,
     672             :     EbPictureBufferDesc       *denoised_picture_ptr,
     673             :     uint32_t                       sb_origin_y,
     674             :     uint32_t                       sb_origin_x
     675             : )
     676             : {
                           /*
                            * Strong chroma filter over one band of rows, applied with AVX2 to the
                            * Cb and Cr planes in lock-step.
                            *
                            * input_picture_ptr    - source picture; buffer_cb/buffer_cr and their
                            *                        strides, origin_x, origin_y, width and height are
                            *                        read.
                            * denoised_picture_ptr - destination picture; buffer_cb/buffer_cr are
                            *                        written.
                            * sb_origin_y          - first plane row of the band; it is added directly
                            *                        to origin_y / 2.
                            *                        NOTE(review): so it appears to be expressed in
                            *                        chroma rows already - confirm against the caller.
                            * sb_origin_x          - not used by this function.
                            *
                            * Plane dimensions are taken as half the picture width/height. The band
                            * is up to BLOCK_SIZE_64 / 2 rows tall and is walked across the plane in
                            * strips of BLOCK_SIZE_64 / 2 columns, one 32-byte vector per plane. For
                            * every output row chroma_strong_avx2_intrin() receives nine vectors: the
                            * rows above, at and below the output row, each also loaded one byte to
                            * the left (-1) and right (+1).
                            * NOTE(review): that helper is defined outside this view; the formula
                            * comment further down describes the intended kernel.
                            *
                            * Pixels on the first/last plane column, and on the band's last row when
                            * the band reaches the plane bottom, are copied from the input by the
                            * scalar loop at the end.
                            */
     677             :     uint32_t  ii, jj, kk;
     678             :     uint32_t  picHeight, sb_height;
     679             :     uint32_t  picWidth;
     680             :     uint32_t  inputOriginIndex;
     681             :     uint32_t  inputOriginIndexPad;
     682             : 
     683             :     uint8_t *ptrIn, *ptrInCr;
     684             :     uint32_t stride_in, strideInCr;
     685             :     uint8_t *ptr_denoised, *ptrDenoisedInterm, *ptrDenoisedCr, *ptrDenoisedIntermCr;
     686             : 
     687             :     uint32_t strideOut, strideOutCr;
     688             :     __m256i top, curr, bottom, curr_prev, curr_next, top_prev, top_next, bottom_prev, bottom_next,
     689             :         topCr, currCr, bottomCr, currPrevCr, currNextCr, topPrevCr, topNextCr, bottomPrevCr, bottomNextCr;
     690             :     (void)sb_origin_x;
     691             :     {
     692           0 :         picHeight = input_picture_ptr->height / 2;
     693           0 :         picWidth = input_picture_ptr->width / 2;
     694           0 :         sb_height = MIN(BLOCK_SIZE_64 / 2, picHeight - sb_origin_y);
     695             : 
                               /* One row fewer is filtered when the band is the first one
                                * (sb_origin_y == 0) or when it reaches the plane bottom. */
     696           0 :         sb_height = ((sb_origin_y + BLOCK_SIZE_64 / 2 >= picHeight) || (sb_origin_y == 0)) ? sb_height - 1 : sb_height;
     697             : 
                               /* Cb plane: input and output base pointers at plane column 0 of the band. */
     698           0 :         stride_in = input_picture_ptr->stride_cb;
     699           0 :         inputOriginIndex = input_picture_ptr->origin_x / 2 + (input_picture_ptr->origin_y / 2 + sb_origin_y)  * input_picture_ptr->stride_cb;
     700           0 :         ptrIn = &(input_picture_ptr->buffer_cb[inputOriginIndex]);
     701             : 
     702           0 :         inputOriginIndexPad = denoised_picture_ptr->origin_x / 2 + (denoised_picture_ptr->origin_y / 2 + sb_origin_y)  * denoised_picture_ptr->stride_cb;
     703           0 :         strideOut = denoised_picture_ptr->stride_cb;
     704           0 :         ptr_denoised = &(denoised_picture_ptr->buffer_cb[inputOriginIndexPad]);
     705           0 :         ptrDenoisedInterm = ptr_denoised;
     706             : 
                               /* Cr plane: same layout, with its own strides. */
     707           0 :         strideInCr = input_picture_ptr->stride_cr;
     708           0 :         inputOriginIndex = input_picture_ptr->origin_x / 2 + (input_picture_ptr->origin_y / 2 + sb_origin_y)  * input_picture_ptr->stride_cr;
     709           0 :         ptrInCr = &(input_picture_ptr->buffer_cr[inputOriginIndex]);
     710             : 
     711           0 :         inputOriginIndexPad = denoised_picture_ptr->origin_x / 2 + (denoised_picture_ptr->origin_y / 2 + sb_origin_y)  * denoised_picture_ptr->stride_cr;
     712           0 :         strideOutCr = denoised_picture_ptr->stride_cr;
     713           0 :         ptrDenoisedCr = &(denoised_picture_ptr->buffer_cr[inputOriginIndexPad]);
     714           0 :         ptrDenoisedIntermCr = ptrDenoisedCr;
     715             :         ////Chroma
     716             :         //a = (4 * p[0] + 4 * p[1] + 4 * p[2] +
     717             :         //    4 * p[0 + stride] + 4 * p[1 + stride] + 4 * p[2 + stride] +
     718             :         //    4 * p[0 + 2 * stride] + 4 * p[1 + 2 * stride] + 4 * p[2 + 2 * stride]) / 36;
     719             : 
                               /* The carried rows are zeroed here and reloaded at jj == 0 of every strip
                                * (presumably this only silences uninitialised-variable warnings). */
     720           0 :         top = curr = top_next = top_prev = curr_next = curr_prev = topCr = currCr = topNextCr = topPrevCr = currNextCr = currPrevCr = _mm256_setzero_si256();
     721             : 
                               /* Only strips that fit entirely inside picWidth are visited. */
     722           0 :         for (kk = 0; kk + BLOCK_SIZE_64 / 2 <= picWidth; kk += BLOCK_SIZE_64 / 2)
     723             :         {
     724           0 :             for (jj = 0; jj < sb_height; jj++)
     725             :             {
     726           0 :                 if (sb_origin_y == 0)
     727             :                 {
                                           /* First band: there is no row above row 0, so row 0 is stored
                                            * unfiltered and iteration jj produces output row jj + 1
                                            * (top = row jj, curr = row jj + 1, bottom = row jj + 2). */
     728           0 :                     if (jj == 0)
     729             :                     {
     730           0 :                         top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in));
     731           0 :                         curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in));
     732           0 :                         top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in)));
     733           0 :                         top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in)));
     734           0 :                         curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in)));
     735           0 :                         curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in)));
     736           0 :                         topCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + jj * strideInCr));
     737           0 :                         currCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + (1 + jj)*strideInCr));
     738           0 :                         topPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((jj)*strideInCr)));
     739           0 :                         topNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((jj)*strideInCr)));
     740           0 :                         currPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((1 + jj)*strideInCr)));
     741           0 :                         currNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((1 + jj)*strideInCr)));
     742           0 :                         _mm256_storeu_si256((__m256i *)(ptr_denoised + kk), top);
     743           0 :                         _mm256_storeu_si256((__m256i *)(ptrDenoisedCr + kk), topCr);
     744             :                     }
     745           0 :                     bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in)));
     746           0 :                     bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in)));
     747           0 :                     bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in));
     748           0 :                     bottomPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((2 + jj)*strideInCr)));
     749           0 :                     bottomNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((2 + jj)*strideInCr)));
     750           0 :                     bottomCr = _mm256_loadu_si256((__m256i*)((ptrInCr + kk) + (2 + jj)* strideInCr));
     751           0 :                     ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut);
     752           0 :                     ptrDenoisedIntermCr = ptrDenoisedCr + kk + ((1 + jj)*strideOutCr);
     753             :                 }
     754             :                 else
     755             :                 {
                                           /* Later bands: the window starts one row above the band, so
                                            * iteration jj produces output row jj (top = row jj - 1,
                                            * curr = row jj, bottom = row jj + 1). The stride subtraction is
                                            * applied to the pointer, or to an offset that is already >= one
                                            * stride, so the unsigned offsets never wrap. */
     756           0 :                     if (jj == 0)
     757             :                     {
     758           0 :                         top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in - stride_in));
     759           0 :                         curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in - stride_in));
     760           0 :                         top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in) - stride_in));
     761           0 :                         top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in) - stride_in));
     762           0 :                         curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in - stride_in)));
     763           0 :                         curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in - stride_in)));
     764           0 :                         topCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + jj * strideInCr - strideInCr));
     765           0 :                         currCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + (1 + jj)*strideInCr - strideInCr));
     766           0 :                         topPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((jj)*strideInCr) - strideInCr));
     767           0 :                         topNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((jj)*strideInCr) - strideInCr));
     768           0 :                         currPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((1 + jj)*strideInCr - strideInCr)));
     769           0 :                         currNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((1 + jj)*strideInCr - strideInCr)));
     770             :                     }
     771           0 :                     bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in) - stride_in));
     772           0 :                     bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in) - stride_in));
     773           0 :                     bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in - stride_in));
     774           0 :                     ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut - strideOut);
     775           0 :                     bottomPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((2 + jj)*strideInCr) - strideInCr));
     776           0 :                     bottomNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((2 + jj)*strideInCr) - strideInCr));
     777           0 :                     bottomCr = _mm256_loadu_si256((__m256i*)((ptrInCr + kk) + (2 + jj)* strideInCr - strideInCr));
     778           0 :                     ptrDenoisedIntermCr = ptrDenoisedCr + kk + ((1 + jj)*strideOutCr - strideOutCr);
     779             :                 }
     780             : 
                                       /* Cb row. */
     781           0 :                 chroma_strong_avx2_intrin(
     782             :                     top,
     783             :                     curr,
     784             :                     bottom,
     785             :                     curr_prev,
     786             :                     curr_next,
     787             :                     top_prev,
     788             :                     top_next,
     789             :                     bottom_prev,
     790             :                     bottom_next,
     791             :                     ptrDenoisedInterm);
     792             : 
                                       /* Cr row. */
     793           0 :                 chroma_strong_avx2_intrin(
     794             :                     topCr,
     795             :                     currCr,
     796             :                     bottomCr,
     797             :                     currPrevCr,
     798             :                     currNextCr,
     799             :                     topPrevCr,
     800             :                     topNextCr,
     801             :                     bottomPrevCr,
     802             :                     bottomNextCr,
     803             :                     ptrDenoisedIntermCr);
     804             : 
                                       /* Slide both three-row windows down one row; only the bottom rows
                                        * (and their -1/+1 variants) are loaded on the next iteration. */
     805           0 :                 top = curr;
     806           0 :                 curr = bottom;
     807           0 :                 top_prev = curr_prev;
     808           0 :                 top_next = curr_next;
     809           0 :                 curr_prev = bottom_prev;
     810           0 :                 curr_next = bottom_next;
     811           0 :                 topCr = currCr;
     812           0 :                 currCr = bottomCr;
     813           0 :                 topPrevCr = currPrevCr;
     814           0 :                 topNextCr = currNextCr;
     815           0 :                 currPrevCr = bottomPrevCr;
     816           0 :                 currNextCr = bottomNextCr;
     817             :             }
     818             :         }
     819             : 
                               /* Restore the full band height for the border copy below. */
     820           0 :         sb_height = MIN(BLOCK_SIZE_64 / 2, picHeight - sb_origin_y);
     821             : 
                               /* Copy input to output for column 0, column picWidth - 1, and the
                                * band's last row when no plane row follows it. */
     822           0 :         for (jj = 0; jj < sb_height; jj++) {
     823           0 :             for (ii = 0; ii < picWidth; ii++) {
     824           0 :                 if (!((jj < sb_height - 1 || (sb_origin_y + sb_height) < picHeight) && ii > 0 && ii < picWidth - 1)) {
     825           0 :                     ptr_denoised[ii + jj * strideOut] = ptrIn[ii + jj * stride_in];
                                               /* Cr rows must be stepped with the Cr strides, like every other
                                                * Cr access in this function; the Cb strides are only correct
                                                * when both planes happen to share the same pitch. */
     826           0 :                     ptrDenoisedCr[ii + jj * strideOutCr] = ptrInCr[ii + jj * strideInCr];
     827             :                 }
     828             :             }
     829             :         }
     830             :     }
     831           0 : }
     832             : 
     833             : /*******************************************
     834             : * noise_extract_chroma_weak_avx2_intrin
     835             : *  Runs the chroma weak filter over the Cb and Cr planes for a band of up to BLOCK_SIZE_64 / 2 chroma rows starting at row sb_origin_y (sb_origin_x is ignored); border pixels are copied from the input unfiltered.
     836             : *******************************************/
     837           0 : void noise_extract_chroma_weak_avx2_intrin(
     838             :     EbPictureBufferDesc       *input_picture_ptr,
     839             :     EbPictureBufferDesc       *denoised_picture_ptr,
     840             :     uint32_t                       sb_origin_y,
     841             :     uint32_t                       sb_origin_x
     842             : )
     843             : {
     844             :     uint32_t  ii, jj, kk;
     845             :     uint32_t  picHeight, sb_height;
     846             :     uint32_t  picWidth;
     847             :     uint32_t  inputOriginIndex;
     848             :     uint32_t  inputOriginIndexPad;
     849             : 
     850             :     uint8_t *ptrIn, *ptrInCr;
     851             :     uint32_t stride_in, strideInCr;
     852             :     uint8_t *ptr_denoised, *ptrDenoisedInterm, *ptrDenoisedCr, *ptrDenoisedIntermCr;
     853             : 
     854             :     uint32_t strideOut, strideOutCr;
     855             :     // Each plane keeps a sliding 3-row window (top / curr / bottom) plus copies shifted by -1 and +1 pixel.
     856             :     __m256i top, curr, bottom, curr_prev, curr_next, top_prev, top_next, bottom_prev, bottom_next,
     857             :         topCr, currCr, bottomCr, currPrevCr, currNextCr, topPrevCr, topNextCr, bottomPrevCr, bottomNextCr;
     858             :     (void)sb_origin_x;
     859             :     // 3x3 Gaussian kernel noted by the original author (weights 1-2-1 / 2-4-2 / 1-2-1, sum 16):
     860             :     //a = (1 * p[0] + 2 * p[1] + 1 * p[2] +
     861             :     //    2 * p[0 + stride] + 4 * p[1 + stride] + 2 * p[2 + stride] +
     862             :     //    1 * p[0 + 2 * stride] + 2 * p[1 + 2 * stride] + 1 * p[2 + 2 * stride]) / 16;
     863             : 
     864             :     {
     865           0 :         picHeight = input_picture_ptr->height / 2;
     866           0 :         picWidth = input_picture_ptr->width / 2;
     867             :         // Rows in this band: at most BLOCK_SIZE_64 / 2, clipped to what is left of the picture.
     868           0 :         sb_height = MIN(BLOCK_SIZE_64 / 2, picHeight - sb_origin_y);
     869             :         // The filtering loop runs one row fewer for the first band and for a band that reaches the picture bottom.
     870           0 :         sb_height = ((sb_origin_y + BLOCK_SIZE_64 / 2 >= picHeight) || (sb_origin_y == 0)) ? sb_height - 1 : sb_height;
     871           0 :         stride_in = input_picture_ptr->stride_cb;
     872           0 :         inputOriginIndex = input_picture_ptr->origin_x / 2 + (input_picture_ptr->origin_y / 2 + sb_origin_y)* input_picture_ptr->stride_cb;
     873           0 :         ptrIn = &(input_picture_ptr->buffer_cb[inputOriginIndex]);
     874             : 
     875           0 :         inputOriginIndexPad = denoised_picture_ptr->origin_x / 2 + (denoised_picture_ptr->origin_y / 2 + sb_origin_y)* denoised_picture_ptr->stride_cb;
     876           0 :         strideOut = denoised_picture_ptr->stride_cb;
     877           0 :         ptr_denoised = &(denoised_picture_ptr->buffer_cb[inputOriginIndexPad]);
     878           0 :         ptrDenoisedInterm = ptr_denoised;
     879             : 
     880           0 :         strideInCr = input_picture_ptr->stride_cr;
     881           0 :         inputOriginIndex = input_picture_ptr->origin_x / 2 + (input_picture_ptr->origin_y / 2 + sb_origin_y)  * input_picture_ptr->stride_cr;
     882           0 :         ptrInCr = &(input_picture_ptr->buffer_cr[inputOriginIndex]);
     883             : 
     884           0 :         inputOriginIndexPad = denoised_picture_ptr->origin_x / 2 + (denoised_picture_ptr->origin_y / 2 + sb_origin_y)  * denoised_picture_ptr->stride_cr;
     885           0 :         strideOutCr = denoised_picture_ptr->stride_cr;
     886           0 :         ptrDenoisedCr = &(denoised_picture_ptr->buffer_cr[inputOriginIndexPad]);
     887           0 :         ptrDenoisedIntermCr = ptrDenoisedCr;
     888             : 
     889           0 :         top = curr = top_next = top_prev = curr_next = curr_prev = topCr = currCr = topNextCr = topPrevCr = currNextCr = currPrevCr = _mm256_setzero_si256();
     890           0 :         for (kk = 0; kk + BLOCK_SIZE_64 / 2 <= picWidth; kk += BLOCK_SIZE_64 / 2) // only full BLOCK_SIZE_64 / 2 wide column groups
     891             :         {
     892           0 :             for (jj = 0; jj < sb_height; jj++)
     893             :             {
     894           0 :                 if (sb_origin_y == 0)
     895             :                 {
     896           0 :                     if (jj == 0)
     897             :                     {
     898           0 :                         top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in));
     899           0 :                         curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in));
     900           0 :                         top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in)));
     901           0 :                         top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in)));
     902           0 :                         curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in)));
     903           0 :                         curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in)));
     904           0 :                         _mm256_storeu_si256((__m256i *)(ptr_denoised + kk), top); // first picture row is written out unfiltered
     905           0 :                         topCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + jj * strideInCr));
     906           0 :                         currCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + (1 + jj)*strideInCr));
     907           0 :                         topPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((jj)*strideInCr)));
     908           0 :                         topNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((jj)*strideInCr)));
     909           0 :                         currPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((1 + jj)*strideInCr)));
     910           0 :                         currNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((1 + jj)*strideInCr)));
     911           0 :                         _mm256_storeu_si256((__m256i *)(ptrDenoisedCr + kk), topCr); // same for Cr
     912             :                     }
     913           0 :                     bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in)));
     914           0 :                     bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in)));
     915           0 :                     bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in));
     916           0 :                     ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut);
     917           0 :                     bottomPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((2 + jj)*strideInCr)));
     918           0 :                     bottomNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((2 + jj)*strideInCr)));
     919           0 :                     bottomCr = _mm256_loadu_si256((__m256i*)((ptrInCr + kk) + (2 + jj)* strideInCr));
     920           0 :                     ptrDenoisedIntermCr = ptrDenoisedCr + kk + ((1 + jj)*strideOutCr);
     921             :                 }
     922             :                 else // not the first band: every address is shifted up by one stride, so the window starts on the row above the band
     923             :                 {
     924           0 :                     if (jj == 0)
     925             :                     {
     926           0 :                         top = _mm256_loadu_si256((__m256i*)(ptrIn + kk + jj * stride_in - stride_in));
     927           0 :                         curr = _mm256_loadu_si256((__m256i*)(ptrIn + kk + (1 + jj)*stride_in - stride_in));
     928           0 :                         top_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((jj)*stride_in) - stride_in));
     929           0 :                         top_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((jj)*stride_in) - stride_in));
     930           0 :                         curr_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((1 + jj)*stride_in - stride_in)));
     931           0 :                         curr_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((1 + jj)*stride_in - stride_in)));
     932           0 :                         topCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + jj * strideInCr - strideInCr));
     933           0 :                         currCr = _mm256_loadu_si256((__m256i*)(ptrInCr + kk + (1 + jj)*strideInCr - strideInCr));
     934           0 :                         topPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((jj)*strideInCr) - strideInCr));
     935           0 :                         topNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((jj)*strideInCr) - strideInCr));
     936           0 :                         currPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((1 + jj)*strideInCr - strideInCr)));
     937           0 :                         currNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((1 + jj)*strideInCr - strideInCr)));
     938             :                     }
     939           0 :                     bottom_prev = _mm256_loadu_si256((__m256i*)(ptrIn - 1 + kk + ((2 + jj)*stride_in) - stride_in));
     940           0 :                     bottom_next = _mm256_loadu_si256((__m256i*)(ptrIn + 1 + kk + ((2 + jj)*stride_in) - stride_in));
     941           0 :                     bottom = _mm256_loadu_si256((__m256i*)((ptrIn + kk) + (2 + jj)* stride_in - stride_in));
     942           0 :                     ptrDenoisedInterm = ptr_denoised + kk + ((1 + jj)*strideOut - strideOut);
     943           0 :                     bottomPrevCr = _mm256_loadu_si256((__m256i*)(ptrInCr - 1 + kk + ((2 + jj)*strideInCr) - strideInCr));
     944           0 :                     bottomNextCr = _mm256_loadu_si256((__m256i*)(ptrInCr + 1 + kk + ((2 + jj)*strideInCr) - strideInCr));
     945           0 :                     bottomCr = _mm256_loadu_si256((__m256i*)((ptrInCr + kk) + (2 + jj)* strideInCr - strideInCr));
     946           0 :                     ptrDenoisedIntermCr = ptrDenoisedCr + kk + ((1 + jj)*strideOutCr - strideOutCr);
     947             :                 }
     948             :                 // Hand the Cb window, then the Cr window, to the filter; the last argument is the output row for the "curr" line.
     949           0 :                 chroma_weak_luma_strong_filter_avx2_intrin(
     950             :                     top,
     951             :                     curr,
     952             :                     bottom,
     953             :                     curr_prev,
     954             :                     curr_next,
     955             :                     top_prev,
     956             :                     top_next,
     957             :                     bottom_prev,
     958             :                     bottom_next,
     959             :                     ptrDenoisedInterm);
     960             : 
     961           0 :                 chroma_weak_luma_strong_filter_avx2_intrin(
     962             :                     topCr,
     963             :                     currCr,
     964             :                     bottomCr,
     965             :                     currPrevCr,
     966             :                     currNextCr,
     967             :                     topPrevCr,
     968             :                     topNextCr,
     969             :                     bottomPrevCr,
     970             :                     bottomNextCr,
     971             :                     ptrDenoisedIntermCr);
     972             :                 // Slide both windows down one row so the next iteration only has to load a new bottom row.
     973           0 :                 top = curr;
     974           0 :                 curr = bottom;
     975           0 :                 top_prev = curr_prev;
     976           0 :                 top_next = curr_next;
     977           0 :                 curr_prev = bottom_prev;
     978           0 :                 curr_next = bottom_next;
     979           0 :                 topCr = currCr;
     980           0 :                 currCr = bottomCr;
     981           0 :                 topPrevCr = currPrevCr;
     982           0 :                 topNextCr = currNextCr;
     983           0 :                 currPrevCr = bottomPrevCr;
     984           0 :                 currNextCr = bottomNextCr;
     985             :             }
     986             :         }
     987             :         // Border pass over the full (unreduced) band: copy input to output for the first and last column, and for the band's last row when the band reaches the picture bottom.
     988           0 :         sb_height = MIN(BLOCK_SIZE_64 / 2, picHeight - sb_origin_y);
     989           0 :         for (jj = 0; jj < sb_height; jj++) {
     990           0 :             for (ii = 0; ii < picWidth; ii++) {
     991           0 :                 if (!((jj < sb_height - 1 || (sb_origin_y + sb_height) < picHeight) && ii > 0 && ii < picWidth - 1)) {
     992           0 :                     ptr_denoised[ii + jj * strideOut] = ptrIn[ii + jj * stride_in];
     993           0 :                     ptrDenoisedCr[ii + jj * strideOutCr] = ptrInCr[ii + jj * strideInCr]; // Cr output rows use the Cr stride, matching ptrDenoisedIntermCr above
     994             :                 }
     995             :             }
     996             :         }
     997             :     }
     998           0 : }

Generated by: LCOV version 1.14