LCOV - code coverage report
Current view: top level - ASM_AVX2 - selfguided_avx2.c (source / functions)
Test: coverage.info
Date: 2019-11-25 17:38:06
Coverage:          Hit     Total   Coverage
  Lines:           344       515     66.8 %
  Functions:        11        12     91.7 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include "EbDefinitions.h"
      13             : #include <immintrin.h>
      14             : #include "aom_dsp_rtcd.h"
      15             : #include "EbRestoration.h"
      16             : #include "synonyms.h"
      17             : #include "synonyms_avx2.h"
      18             : #include "transpose_avx2.h"
      19             : #include "transpose_sse2.h"
      20             : 
      21    15731100 : static INLINE void cvt_16to32bit_8x8(const __m128i s[8], __m256i r[8]) {
      22    15731100 :     r[0] = _mm256_cvtepu16_epi32(s[0]);
      23    15731100 :     r[1] = _mm256_cvtepu16_epi32(s[1]);
      24    15731100 :     r[2] = _mm256_cvtepu16_epi32(s[2]);
      25    15731100 :     r[3] = _mm256_cvtepu16_epi32(s[3]);
      26    15731100 :     r[4] = _mm256_cvtepu16_epi32(s[4]);
      27    15731100 :     r[5] = _mm256_cvtepu16_epi32(s[5]);
      28    15731100 :     r[6] = _mm256_cvtepu16_epi32(s[6]);
      29    15731100 :     r[7] = _mm256_cvtepu16_epi32(s[7]);
      30    15731100 : }
      31             : 
      32    31440100 : static INLINE void add_32bit_8x8(const __m256i neighbor, __m256i r[8]) {
      33    31440100 :     r[0] = _mm256_add_epi32(neighbor, r[0]);
      34    31440100 :     r[1] = _mm256_add_epi32(r[0], r[1]);
      35    31440100 :     r[2] = _mm256_add_epi32(r[1], r[2]);
      36    31440100 :     r[3] = _mm256_add_epi32(r[2], r[3]);
      37    31440100 :     r[4] = _mm256_add_epi32(r[3], r[4]);
      38    31440100 :     r[5] = _mm256_add_epi32(r[4], r[5]);
      39    31440100 :     r[6] = _mm256_add_epi32(r[5], r[6]);
      40    31440100 :     r[7] = _mm256_add_epi32(r[6], r[7]);
      41    31440100 : }
      42             : 
      43    15730200 : static INLINE void store_32bit_8x8(const __m256i r[8], int32_t *const buf,
      44             :     const int32_t buf_stride) {
      45    15730200 :     _mm256_store_si256((__m256i *)(buf + 0 * buf_stride), r[0]);
      46    15730200 :     _mm256_store_si256((__m256i *)(buf + 1 * buf_stride), r[1]);
      47    15730200 :     _mm256_store_si256((__m256i *)(buf + 2 * buf_stride), r[2]);
      48    15730200 :     _mm256_store_si256((__m256i *)(buf + 3 * buf_stride), r[3]);
      49    15730200 :     _mm256_store_si256((__m256i *)(buf + 4 * buf_stride), r[4]);
      50    15730200 :     _mm256_store_si256((__m256i *)(buf + 5 * buf_stride), r[5]);
      51    15730200 :     _mm256_store_si256((__m256i *)(buf + 6 * buf_stride), r[6]);
      52    15730200 :     _mm256_store_si256((__m256i *)(buf + 7 * buf_stride), r[7]);
      53    15730200 : }
      54             : 
      55             : static AOM_FORCE_INLINE void integral_images(const uint8_t *src,
      56             :     int32_t src_stride, int32_t width, int32_t height, int32_t *C, int32_t *D,
      57             :     int32_t buf_stride) {
      58      186960 :     const uint8_t *srcT = src;
      59      186960 :     int32_t *CT = C + buf_stride + 1;
      60      186960 :     int32_t *DT = D + buf_stride + 1;
      61             : 
      62      186960 :     memset(C, 0, sizeof(*C) * (width + 8));
      63      186960 :     memset(D, 0, sizeof(*D) * (width + 8));
      64             : 
      65      186960 :     int y = 0;
      66             :     do {
      67     1138560 :         __m256i CLeft = _mm256_setzero_si256();
      68     1138560 :         __m256i DLeft = _mm256_setzero_si256();
      69             : 
      70             :         // Zero the left column.
      71     1138560 :         CT[0 * buf_stride - 1] = DT[0 * buf_stride - 1] = 0;
      72     1138560 :         CT[1 * buf_stride - 1] = DT[1 * buf_stride - 1] = 0;
      73     1138560 :         CT[2 * buf_stride - 1] = DT[2 * buf_stride - 1] = 0;
      74     1138560 :         CT[3 * buf_stride - 1] = DT[3 * buf_stride - 1] = 0;
      75     1138560 :         CT[4 * buf_stride - 1] = DT[4 * buf_stride - 1] = 0;
      76     1138560 :         CT[5 * buf_stride - 1] = DT[5 * buf_stride - 1] = 0;
      77     1138560 :         CT[6 * buf_stride - 1] = DT[6 * buf_stride - 1] = 0;
      78     1138560 :         CT[7 * buf_stride - 1] = DT[7 * buf_stride - 1] = 0;
      79             : 
      80     1138560 :         int x = 0;
      81             :         do {
      82             :             __m128i s[8];
      83             :             __m256i r32[8];
      84             : 
      85     7869840 :             s[0] = _mm_loadl_epi64((__m128i *)(srcT + 0 * src_stride + x));
      86     7869840 :             s[1] = _mm_loadl_epi64((__m128i *)(srcT + 1 * src_stride + x));
      87     7869840 :             s[2] = _mm_loadl_epi64((__m128i *)(srcT + 2 * src_stride + x));
      88     7869840 :             s[3] = _mm_loadl_epi64((__m128i *)(srcT + 3 * src_stride + x));
      89     7869840 :             s[4] = _mm_loadl_epi64((__m128i *)(srcT + 4 * src_stride + x));
      90     7869840 :             s[5] = _mm_loadl_epi64((__m128i *)(srcT + 5 * src_stride + x));
      91     7869840 :             s[6] = _mm_loadl_epi64((__m128i *)(srcT + 6 * src_stride + x));
      92     7869840 :             s[7] = _mm_loadl_epi64((__m128i *)(srcT + 7 * src_stride + x));
      93             : 
      94     7869840 :             partial_transpose_8bit_8x8(s, s);
      95             : 
      96    15741200 :             s[7] = _mm_unpackhi_epi8(s[3], _mm_setzero_si128());
      97    15741200 :             s[6] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
      98    15741200 :             s[5] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
      99    15741200 :             s[4] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
     100    15741200 :             s[3] = _mm_unpackhi_epi8(s[1], _mm_setzero_si128());
     101    15741200 :             s[2] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
     102    15741200 :             s[1] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
     103     7870610 :             s[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
     104             : 
     105     7870610 :             cvt_16to32bit_8x8(s, r32);
     106     7870400 :             add_32bit_8x8(DLeft, r32);
     107     7870310 :             DLeft = r32[7];
     108             : 
     109     7870310 :             transpose_32bit_8x8_avx2(r32, r32);
     110             : 
     111             :             const __m256i DTop =
     112     7870400 :                 _mm256_load_si256((__m256i *)(DT - buf_stride + x));
     113     7870400 :             add_32bit_8x8(DTop, r32);
     114     7869730 :             store_32bit_8x8(r32, DT + x, buf_stride);
     115             : 
     116     7869230 :             s[0] = _mm_mullo_epi16(s[0], s[0]);
     117     7869230 :             s[1] = _mm_mullo_epi16(s[1], s[1]);
     118     7869230 :             s[2] = _mm_mullo_epi16(s[2], s[2]);
     119     7869230 :             s[3] = _mm_mullo_epi16(s[3], s[3]);
     120     7869230 :             s[4] = _mm_mullo_epi16(s[4], s[4]);
     121     7869230 :             s[5] = _mm_mullo_epi16(s[5], s[5]);
     122     7869230 :             s[6] = _mm_mullo_epi16(s[6], s[6]);
     123     7869230 :             s[7] = _mm_mullo_epi16(s[7], s[7]);
     124             : 
     125     7869230 :             cvt_16to32bit_8x8(s, r32);
     126     7869430 :             add_32bit_8x8(CLeft, r32);
     127     7869540 :             CLeft = r32[7];
     128             : 
     129     7869540 :             transpose_32bit_8x8_avx2(r32, r32);
     130             : 
     131             :             const __m256i CTop =
     132     7870260 :                 _mm256_load_si256((__m256i *)(CT - buf_stride + x));
     133     7870260 :             add_32bit_8x8(CTop, r32);
     134     7869690 :             store_32bit_8x8(r32, CT + x, buf_stride);
     135     7869840 :             x += 8;
     136     7869840 :         } while (x < width);
     137             : 
     138     1138560 :         srcT += 8 * src_stride;
     139     1138560 :         CT += 8 * buf_stride;
     140     1138560 :         DT += 8 * buf_stride;
     141     1138560 :         y += 8;
     142     1138560 :     } while (y < height);
     143      186958 : }
     144             : 
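For reference, the 8x8 tile arithmetic above (per-tile prefix sums combined with the tile to the left and the tile above) builds the integral images described at the call site: D holds running sums and C running sums of squares. A minimal scalar sketch of the same recurrence, assuming srcT, CT and DT are still at their initial values and the border row/column is zeroed as set up above:

    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            const int32_t v = srcT[i * src_stride + j];
            DT[i * buf_stride + j] = v
                + DT[(i - 1) * buf_stride + j]         // integral value above
                + DT[i * buf_stride + (j - 1)]         // integral value to the left
                - DT[(i - 1) * buf_stride + (j - 1)];  // overlap counted twice
            CT[i * buf_stride + j] = v * v
                + CT[(i - 1) * buf_stride + j]
                + CT[i * buf_stride + (j - 1)]
                - CT[(i - 1) * buf_stride + (j - 1)];
        }
    }
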
     145             : static AOM_FORCE_INLINE void integral_images_highbd(const uint16_t *src,
     146             :     int32_t src_stride, int32_t width, int32_t height, int32_t *C, int32_t *D,
     147             :     int32_t buf_stride) {
     148           0 :     const uint16_t *srcT = src;
     149           0 :     int32_t *CT = C + buf_stride + 1;
     150           0 :     int32_t *DT = D + buf_stride + 1;
     151             : 
     152           0 :     memset(C, 0, sizeof(*C) * (width + 8));
     153           0 :     memset(D, 0, sizeof(*D) * (width + 8));
     154             : 
     155           0 :     int y = 0;
     156             :     do {
     157           0 :         __m256i CLeft = _mm256_setzero_si256();
     158           0 :         __m256i DLeft = _mm256_setzero_si256();
     159             : 
     160             :         // Zero the left column.
     161           0 :         CT[0 * buf_stride - 1] = DT[0 * buf_stride - 1] = 0;
     162           0 :         CT[1 * buf_stride - 1] = DT[1 * buf_stride - 1] = 0;
     163           0 :         CT[2 * buf_stride - 1] = DT[2 * buf_stride - 1] = 0;
     164           0 :         CT[3 * buf_stride - 1] = DT[3 * buf_stride - 1] = 0;
     165           0 :         CT[4 * buf_stride - 1] = DT[4 * buf_stride - 1] = 0;
     166           0 :         CT[5 * buf_stride - 1] = DT[5 * buf_stride - 1] = 0;
     167           0 :         CT[6 * buf_stride - 1] = DT[6 * buf_stride - 1] = 0;
     168           0 :         CT[7 * buf_stride - 1] = DT[7 * buf_stride - 1] = 0;
     169             : 
     170           0 :         int x = 0;
     171             :         do {
     172             :             __m128i s[8];
     173             :             __m256i r32[8], a32[8];
     174             : 
     175           0 :             s[0] = _mm_loadu_si128((__m128i *)(srcT + 0 * src_stride + x));
     176           0 :             s[1] = _mm_loadu_si128((__m128i *)(srcT + 1 * src_stride + x));
     177           0 :             s[2] = _mm_loadu_si128((__m128i *)(srcT + 2 * src_stride + x));
     178           0 :             s[3] = _mm_loadu_si128((__m128i *)(srcT + 3 * src_stride + x));
     179           0 :             s[4] = _mm_loadu_si128((__m128i *)(srcT + 4 * src_stride + x));
     180           0 :             s[5] = _mm_loadu_si128((__m128i *)(srcT + 5 * src_stride + x));
     181           0 :             s[6] = _mm_loadu_si128((__m128i *)(srcT + 6 * src_stride + x));
     182           0 :             s[7] = _mm_loadu_si128((__m128i *)(srcT + 7 * src_stride + x));
     183             : 
     184           0 :             transpose_16bit_8x8(s, s);
     185             : 
     186           0 :             cvt_16to32bit_8x8(s, r32);
     187             : 
     188           0 :             a32[0] = _mm256_madd_epi16(r32[0], r32[0]);
     189           0 :             a32[1] = _mm256_madd_epi16(r32[1], r32[1]);
     190           0 :             a32[2] = _mm256_madd_epi16(r32[2], r32[2]);
     191           0 :             a32[3] = _mm256_madd_epi16(r32[3], r32[3]);
     192           0 :             a32[4] = _mm256_madd_epi16(r32[4], r32[4]);
     193           0 :             a32[5] = _mm256_madd_epi16(r32[5], r32[5]);
     194           0 :             a32[6] = _mm256_madd_epi16(r32[6], r32[6]);
     195           0 :             a32[7] = _mm256_madd_epi16(r32[7], r32[7]);
     196             : 
     197           0 :             add_32bit_8x8(CLeft, a32);
     198           0 :             CLeft = a32[7];
     199             : 
     200           0 :             transpose_32bit_8x8_avx2(a32, a32);
     201             : 
     202             :             const __m256i CTop =
     203           0 :                 _mm256_load_si256((__m256i *)(CT - buf_stride + x));
     204           0 :             add_32bit_8x8(CTop, a32);
     205           0 :             store_32bit_8x8(a32, CT + x, buf_stride);
     206             : 
     207           0 :             add_32bit_8x8(DLeft, r32);
     208           0 :             DLeft = r32[7];
     209             : 
     210           0 :             transpose_32bit_8x8_avx2(r32, r32);
     211             : 
     212             :             const __m256i DTop =
     213           0 :                 _mm256_load_si256((__m256i *)(DT - buf_stride + x));
     214           0 :             add_32bit_8x8(DTop, r32);
     215           0 :             store_32bit_8x8(r32, DT + x, buf_stride);
     216           0 :             x += 8;
     217           0 :         } while (x < width);
     218             : 
     219           0 :         srcT += 8 * src_stride;
     220           0 :         CT += 8 * buf_stride;
     221           0 :         DT += 8 * buf_stride;
     222           0 :         y += 8;
     223           0 :     } while (y < height);
     224           0 : }
     225             : 
     226             : // Compute 8 values of boxsum from the given integral image. ii should point
     227             : // at the middle of the box (for the first value). r is the box radius.
     228   138282000 : static INLINE __m256i boxsum_from_ii(const int32_t *ii, int32_t stride,
     229             :     int32_t r) {
     230   138282000 :     const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
     231   138252000 :     const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
     232   138176000 :     const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
     233   138126000 :     const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
     234   138084000 :     const __m256i u = _mm256_sub_epi32(tr, tl);
     235   138084000 :     const __m256i v = _mm256_sub_epi32(br, bl);
     236   138084000 :     return _mm256_sub_epi32(v, u);
     237             : }
     238             : 
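Per lane, this is the usual four-corner integral-image identity: with ii pointing at the box centre, the (2r + 1) x (2r + 1) window sum is recovered from four loads regardless of r,

    boxsum = ii[r * stride + r]                   /* br */
           - ii[r * stride - (r + 1)]             /* bl */
           - ii[-(r + 1) * stride + r]            /* tr */
           + ii[-(r + 1) * stride - (r + 1)];     /* tl */

which is exactly (br - bl) - (tr - tl) as computed above.
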
     239     1069420 : static INLINE __m256i round_for_shift(unsigned shift) {
     240     2138840 :     return _mm256_set1_epi32((1 << shift) >> 1);
     241             : }
     242             : 
     243    69096000 : static INLINE __m256i compute_p(__m256i sum1, __m256i sum2, int32_t n) {
     244    69096000 :     const __m256i bb = _mm256_madd_epi16(sum1, sum1);
     245   138192000 :     const __m256i an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
     246    69096000 :     return _mm256_sub_epi32(an, bb);
     247             : }
     248             : 
     249           0 : static INLINE __m256i compute_p_highbd(__m256i sum1, __m256i sum2,
     250             :     int32_t bit_depth, int32_t n) {
     251           0 :     const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
     252           0 :     const __m256i rounding_b = round_for_shift(bit_depth - 8);
     253           0 :     const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
     254           0 :     const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
     255             :     const __m256i a =
     256           0 :         _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
     257             :     const __m256i b =
     258           0 :         _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
     259             :     // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
     260             :     // mullo to square it
     261           0 :     const __m256i bb = _mm256_madd_epi16(b, b);
     262             :     const __m256i an =
     263           0 :         _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);
     264           0 :     return _mm256_sub_epi32(an, bb);
     265             : }
     266             : 
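Both variants compute p = n * sum2 - sum1 * sum1 for the n-pixel box, i.e. n^2 times the box variance (sum1 is the box sum, sum2 the box sum of squares). The 16-bit madd for sum1 * sum1 is safe in the 8-bit path because the box sum is at most 25 * 255 < 2^15. The high-bit-depth variant first shifts sum1 down by (bit_depth - 8) and sum2 by 2 * (bit_depth - 8), with rounding, so p stays on an 8-bit scale, and takes a max against bb so p cannot go negative after the rounding.
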
     267             : // Assumes that C, D are integral images for the original buffer which has been
     268             : // extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
     269             : // on the sides. A, B, C, D point at logical position (0, 0).
     270             : static AOM_FORCE_INLINE void calc_ab(int32_t *A, int32_t *B, const int32_t *C,
     271             :     const int32_t *D, int32_t width, int32_t height, int32_t buf_stride,
     272             :     int32_t bit_depth, int32_t sgr_params_idx, int32_t radius_idx) {
     273      160141 :     const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
     274      160141 :     const int32_t r = params->r[radius_idx];
     275      160141 :     const int32_t n = (2 * r + 1) * (2 * r + 1);
     276      160141 :     const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
      277             :     // eb_one_by_x[n - 1] is 2^12 / n, which easily fits in an int16
     278      160141 :     const __m256i one_over_n = _mm256_set1_epi32(eb_one_by_x[n - 1]);
     279      160141 :     const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
     280      160142 :     const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
     281             : 
     282      160141 :     A -= buf_stride + 1;
     283      160141 :     B -= buf_stride + 1;
     284      160141 :     C -= buf_stride + 1;
     285      160141 :     D -= buf_stride + 1;
     286             : 
     287      160141 :     int32_t i = height + 2;
     288             : 
     289      160141 :     if (bit_depth == 8) {
     290             :         do {
     291     6823740 :             int32_t j = 0;
     292             :             do {
     293    48070700 :                 const __m256i sum1 = boxsum_from_ii(D + j, buf_stride, r);
     294    48041200 :                 const __m256i sum2 = boxsum_from_ii(C + j, buf_stride, r);
     295    47973500 :                 const __m256i p = compute_p(sum1, sum2, n);
     296   239906000 :                 const __m256i z = _mm256_min_epi32(
     297             :                     _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
     298             :                         SGRPROJ_MTABLE_BITS),
     299             :                     _mm256_set1_epi32(255));
     300    47981200 :                 const __m256i a_res = _mm256_i32gather_epi32(eb_x_by_xplus1, z, 4);
     301    47981200 :                 yy_storeu_256(A + j, a_res);
     302             : 
     303             :                 const __m256i a_complement =
     304    96085200 :                     _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
     305             : 
     306             :                 // sum1 might have lanes greater than 2^15, so we can't use madd to do
     307             :                 // multiplication involving sum1. However, a_complement and one_over_n
     308             :                 // are both less than 256, so we can multiply them first.
     309    48042600 :                 const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
     310    48042600 :                 const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
     311    48042600 :                 const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
     312             :                     SGRPROJ_RECIP_BITS);
     313    48042600 :                 yy_storeu_256(B + j, b_res);
     314    48070600 :                 j += 8;
     315    48070600 :             } while (j < width + 2);
     316             : 
     317     6823590 :             A += buf_stride;
     318     6823590 :             B += buf_stride;
     319     6823590 :             C += buf_stride;
     320     6823590 :             D += buf_stride;
     321     6823590 :         } while (--i);
     322             :     }
     323             :     else {
     324             :         do {
     325           0 :             int32_t j = 0;
     326             :             do {
     327           0 :                 const __m256i sum1 = boxsum_from_ii(D + j, buf_stride, r);
     328           0 :                 const __m256i sum2 = boxsum_from_ii(C + j, buf_stride, r);
     329           0 :                 const __m256i p = compute_p_highbd(sum1, sum2, bit_depth, n);
     330           0 :                 const __m256i z = _mm256_min_epi32(
     331             :                     _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
     332             :                         SGRPROJ_MTABLE_BITS),
     333             :                     _mm256_set1_epi32(255));
     334           0 :                 const __m256i a_res = _mm256_i32gather_epi32(eb_x_by_xplus1, z, 4);
     335           0 :                 yy_storeu_256(A + j, a_res);
     336             : 
     337             :                 const __m256i a_complement =
     338           0 :                     _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
     339             : 
     340             :                 // sum1 might have lanes greater than 2^15, so we can't use madd to do
     341             :                 // multiplication involving sum1. However, a_complement and one_over_n
     342             :                 // are both less than 256, so we can multiply them first.
     343           0 :                 const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
     344           0 :                 const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
     345           0 :                 const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
     346             :                     SGRPROJ_RECIP_BITS);
     347           0 :                 yy_storeu_256(B + j, b_res);
     348           0 :                 j += 8;
     349           0 :             } while (j < width + 2);
     350             : 
     351         163 :             A += buf_stride;
     352         163 :             B += buf_stride;
     353         163 :             C += buf_stride;
     354         163 :             D += buf_stride;
     355         163 :         } while (--i);
     356             :     }
     357      160151 : }
     358             : 
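Per output position, the vector body above evaluates the sgrproj a/b pair; a scalar sketch of the 8-bit path, using the names from the code (boxsum() stands for boxsum_from_ii; rnd_z and rnd_res are the round_for_shift() constants):

    sum1 = boxsum(D);                          // box sum of pixels
    sum2 = boxsum(C);                          // box sum of squared pixels
    p    = n * sum2 - sum1 * sum1;             // see compute_p above
    z    = AOMMIN((p * s + rnd_z) >> SGRPROJ_MTABLE_BITS, 255);
    a    = eb_x_by_xplus1[z];
    b    = ((SGRPROJ_SGR - a) * eb_one_by_x[n - 1] * sum1 + rnd_res)
               >> SGRPROJ_RECIP_BITS;

A holds the per-pixel guidance weights and B the matching offsets consumed by final_filter below.
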
     359             : // Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
     360             : // where the outer four corners have weight 3 and all other pixels have weight
     361             : // 4.
     362             : //
     363             : // Pixels are indexed as follows:
     364             : // xtl  xt   xtr
     365             : // xl    x   xr
     366             : // xbl  xb   xbr
     367             : //
     368             : // buf points to x
     369             : //
     370             : // fours = xl + xt + xr + xb + x
     371             : // threes = xtl + xtr + xbr + xbl
     372             : // cross_sum = 4 * fours + 3 * threes
     373             : //           = 4 * (fours + threes) - threes
     374             : //           = (fours + threes) << 2 - threes
     375    78928000 : static INLINE __m256i cross_sum(const int32_t *buf, int32_t stride) {
     376    78928000 :     const __m256i xtl = yy_loadu_256(buf - 1 - stride);
     377    78921000 :     const __m256i xt = yy_loadu_256(buf - stride);
     378    78899700 :     const __m256i xtr = yy_loadu_256(buf + 1 - stride);
     379    78896200 :     const __m256i xl = yy_loadu_256(buf - 1);
     380    78898300 :     const __m256i x = yy_loadu_256(buf);
     381    78892500 :     const __m256i xr = yy_loadu_256(buf + 1);
     382    78881900 :     const __m256i xbl = yy_loadu_256(buf - 1 + stride);
     383    78876800 :     const __m256i xb = yy_loadu_256(buf + stride);
     384    78873200 :     const __m256i xbr = yy_loadu_256(buf + 1 + stride);
     385             : 
     386   315475000 :     const __m256i fours = _mm256_add_epi32(
     387             :         xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
     388             :     const __m256i threes =
     389   236607000 :         _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
     390             : 
     391   236607000 :     return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
     392             :         threes);
     393             : }
     394             : 
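The nine weights above sum to 4 * 5 + 3 * 4 = 32 = 2^5, which is where the nb = 5 normalization shift in final_filter below comes from.
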
     395             : // The final filter for self-guided restoration. Computes a weighted average
     396             : // across A, B with "cross sums" (see cross_sum implementation above).
     397             : static AOM_FORCE_INLINE void final_filter(int32_t *dst, int32_t dst_stride,
     398             :     const int32_t *A, const int32_t *B, int32_t buf_stride, const uint8_t *dgd8,
     399             :     int32_t dgd_stride, int32_t width, int32_t height, int32_t highbd) {
     400      160151 :     const int32_t nb = 5;
     401             :     const __m256i rounding =
     402      160151 :         round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
     403      160151 :     int32_t i = height;
     404             : 
     405      160151 :     if (!highbd) {
     406             :         do {
     407     6503410 :             int32_t j = 0;
     408             :             do {
     409    39542400 :                 const __m256i a = cross_sum(A + j, buf_stride);
     410    39511300 :                 const __m256i b = cross_sum(B + j, buf_stride);
     411    39496800 :                 const __m128i raw = xx_loadl_64(dgd8 + j);
      412    39494000 :                 const __m256i src = _mm256_cvtepu8_epi32(raw);
     413    39494000 :                 const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
     414    78988000 :                 const __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
     415             :                     SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
     416    39494000 :                 yy_storeu_256(dst + j, w);
     417    39542400 :                 j += 8;
     418    39542400 :             } while (j < width);
     419             : 
     420     6503370 :             A += buf_stride;
     421     6503370 :             B += buf_stride;
     422     6503370 :             dgd8 += dgd_stride;
     423     6503370 :             dst += dst_stride;
     424     6503370 :         } while (--i);
     425             :     }
     426             :     else {
     427           0 :         const uint16_t *dgd_real = CONVERT_TO_SHORTPTR(dgd8);
     428             : 
     429             :         do {
     430           0 :             int32_t j = 0;
     431             :             do {
     432           0 :                 const __m256i a = cross_sum(A + j, buf_stride);
     433           0 :                 const __m256i b = cross_sum(B + j, buf_stride);
     434           0 :                 const __m128i raw = xx_loadu_128(dgd_real + j);
     435           0 :                 const __m256i src = _mm256_cvtepu16_epi32(raw);
     436           0 :                 const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
     437           0 :                 const __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
     438             :                     SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
     439           0 :                 yy_storeu_256(dst + j, w);
     440           0 :                 j += 8;
     441           0 :             } while (j < width);
     442             : 
     443          37 :             A += buf_stride;
     444          37 :             B += buf_stride;
     445          37 :             dgd_real += dgd_stride;
     446          37 :             dst += dst_stride;
     447          37 :         } while (--i);
     448             :     }
     449      160150 : }
     450             : 
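Per pixel, the loops above compute

    dst = (cross_sum(A) * src + cross_sum(B) + rounding)
              >> (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);

The _mm256_madd_epi16(a, src) is safe as a plain 16 x 16 -> 32 multiply here: each lane of the A cross sum is at most 32 * SGRPROJ_SGR = 8192, and src is an 8-bit (or at most 12-bit, in the highbd path) pixel, so the upper 16 bits of both operands are zero.
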
     451             : // Assumes that C, D are integral images for the original buffer which has been
     452             : // extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
     453             : // on the sides. A, B, C, D point at logical position (0, 0).
     454             : static AOM_FORCE_INLINE void calc_ab_fast(int32_t *A, int32_t *B,
     455             :     const int32_t *C, const int32_t *D, int32_t width, int32_t height,
     456             :     int32_t buf_stride, int32_t bit_depth, int32_t sgr_params_idx,
     457             :     int32_t radius_idx) {
     458      143711 :     const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
     459      143711 :     const int32_t r = params->r[radius_idx];
     460      143711 :     const int32_t n = (2 * r + 1) * (2 * r + 1);
     461      143711 :     const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
      462             :     // eb_one_by_x[n - 1] is 2^12 / n, which easily fits in an int16
     463      143711 :     const __m256i one_over_n = _mm256_set1_epi32(eb_one_by_x[n - 1]);
     464      143711 :     const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
     465      143711 :     const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
     466             : 
     467      143711 :     A -= buf_stride + 1;
     468      143711 :     B -= buf_stride + 1;
     469      143711 :     C -= buf_stride + 1;
     470      143711 :     D -= buf_stride + 1;
     471             : 
     472      143711 :     int32_t i = 0;
     473      143711 :     if (bit_depth == 8) {
     474             :         do {
     475     3043490 :             int32_t j = 0;
     476             :             do {
     477    21356300 :                 const __m256i sum1 = boxsum_from_ii(D + j, buf_stride, r);
     478    21342000 :                 const __m256i sum2 = boxsum_from_ii(C + j, buf_stride, r);
     479    21328200 :                 const __m256i p = compute_p(sum1, sum2, n);
     480   106646000 :                 const __m256i z = _mm256_min_epi32(
     481             :                     _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
     482             :                         SGRPROJ_MTABLE_BITS),
     483             :                     _mm256_set1_epi32(255));
     484    21329200 :                 const __m256i a_res = _mm256_i32gather_epi32(eb_x_by_xplus1, z, 4);
     485    21329200 :                 yy_storeu_256(A + j, a_res);
     486             : 
     487             :                 const __m256i a_complement =
     488    42707800 :                     _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
     489             : 
     490             :                 // sum1 might have lanes greater than 2^15, so we can't use madd to do
     491             :                 // multiplication involving sum1. However, a_complement and one_over_n
     492             :                 // are both less than 256, so we can multiply them first.
     493    21353900 :                 const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
     494    21353900 :                 const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
     495    21353900 :                 const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
     496             :                     SGRPROJ_RECIP_BITS);
     497    21353900 :                 yy_storeu_256(B + j, b_res);
     498    21356300 :                 j += 8;
     499    21356300 :             } while (j < width + 2);
     500             : 
     501     3043520 :             A += 2 * buf_stride;
     502     3043520 :             B += 2 * buf_stride;
     503     3043520 :             C += 2 * buf_stride;
     504     3043520 :             D += 2 * buf_stride;
     505     3043520 :             i += 2;
     506     3043520 :         } while (i < height + 2);
     507             :     }
     508             :     else {
     509             :         do {
     510           0 :             int32_t j = 0;
     511             :             do {
     512          33 :                 const __m256i sum1 = boxsum_from_ii(D + j, buf_stride, r);
     513           0 :                 const __m256i sum2 = boxsum_from_ii(C + j, buf_stride, r);
     514           0 :                 const __m256i p = compute_p_highbd(sum1, sum2, bit_depth, n);
     515           0 :                 const __m256i z = _mm256_min_epi32(
     516             :                     _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
     517             :                         SGRPROJ_MTABLE_BITS),
     518             :                     _mm256_set1_epi32(255));
     519           0 :                 const __m256i a_res = _mm256_i32gather_epi32(eb_x_by_xplus1, z, 4);
     520           0 :                 yy_storeu_256(A + j, a_res);
     521             : 
     522             :                 const __m256i a_complement =
     523           0 :                     _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
     524             : 
     525             :                 // sum1 might have lanes greater than 2^15, so we can't use madd to do
     526             :                 // multiplication involving sum1. However, a_complement and one_over_n
     527             :                 // are both less than 256, so we can multiply them first.
     528           0 :                 const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
     529           0 :                 const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
     530           0 :                 const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
     531             :                     SGRPROJ_RECIP_BITS);
     532           0 :                 yy_storeu_256(B + j, b_res);
     533           0 :                 j += 8;
     534           0 :             } while (j < width + 2);
     535             : 
     536           0 :             A += 2 * buf_stride;
     537           0 :             B += 2 * buf_stride;
     538           0 :             C += 2 * buf_stride;
     539           0 :             D += 2 * buf_stride;
     540           0 :             i += 2;
     541           0 :         } while (i < height + 2);
     542             :     }
     543      143711 : }
     544             : 
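Unlike calc_ab, this fast variant computes a/b on every second row only (note the 2 * buf_stride steps). final_filter_fast below compensates: on output rows with no a/b entry it uses cross_sum_fast_even_row, which reads the computed rows directly above and below, and on rows that were computed it uses cross_sum_fast_odd_row, which reads just that row.
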
     545             : // Calculate 8 values of the "cross sum" starting at buf.
     546             : //
     547             : // Pixels are indexed like this:
     548             : // xtl  xt   xtr
     549             : //  -   buf   -
     550             : // xbl  xb   xbr
     551             : //
     552             : // Pixels are weighted like this:
     553             : //  5    6    5
     554             : //  0    0    0
     555             : //  5    6    5
     556             : //
     557             : // fives = xtl + xtr + xbl + xbr
     558             : // sixes = xt + xb
     559             : // cross_sum = 6 * sixes + 5 * fives
      560             : //           = 5 * (fives + sixes) + sixes
     561             : //           = (fives + sixes) << 2 + (fives + sixes) + sixes
     562    35037300 : static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf,
     563             :     int32_t stride) {
     564    35037300 :     const __m256i xtl = yy_loadu_256(buf - 1 - stride);
     565    35034200 :     const __m256i xt = yy_loadu_256(buf - stride);
     566    35027200 :     const __m256i xtr = yy_loadu_256(buf + 1 - stride);
     567    35023300 :     const __m256i xbl = yy_loadu_256(buf - 1 + stride);
     568    35020300 :     const __m256i xb = yy_loadu_256(buf + stride);
     569    35018000 :     const __m256i xbr = yy_loadu_256(buf + 1 + stride);
     570             : 
     571             :     const __m256i fives =
     572   105049000 :         _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
     573    35016400 :     const __m256i sixes = _mm256_add_epi32(xt, xb);
     574    35016400 :     const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
     575             : 
     576   105049000 :     return _mm256_add_epi32(
     577             :         _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
     578             :             fives_plus_sixes),
     579             :         sixes);
     580             : }
     581             : 
     582             : // Calculate 8 values of the "cross sum" starting at buf.
     583             : //
     584             : // Pixels are indexed like this:
     585             : // xl    x   xr
     586             : //
     587             : // Pixels are weighted like this:
     588             : //  5    6    5
     589             : //
     590             : // buf points to x
     591             : //
     592             : // fives = xl + xr
     593             : // sixes = x
     594             : // cross_sum = 5 * fives + 6 * sixes
     595             : //           = 4 * (fives + sixes) + (fives + sixes) + sixes
     596             : //           = (fives + sixes) << 2 + (fives + sixes) + sixes
     597    35032900 : static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) {
     598    35032900 :     const __m256i xl = yy_loadu_256(buf - 1);
     599    35032100 :     const __m256i x = yy_loadu_256(buf);
     600    35028500 :     const __m256i xr = yy_loadu_256(buf + 1);
     601             : 
     602    35026900 :     const __m256i fives = _mm256_add_epi32(xl, xr);
     603    35026900 :     const __m256i sixes = x;
     604             : 
     605    35026900 :     const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
     606             : 
     607   105081000 :     return _mm256_add_epi32(
     608             :         _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
     609             :             fives_plus_sixes),
     610             :         sixes);
     611             : }
     612             : 
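The even-row weights sum to 4 * 5 + 2 * 6 = 32 = 2^5 and the odd-row weights to 2 * 5 + 6 = 16 = 2^4, matching the nb0 = 5 and nb1 = 4 shifts in final_filter_fast below.
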
     613             : // The final filter for the self-guided restoration. Computes a
     614             : // weighted average across A, B with "cross sums" (see cross_sum_...
     615             : // implementations above).
     616             : static AOM_FORCE_INLINE void final_filter_fast(int32_t *dst, int32_t dst_stride,
     617             :     const int32_t *A, const int32_t *B, int32_t buf_stride, const uint8_t *dgd8,
     618             :     int32_t dgd_stride, int32_t width, int32_t height, int32_t highbd) {
     619      143711 :     const int32_t nb0 = 5;
     620      143711 :     const int32_t nb1 = 4;
     621             :     const __m256i rounding0 =
     622      143711 :         round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
     623             :     const __m256i rounding1 =
     624      143711 :         round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
     625      143711 :     int32_t i = 0;
     626             : 
     627      143711 :     if (!highbd) {
     628             :         do {
     629     5797380 :             if (!(i & 1)) {  // even row
     630     2898560 :                 int32_t j = 0;
     631             :                 do {
     632             :                     const __m256i a =
     633    17539900 :                         cross_sum_fast_even_row(A + j, buf_stride);
     634             :                     const __m256i b =
     635    17519000 :                         cross_sum_fast_even_row(B + j, buf_stride);
     636    17520000 :                     const __m128i raw = xx_loadl_64(dgd8 + j);
     637    17520300 :                     const __m256i src = _mm256_cvtepu8_epi32(raw);
     638    17520300 :                     const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
     639             :                     const __m256i w =
     640    35040600 :                         _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
     641             :                             SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
     642    17520300 :                     yy_storeu_256(dst + j, w);
     643    17531800 :                     j += 8;
     644    17531800 :                 } while (j < width);
     645             :             }
     646             :             else {  // odd row
     647     2898810 :                 int32_t j = 0;
     648             :                 do {
     649    17518000 :                     const __m256i a = cross_sum_fast_odd_row(A + j);
     650    17520400 :                     const __m256i b = cross_sum_fast_odd_row(B + j);
     651    17516500 :                     const __m128i raw = xx_loadl_64(dgd8 + j);
     652    17515900 :                     const __m256i src = _mm256_cvtepu8_epi32(raw);
     653    17515900 :                     const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
     654             :                     const __m256i w =
     655    35031900 :                         _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
     656             :                             SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
     657    17515900 :                     yy_storeu_256(dst + j, w);
     658    17526200 :                     j += 8;
     659    17526200 :                 } while (j < width);
     660             :             }
     661             : 
     662     5797370 :             A += buf_stride;
     663     5797370 :             B += buf_stride;
     664     5797370 :             dgd8 += dgd_stride;
     665     5797370 :             dst += dst_stride;
     666     5797370 :         } while (++i < height);
     667             :     }
     668             :     else {
     669           0 :         const uint16_t *dgd_real = CONVERT_TO_SHORTPTR(dgd8);
     670             : 
     671             :         do {
     672           0 :             if (!(i & 1)) {  // even row
     673           0 :                 int32_t j = 0;
     674             :                 do {
     675             :                     const __m256i a =
     676           0 :                         cross_sum_fast_even_row(A + j, buf_stride);
     677             :                     const __m256i b =
     678           0 :                         cross_sum_fast_even_row(B + j, buf_stride);
     679           0 :                     const __m128i raw = xx_loadu_128(dgd_real + j);
     680           0 :                     const __m256i src = _mm256_cvtepu16_epi32(raw);
     681           0 :                     const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
     682             :                     const __m256i w =
     683           0 :                         _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
     684             :                             SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
     685           0 :                     yy_storeu_256(dst + j, w);
     686           0 :                     j += 8;
     687           0 :                 } while (j < width);
     688             :             }
     689             :             else {  // odd row
     690           0 :                 int32_t j = 0;
     691             :                 do {
     692           1 :                     const __m256i a = cross_sum_fast_odd_row(A + j);
     693           0 :                     const __m256i b = cross_sum_fast_odd_row(B + j);
     694           0 :                     const __m128i raw = xx_loadu_128(dgd_real + j);
     695           0 :                     const __m256i src = _mm256_cvtepu16_epi32(raw);
     696           0 :                     const __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
     697             :                     const __m256i w =
     698           0 :                         _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
     699             :                             SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
     700           0 :                     yy_storeu_256(dst + j, w);
     701           0 :                     j += 8;
     702           0 :                 } while (j < width);
     703             :             }
     704             : 
     705           0 :             A += buf_stride;
     706           0 :             B += buf_stride;
     707           0 :             dgd_real += dgd_stride;
     708           0 :             dst += dst_stride;
     709           0 :         } while (++i < height);
     710             :     }
     711      143702 : }
     712             : 
     713      186960 : void eb_av1_selfguided_restoration_avx2(const uint8_t *dgd8, int32_t width,
     714             :     int32_t height, int32_t dgd_stride, int32_t *flt0, int32_t *flt1,
     715             :     int32_t flt_stride, int32_t sgr_params_idx, int32_t bit_depth,
     716             :     int32_t highbd) {
     717             :     // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
     718             :     // Ctl and Dtl is 32-byte aligned.
     719      186960 :     const int32_t buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
     720             : 
     721             :     DECLARE_ALIGNED(32, int32_t,
     722             :     buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]);
     723             : 
     724      186960 :     const int32_t width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
     725      186960 :     const int32_t height_ext = height + 2 * SGRPROJ_BORDER_VERT;
     726             : 
     727             :     // Adjusting the stride of A and B here appears to avoid bad cache effects,
     728             :     // leading to a significant speed improvement.
     729             :     // We also align the stride to a multiple of 32 bytes for efficiency.
     730      186960 :     int32_t buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);
     731             : 
     732             :     // The "tl" pointers point at the top-left of the initialised data for the
     733             :     // array.
     734      186960 :     int32_t *Atl = buf + 0 * buf_elts + 7;
     735      186960 :     int32_t *Btl = buf + 1 * buf_elts + 7;
     736      186960 :     int32_t *Ctl = buf + 2 * buf_elts + 7;
     737      186960 :     int32_t *Dtl = buf + 3 * buf_elts + 7;
     738             : 
     739             :     // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
     740             :     // there's a zero row and column in A, B (integral images), so we move down
     741             :     // and right one for them.
     742      186960 :     const int32_t buf_diag_border =
     743      186960 :         SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
     744             : 
     745      186960 :     int32_t *A0 = Atl + 1 + buf_stride;
     746      186960 :     int32_t *B0 = Btl + 1 + buf_stride;
     747      186960 :     int32_t *C0 = Ctl + 1 + buf_stride;
     748      186960 :     int32_t *D0 = Dtl + 1 + buf_stride;
     749             : 
     750             :     // Finally, A, B, C, D point at position (0, 0).
     751      186960 :     int32_t *A = A0 + buf_diag_border;
     752      186960 :     int32_t *B = B0 + buf_diag_border;
     753      186960 :     int32_t *C = C0 + buf_diag_border;
     754      186960 :     int32_t *D = D0 + buf_diag_border;
     755             : 
     756      186960 :     const int32_t dgd_diag_border =
     757      186960 :         SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
     758      186960 :     const uint8_t *dgd0 = dgd8 - dgd_diag_border;
     759             : 
     760             :     // Generate integral images from the input. C will contain sums of squares; D
     761             :     // will contain just sums
     762      186960 :     if (highbd)
     763           0 :         integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
     764             :             height_ext, Ctl, Dtl, buf_stride);
     765             :     else
     766             :         integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
     767             :             buf_stride);
     768             : 
     769      186958 :     const SgrParamsType *const params = &eb_sgr_params[sgr_params_idx];
     770             :     // Write to flt0 and flt1
     771             :     // If params->r == 0 we skip the corresponding filter. We only allow one of
     772             :     // the radii to be 0, as having both equal to 0 would be equivalent to
     773             :     // skipping SGR entirely.
     774             :     assert(!(params->r[0] == 0 && params->r[1] == 0));
     775             :     assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
     776             :     assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
     777             : 
     778      186958 :     if (params->r[0] > 0) {
     779             :         calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
     780             :             sgr_params_idx, 0);
     781             :         final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
     782             :             width, height, highbd);
     783             :     }
     784             : 
     785      186949 :     if (params->r[1] > 0) {
     786             :         calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
     787             :             1);
     788             :         final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
     789             :             height, highbd);
     790             :     }
     791      186958 : }
     792             : 
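The function below blends the two self-guided outputs with the source using the projection coefficients decoded into xq[0] and xq[1]. Per pixel it is, in scalar terms (a sketch; clamp and pixel_max stand in for the packing and clamping done at the end of the loop):

    u   = src << SGRPROJ_RST_BITS;
    v   = (u << SGRPROJ_PRJ_BITS)
        + xq[0] * (flt0 - u)                  // skipped when params->r[0] == 0
        + xq[1] * (flt1 - u);                 // skipped when params->r[1] == 0
    dst = clamp((v + rounding) >> (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS),
                0, pixel_max);
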
     793       14160 : void eb_apply_selfguided_restoration_avx2(const uint8_t *dat8, int32_t width,
     794             :     int32_t height, int32_t stride, int32_t eps, const int32_t *xqd,
     795             :     uint8_t *dst8, int32_t dst_stride, int32_t *tmpbuf, int32_t bit_depth,
     796             :     int32_t highbd) {
     797       14160 :     int32_t *flt0 = tmpbuf;
     798       14160 :     int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
     799             :     assert(width * height <= RESTORATION_UNITPELS_MAX);
     800       14160 :     eb_av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1,
     801             :         width, eps, bit_depth, highbd);
     802       14160 :     const SgrParamsType *const params = &eb_sgr_params[eps];
     803             :     int32_t xq[2];
     804       14160 :     eb_decode_xq(xqd, xq, params);
     805             : 
     806       14160 :     const __m256i xq0 = _mm256_set1_epi32(xq[0]);
     807       14160 :     const __m256i xq1 = _mm256_set1_epi32(xq[1]);
     808             :     const __m256i rounding =
     809       14160 :         round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
     810             : 
     811       14160 :     int32_t i = height;
     812             : 
     813       14160 :     if (!highbd) {
     814       14160 :         const __m256i idx = _mm256_setr_epi32(0, 4, 1, 5, 0, 0, 0, 0);
     815             : 
     816             :         do {
     817             :             // Calculate output in batches of 16 pixels
     818      619200 :             int32_t j = 0;
     819             :             do {
     820     2016000 :                 const __m128i src = xx_loadu_128(dat8 + j);
     821     2016000 :                 const __m256i ep_0 = _mm256_cvtepu8_epi32(src);
     822     4032000 :                 const __m256i ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src, 8));
     823     2016000 :                 const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
     824     2016000 :                 const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
     825     2016000 :                 __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
     826     2016000 :                 __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
     827             : 
     828     2016000 :                 if (params->r[0] > 0) {
     829     2004480 :                     const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[j + 0]), u_0);
     830     4008960 :                     const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[j + 8]), u_1);
     831     4008960 :                     v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
     832     4008960 :                     v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
     833             :                 }
     834             : 
     835     2016000 :                 if (params->r[1] > 0) {
     836     1673280 :                     const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[j + 0]), u_0);
     837     3346560 :                     const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[j + 8]), u_1);
     838     3346560 :                     v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
     839     3346560 :                     v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
     840             :                 }
     841             : 
     842     4032000 :                 const __m256i w_0 = _mm256_srai_epi32(
     843             :                     _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
     844     4032000 :                 const __m256i w_1 = _mm256_srai_epi32(
     845             :                     _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
     846             : 
     847             :                 // Pack into 8 bits and clamp to [0, 256)
     848             :                 // Note that each pack messes up the order of the bits,
     849             :                 // so we use a permute function to correct this
     850             :                 // 0, 1, 4, 5, 2, 3, 6, 7
     851     2016000 :                 const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
     852             :                 // 0, 1, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7
     853     2016000 :                 const __m256i tmp2 = _mm256_packus_epi16(tmp, tmp);
     854             :                 // 0, 1, 2, 3, 4, 5, 6, 7, ...
     855     2016000 :                 const __m256i tmp3 = _mm256_permutevar8x32_epi32(tmp2, idx);
     856     2016000 :                 const __m128i res = _mm256_castsi256_si128(tmp3);
     857     2016000 :                 xx_storeu_128(dst8 + j, res);
     858     2016000 :                 j += 16;
     859     2016000 :             } while (j < width);
     860             : 
     861      619200 :             dat8 += stride;
     862      619200 :             flt0 += width;
     863      619200 :             flt1 += width;
     864      619200 :             dst8 += dst_stride;
     865      619200 :         } while (--i);
     866             :     }
     867             :     else {
     868           0 :         const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
     869           0 :         const uint16_t *dat16 = CONVERT_TO_SHORTPTR(dat8);
     870           0 :         uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
     871             : 
     872             :         do {
     873             :             // Calculate output in batches of 16 pixels
     874           0 :             int32_t j = 0;
     875             :             do {
     876           0 :                 const __m128i src_0 = xx_loadu_128(dat16 + j + 0);
     877           0 :                 const __m128i src_1 = xx_loadu_128(dat16 + j + 8);
     878           0 :                 const __m256i ep_0 = _mm256_cvtepu16_epi32(src_0);
     879           0 :                 const __m256i ep_1 = _mm256_cvtepu16_epi32(src_1);
     880           0 :                 const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
     881           0 :                 const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
     882           0 :                 __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
     883           0 :                 __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
     884             : 
     885           0 :                 if (params->r[0] > 0) {
     886           0 :                     const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[j + 0]), u_0);
     887           0 :                     const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[j + 8]), u_1);
     888           0 :                     v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
     889           0 :                     v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
     890             :                 }
     891             : 
     892           0 :                 if (params->r[1] > 0) {
     893           0 :                     const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[j + 0]), u_0);
     894           0 :                     const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[j + 8]), u_1);
     895           0 :                     v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
     896           0 :                     v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
     897             :                 }
     898             : 
     899           0 :                 const __m256i w_0 = _mm256_srai_epi32(
     900             :                     _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
     901           0 :                 const __m256i w_1 = _mm256_srai_epi32(
     902             :                     _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
     903             : 
     904             :                 // Pack into 16 bits and clamp to [0, 2^bit_depth)
     905             :                 // Note that packing into 16 bits messes up the order of the bits,
     906             :                 // so we use a permute function to correct this
     907           0 :                 const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
     908           0 :                 const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
     909           0 :                 const __m256i res = _mm256_min_epi16(tmp2, max);
     910           0 :                 yy_storeu_256(dst16 + j, res);
     911           0 :                 j += 16;
     912           0 :             } while (j < width);
     913             : 
     914           0 :             dat16 += stride;
     915           0 :             flt0 += width;
     916           0 :             flt1 += width;
     917           0 :             dst16 += dst_stride;
     918           0 :         } while (--i);
     919             :     }
     920       14160 : }

Generated by: LCOV version 1.14