LCOV - code coverage report
Current view: top level - ASM_AVX2 - variance_avx2.c (source / functions)
Test: coverage.info
Date: 2019-11-25 17:38:06

                 Hit    Total    Coverage
  Lines:         132      232      56.9 %
  Functions:      32       43      74.4 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include "EbDefinitions.h"
      13             : #include <immintrin.h>
      14             : #include "aom_dsp_rtcd.h"
      15             : #include "EbVariance_SSE2.h"
      16             : 
      17             : 
      18             : // Alpha blending with alpha values from the range [0, 256], where 256
      19             : // means use the first input and 0 means use the second input.
      20             : #define AOM_BLEND_A256_ROUND_BITS 8
      21             : #define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS)  // 256
      22             : 
      23             : #define AOM_BLEND_A256(a, v0, v1)                                          \
      24             :   ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
      25             :                      AOM_BLEND_A256_ROUND_BITS)
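
    The blend macro above is a weighted average with an 8-bit alpha. A minimal scalar
    sketch of what it expands to is shown here (illustration only; the helper names are
    hypothetical and it assumes the project's ROUND_POWER_OF_TWO rounds half up):

        /* Illustration only, not part of variance_avx2.c. */
        static int32_t round_power_of_two(int32_t value, int32_t n) {
            return (value + (1 << (n - 1))) >> n;  /* round-half-up shift */
        }

        static uint8_t blend_a256(int32_t a, uint8_t v0, uint8_t v1) {
            /* a == 256 selects v0, a == 0 selects v1, values in between blend. */
            return (uint8_t)round_power_of_two(a * v0 + (256 - a) * v1, 8);
        }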
      26             : 
      27      793944 : static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
      28     1587890 :     return _mm_add_epi16(_mm256_castsi256_si128(val),
      29      793944 :         _mm256_extractf128_si256(val, 1));
      30             : }
      31             : 
      32     3427330 : static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
      33     6854670 :     return _mm_add_epi32(_mm256_castsi256_si128(val),
      34     3427330 :         _mm256_extractf128_si256(val, 1));
      35             : }
      36             : 
      37    20677700 : static INLINE void variance_kernel_no_sum_avx2(const __m256i src, const __m256i ref,
      38             :     __m256i *const sse) {
      39    20677700 :     const __m256i adj_sub = _mm256_set1_epi16((short)0xff01);  // (1,-1)
      40             : 
      41             :     // unpack into pairs of source and reference values
      42    20677700 :     const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
      43    20677700 :     const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
      44             : 
      45             :     // subtract adjacent elements using src*1 + ref*-1
      46    20677700 :     const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
      47    20677700 :     const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
      48    20677700 :     const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
      49    20677700 :     const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
      50             : 
      51             :     // add to the running totals
      52    20677700 :     *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
      53    20677700 : }
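
    For reference, a scalar sketch of what one kernel call accumulates (illustration
    only; the function name below is hypothetical). The maddubs/madd pair computes,
    per byte lane, the difference src - ref and then its square, so each call adds 32
    squared differences to the running SSE:

        /* Illustration only: scalar equivalent of variance_kernel_no_sum_avx2
         * for one 32-byte block. */
        static void variance_kernel_no_sum_c(const uint8_t *src, const uint8_t *ref,
                                             uint32_t *sse) {
            for (int i = 0; i < 32; i++) {
                const int diff = src[i] - ref[i];  /* src * 1 + ref * (-1) */
                *sse += (uint32_t)(diff * diff);   /* squared difference   */
            }
        }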
      54             : 
      55     2590120 : static INLINE void variance_final_from_32bit_no_sum_avx2(__m256i vsse,
      56             :     uint32_t *const sse) {
      57             :     // extract the low lane and add it to the high lane
      58     2590120 :     const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
      59     2590060 :     const __m128i zero = _mm_setzero_si128();
      60             : 
      61             :     // unpack sse and sum registers and add
      62     2590060 :     const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, zero);
      63     2590060 :     const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, zero);
      64     2590060 :     const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
      65             : 
      66             :     // perform the final summation and extract the results
      67     5180130 :     const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
      68     2590060 :     *((int32_t *)sse) = _mm_cvtsi128_si32(res);
      69     2590060 : }
      70             : 
      71             : // handle blocks of at most 512 pixels
      72     2590150 : static INLINE void variance_final_512_no_sum_avx2(__m256i vsse,
      73             :     uint32_t *const sse) {
      74             :     // extract the low lane and add it to the high lane
      75     2590150 :     variance_final_from_32bit_no_sum_avx2(vsse, sse);
      76     2589590 : }
      77             : 
      78             : // handle 1024 pixels (32x32, 16x64, 64x16)
      79           0 : static INLINE void variance_final_1024_no_sum_avx2(__m256i vsse,
      80             :     uint32_t *const sse) {
      81             :     // extract the low lane and add it to the high lane
      82           0 :     variance_final_from_32bit_no_sum_avx2(vsse, sse);
      83           0 : }
      84             : 
      85       28067 : static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
      86       28067 :     const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
      87             :     const __m256i sum_hi =
      88       56134 :         _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
      89       28067 :     return _mm256_add_epi32(sum_lo, sum_hi);
      90             : }
      91             : 
      92             : // handle 2048 pixels (32x64, 64x32)
      93           0 : static INLINE void variance_final_2048_no_sum_avx2(__m256i vsse,
      94             :     uint32_t *const sse) {
      95           0 :     variance_final_from_32bit_no_sum_avx2(vsse, sse);
      96           0 : }
      97             : 
      98    20689000 : static INLINE void variance16_kernel_no_sum_avx2(
      99             :     const uint8_t *const src, const int32_t src_stride, const uint8_t *const ref,
     100             :     const int32_t ref_stride, __m256i *const sse) {
     101    20689000 :     const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
     102    41378000 :     const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
     103    20689000 :     const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
     104    41378000 :     const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
     105    20689000 :     const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
     106    20689000 :     const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
     107    20689000 :     variance_kernel_no_sum_avx2(s, r, sse);
     108    20683500 : }
     109             : 
     110           0 : static INLINE void variance32_kernel_no_sum_avx2(const uint8_t *const src,
     111             :     const uint8_t *const ref,
     112             :     __m256i *const sse) {
     113           0 :     const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
     114           0 :     const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
     115           0 :     variance_kernel_no_sum_avx2(s, r, sse);
     116           0 : }
     117             : 
     118     2589790 : static INLINE void variance16_no_sum_avx2(const uint8_t *src,
     119             :     const int32_t src_stride, const uint8_t *ref, const int32_t ref_stride,
     120             :     const int32_t h, __m256i *const vsse) {
     121    23273100 :     for (int32_t i = 0; i < h; i += 2) {
     122    20679200 :         variance16_kernel_no_sum_avx2(src, src_stride, ref, ref_stride, vsse);
     123    20683300 :         src += 2 * src_stride;
     124    20683300 :         ref += 2 * ref_stride;
     125             :     }
     126     2593840 : }
     127             : 
     128           0 : static INLINE void variance32_no_sum_avx2(const uint8_t *src, const int32_t src_stride,
     129             :     const uint8_t *ref, const int32_t ref_stride,
     130             :     const int32_t h, __m256i *const vsse) {
     131           0 :     for (int32_t i = 0; i < h; i++) {
     132           0 :         variance32_kernel_no_sum_avx2(src, ref, vsse);
     133           0 :         src += src_stride;
     134           0 :         ref += ref_stride;
     135             :     }
     136           0 : }
     137             : 
     138           0 : static INLINE void variance64_no_sum_avx2(const uint8_t *src, const int32_t src_stride,
     139             :     const uint8_t *ref, const int32_t ref_stride,
     140             :     const int32_t h, __m256i *const vsse) {
     141           0 :     for (int32_t i = 0; i < h; i++) {
     142           0 :         variance32_kernel_no_sum_avx2(src + 0, ref + 0, vsse);
     143           0 :         variance32_kernel_no_sum_avx2(src + 32, ref + 32, vsse);
     144           0 :         src += src_stride;
     145           0 :         ref += ref_stride;
     146             :     }
     147           0 : }
     148             : 
     149           0 : static INLINE void variance128_no_sum_avx2(const uint8_t *src, const int32_t src_stride,
     150             :     const uint8_t *ref, const int32_t ref_stride,
     151             :     const int32_t h, __m256i *const vsse) {
     152           0 :     for (int32_t i = 0; i < h; i++) {
     153           0 :         variance32_kernel_no_sum_avx2(src + 0, ref + 0, vsse);
     154           0 :         variance32_kernel_no_sum_avx2(src + 32, ref + 32, vsse);
     155           0 :         variance32_kernel_no_sum_avx2(src + 64, ref + 64, vsse);
     156           0 :         variance32_kernel_no_sum_avx2(src + 96, ref + 96, vsse);
     157           0 :         src += src_stride;
     158           0 :         ref += ref_stride;
     159             :     }
     160           0 : }
     161             : 
     162             : #define AOM_VAR_NO_LOOP_NO_SUM_AVX2(bw, bh, bits, max_pixel)                         \
     163             :   void eb_aom_variance##bw##x##bh##_no_sum_avx2(                                \
     164             :       const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, \
     165             :       uint32_t *sse) {                                                    \
     166             :     __m256i vsse = _mm256_setzero_si256();                                    \
     167             :     variance##bw##_no_sum_avx2(src, src_stride, ref, ref_stride, bh, &vsse);  \
     168             :     variance_final_##max_pixel##_no_sum_avx2(vsse,  sse);       \
     169             :   }
     170             : 
     171     5179510 : AOM_VAR_NO_LOOP_NO_SUM_AVX2(16, 16, 8, 512);
     172             : 
     173     2589580 : uint32_t eb_aom_mse16x16_avx2(const uint8_t *src, int32_t src_stride,
     174             :     const uint8_t *ref, int32_t ref_stride,
     175             :     uint32_t *sse) {
     176     2589580 :     eb_aom_variance16x16_no_sum_avx2(src, src_stride, ref, ref_stride, sse);
     177     2589600 :     return *sse;
     178             : }
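
    A minimal usage sketch (illustration only; the wrapper name is hypothetical). The
    returned value equals *sse, i.e. the raw sum of squared errors over the 16x16
    block, not that sum divided by the block's 256 pixels:

        /* Illustration only: SSE of one 16x16 block via the function above. */
        static uint32_t block_sse_16x16(const uint8_t *src, int32_t src_stride,
                                        const uint8_t *ref, int32_t ref_stride) {
            uint32_t sse;
            return eb_aom_mse16x16_avx2(src, src_stride, ref, ref_stride, &sse);
        }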
     179             : 
     180           0 : void highbd_variance64_avx2(const uint8_t *a8, int32_t a_stride,
     181             :     const uint8_t *b8, int32_t b_stride, int32_t w, int32_t h,
     182             :     uint64_t *sse) {
     183           0 :     const uint8_t *a = a8;
     184           0 :     const uint8_t *b = b8;
     185             : 
     186           0 :     if (w == 4) {
     187           0 :         __m128i vsse = _mm_setzero_si128();
     188             :         uint32_t tsse;
     189           0 :         variance4_no_sum_sse2(a8, a_stride, b8, b_stride, h, &vsse);
     190           0 :         variance_final_128_pel_no_sum_sse2(vsse, &tsse);
     191           0 :         *sse = tsse;
     192             :     }
     193           0 :     else if (w == 8) {
     194           0 :         __m128i vsse = _mm_setzero_si128();
     195             :         uint32_t tsse;
     196           0 :         variance8_no_sum_sse2(a8, a_stride, b8, b_stride, h, &vsse);
     197           0 :         variance_final_256_pel_no_sum_sse2(vsse, &tsse);
     198           0 :         *sse = tsse;
     199             :     }
     200           0 :     else if (w == 16) {
     201           0 :         __m256i vsse = _mm256_setzero_si256();
     202             :         uint32_t tsse;
     203           0 :         variance16_no_sum_avx2(a8, a_stride, b8, b_stride, h, &vsse);
     204           0 :         variance_final_1024_no_sum_avx2(vsse, &tsse);
     205           0 :         *sse = tsse;
     206             :     }
     207           0 :     else if (w == 32) {
     208           0 :         if (h <= 64) {
     209           0 :             __m256i vsse = _mm256_setzero_si256();
     210             :             uint32_t tsse;
     211           0 :             variance32_no_sum_avx2(a8, a_stride, b8, b_stride, h, &vsse);
     212           0 :             variance_final_2048_no_sum_avx2(vsse, &tsse);
     213           0 :             *sse = tsse;
     214             :         }
     215             :         else {
     216           0 :             __m256i vsse = _mm256_setzero_si256();
     217             :             uint32_t tsse;
     218           0 :             variance32_no_sum_avx2(a8, a_stride, b8, b_stride, 64, &vsse);
     219           0 :             variance32_no_sum_avx2(a8 + 64 * a_stride, a_stride, b8 + 64 * b_stride,
     220             :                 b_stride, h - 64, &vsse);
     221           0 :             variance_final_from_32bit_no_sum_avx2(vsse, &tsse);
     222           0 :             *sse = tsse;
     223             :         }
     224             :     }
     225           0 :     else if (w == 64) {
     226           0 :         if (h <= 32) {
     227           0 :             __m256i vsse = _mm256_setzero_si256();
     228             :             uint32_t tsse;
     229           0 :             variance64_no_sum_avx2(a8, a_stride, b8, b_stride, h, &vsse);
     230           0 :             variance_final_2048_no_sum_avx2(vsse, &tsse);
     231           0 :             *sse = tsse;
     232             :         }
     233             :         else {
     234           0 :             __m256i vsse = _mm256_setzero_si256();
     235             :             uint32_t tsse;
     236             : 
     237           0 :             int32_t i = 0;
     238             :             do {
     239           0 :                 variance64_no_sum_avx2(a8, a_stride, b8, b_stride, 32, &vsse);
     240           0 :                 a8 += 32 * a_stride;
     241           0 :                 b8 += 32 * b_stride;
     242           0 :             } while (++i < (h / 32));
     243           0 :             variance_final_from_32bit_no_sum_avx2(vsse, &tsse);
     244           0 :             *sse = tsse;
     245             :         }
     246             :     }
     247           0 :     else if (w == 128) {
     248           0 :         __m256i vsse = _mm256_setzero_si256();
     249             :         uint32_t tsse;
     250             : 
     251           0 :         int32_t i = 0;
     252             :         do {
     253           0 :             variance128_no_sum_avx2(a8, a_stride, b8, b_stride, 16, &vsse);
     254           0 :             a8 += 16 * a_stride;
     255           0 :             b8 += 16 * b_stride;
     256           0 :         } while (++i < (h / 16));
     257           0 :         variance_final_from_32bit_no_sum_avx2(vsse, &tsse);
     258           0 :         *sse = tsse;
     259             :     }
     260             :     else
     261           0 :         highbd_variance64_c(a, a_stride, b, b_stride, w, h, sse);
     262             : #ifdef _WIN32
     263             :     // Add this redundant instruction to work around a Visual Studio compiler
     264             :     // bug, which falsely loads a 64-bit intermediate result into *sse in
     265             :     // variance_final_from_32bit_sum_avx2() instead of the 32-bit result we
     266             :     // wanted. ANDing *sse restores the correct 32-bit result.
     267             :     // No overflow happens here, since for the largest 8-bit 128x128 block,
     268             :     // *sse is at most 255 * 255 * 128 * 128, i.e., 0x000000003F804000L.
     269             :     *sse &= 0x00000000FFFFFFFFL;
     270             : #endif
     271           0 : }
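
    Within this file the function takes 8-bit sample pointers (a8/b8) and simply
    dispatches on the block width. A hedged usage sketch (the wrapper name is
    hypothetical):

        /* Illustration only: SSE over a 32x32 block of 8-bit samples. */
        static uint64_t block_sse_32x32(const uint8_t *src, int32_t src_stride,
                                        const uint8_t *ref, int32_t ref_stride) {
            uint64_t sse;
            highbd_variance64_avx2(src, src_stride, ref, ref_stride, 32, 32, &sse);
            return sse;
        }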
     272             : 
     273      815633 : static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
     274             :     unsigned int *const sse) {
     275             :     // extract the low lane and add it to the high lane
     276      815633 :     const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
     277             : 
     278             :     // unpack sse and sum registers and add
     279      815557 :     const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
     280      815557 :     const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
     281      815557 :     const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
     282             : 
     283             :     // perform the final summation and extract the results
     284     1631110 :     const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
     285      815557 :     *((int *)sse) = _mm_cvtsi128_si32(res);
     286      815557 :     return _mm_extract_epi32(res, 1);
     287             : }
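
    The unpack/shift/add sequence above is a horizontal reduction. Conceptually
    (illustration only; the name is hypothetical) it sums the SSE lanes into *sse and
    returns the summed signed sum, once vsse has been folded from eight to four
    32-bit lanes:

        /* Illustration only: scalar view of variance_final_from_32bit_sum_avx2. */
        static int final_from_32bit_sum_c(const int32_t sse_lanes[4],
                                          const int32_t sum_lanes[4],
                                          unsigned int *sse) {
            int32_t sse_total = 0, sum_total = 0;
            for (int i = 0; i < 4; i++) {
                sse_total += sse_lanes[i];
                sum_total += sum_lanes[i];
            }
            *sse = (unsigned int)sse_total;  /* total SSE written to *sse */
            return sum_total;                /* total signed sum returned */
        }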
     288             : 
     289             : // handle blocks of at most 512 pixels
     290      734495 : static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
     291             :     unsigned int *const sse) {
     292             :     // extract the low lane and add it to the high lane
     293      734495 :     const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
     294     1468740 :     const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
     295      734371 :     const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
     296      734371 :     return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
     297             : }
     298             : 
     299             : // handle 1024 pixels (32x32, 16x64, 64x16)
     300       59596 : static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
     301             :     unsigned int *const sse) {
     302             :     // extract the low lane and add it to the high lane
     303       59596 :     const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
     304             :     const __m128i vsum_64 =
     305      119188 :         _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
     306       59594 :             _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
     307       59594 :     return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
     308             : }
     309             : 
     310       15284 : static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
     311             :     unsigned int *const sse) {
     312       15284 :     vsum = sum_to_32bit_avx2(vsum);
     313       15284 :     const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
     314       15284 :     return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
     315             : }
     316             : 
     317     9168220 : static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
     318             :     __m256i *const sse,
     319             :     __m256i *const sum) {
     320     9168220 :     const __m256i adj_sub = _mm256_set1_epi16(0xff01);  // (1,-1)
     321             : 
     322             :     // unpack into pairs of source and reference values
     323     9168220 :     const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
     324     9168220 :     const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
     325             : 
     326             :     // subtract adjacent elements using src*1 + ref*-1
     327     9168220 :     const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
     328     9168220 :     const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
     329     9168220 :     const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
     330     9168220 :     const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
     331             : 
     332             :     // add to the running totals
     333    18336400 :     *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
     334     9168220 :     *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
     335     9168220 : }
     336             : 
     337     5278350 : static INLINE void variance16_kernel_avx2(
     338             :     const uint8_t *const src, const int src_stride, const uint8_t *const ref,
     339             :     const int ref_stride, __m256i *const sse, __m256i *const sum) {
     340     5278350 :     const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
     341    10556700 :     const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
     342     5278350 :     const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
     343    10556700 :     const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
     344     5278350 :     const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
     345     5278350 :     const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
     346     5278350 :     variance_kernel_avx2(s, r, sse, sum);
     347     5298790 : }
     348             : 
     349     3901550 : static INLINE void variance32_kernel_avx2(const uint8_t *const src,
     350             :     const uint8_t *const ref,
     351             :     __m256i *const sse,
     352             :     __m256i *const sum) {
     353     3901550 :     const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
     354     3901550 :     const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
     355     3901550 :     variance_kernel_avx2(s, r, sse, sum);
     356     3905580 : }
     357             : 
     358      690824 : static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
     359             :     const uint8_t *ref, const int ref_stride,
     360             :     const int h, __m256i *const vsse,
     361             :     __m256i *const vsum) {
     362      690824 :     *vsum = _mm256_setzero_si256();
     363             : 
     364     5987560 :     for (int i = 0; i < h; i += 2) {
     365     5282870 :         variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
     366     5296740 :         src += 2 * src_stride;
     367     5296740 :         ref += 2 * ref_stride;
     368             :     }
     369      704690 : }
     370             : 
     371       98455 : static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
     372             :     const uint8_t *ref, const int ref_stride,
     373             :     const int h, __m256i *const vsse,
     374             :     __m256i *const vsum) {
     375       98455 :     *vsum = _mm256_setzero_si256();
     376             : 
     377     2341720 :     for (int i = 0; i < h; i++) {
     378     2243270 :         variance32_kernel_avx2(src, ref, vsse, vsum);
     379     2243260 :         src += src_stride;
     380     2243260 :         ref += ref_stride;
     381             :     }
     382       98448 : }
     383             : 
     384       31984 : static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
     385             :     const uint8_t *ref, const int ref_stride,
     386             :     const int h, __m256i *const vsse,
     387             :     __m256i *const vsum) {
     388       31984 :     *vsum = _mm256_setzero_si256();
     389             : 
     390      865303 :     for (int i = 0; i < h; i++) {
     391      833222 :         variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
     392      834068 :         variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
     393      833319 :         src += src_stride;
     394      833319 :         ref += ref_stride;
     395             :     }
     396       32081 : }
     397             : 
     398           0 : static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
     399             :     const uint8_t *ref, const int ref_stride,
     400             :     const int h, __m256i *const vsse,
     401             :     __m256i *const vsum) {
     402           0 :     *vsum = _mm256_setzero_si256();
     403             : 
     404           0 :     for (int i = 0; i < h; i++) {
     405           0 :         variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
     406           0 :         variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
     407           0 :         variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
     408           0 :         variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
     409           0 :         src += src_stride;
     410           0 :         ref += ref_stride;
     411             :     }
     412           0 : }
     413             : 
     414             : #define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel)                         \
     415             :   unsigned int eb_aom_variance##bw##x##bh##_avx2(                                \
     416             :       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
     417             :       unsigned int *sse) {                                                    \
     418             :     __m256i vsse = _mm256_setzero_si256();                                    \
     419             :     __m256i vsum;                                                             \
     420             :     variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
     421             :     const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse);       \
     422             :     return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
     423             :   }
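
    For orientation, an approximate expansion of one instantiation below (this is a
    paraphrase of the macro, not additional code in the file). AOM_VAR_NO_LOOP_AVX2(16,
    16, 8, 512) yields roughly:

        unsigned int eb_aom_variance16x16_avx2(const uint8_t *src, int src_stride,
                                               const uint8_t *ref, int ref_stride,
                                               unsigned int *sse) {
            __m256i vsse = _mm256_setzero_si256();
            __m256i vsum;
            variance16_avx2(src, src_stride, ref, ref_stride, 16, &vsse, &vsum);
            const int sum = variance_final_512_avx2(vsse, vsum, sse);
            /* variance = SSE - sum^2 / (16 * 16); the division is the >> 8 */
            return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
        }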
     424             : 
     425      157744 : AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
     426      109710 : AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
     427     1046330 : AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
     428       42842 : AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
     429       25292 : AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
     430             : 
     431       68046 : AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
     432       42958 : AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
     433       70076 : AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
     434       15832 : AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
     435             : 
     436       23814 : AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
     437       14736 : AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
     438             : 
     439             : #define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh)                                   \
     440             :   unsigned int eb_aom_variance##bw##x##bh##_avx2(                                \
     441             :       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
     442             :       unsigned int *sse) {                                                    \
     443             :     __m256i vsse = _mm256_setzero_si256();                                    \
     444             :     __m256i vsum = _mm256_setzero_si256();                                    \
     445             :     for (int i = 0; i < (bh / uh); i++) {                                     \
     446             :       __m256i vsum16;                                                         \
     447             :       variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse,        \
     448             :                           &vsum16);                                           \
     449             :       vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));               \
     450             :       src += uh * src_stride;                                                 \
     451             :       ref += uh * ref_stride;                                                 \
     452             :     }                                                                         \
     453             :     const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);                     \
     454             :     const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);  \
     455             :     return *sse - (unsigned int)(((int64_t)sum * sum) >> bits);               \
     456             :   }
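
    Both macros compute the same quantity, variance = SSE - sum^2 / (bw * bh); the
    looped variant additionally widens each uh-row chunk's 16-bit sums to 32 bits
    (sum_to_32bit_avx2) before accumulating, which keeps the 16-bit lanes from
    overflowing on the taller blocks. A scalar reference of the arithmetic
    (illustration only; the name is hypothetical), assuming bits == log2(bw * bh) as
    in the instantiations:

        /* Illustration only: scalar reference for the returned variance. */
        static unsigned int variance_ref_c(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride,
                                           int w, int h, int bits,
                                           unsigned int *sse) {
            int64_t sum = 0;
            uint64_t sse64 = 0;
            for (int y = 0; y < h; y++) {
                for (int x = 0; x < w; x++) {
                    const int diff =
                        src[y * src_stride + x] - ref[y * ref_stride + x];
                    sum += diff;
                    sse64 += (uint64_t)(diff * diff);
                }
            }
            *sse = (unsigned int)sse64;
            return *sse - (unsigned int)((sum * sum) >> bits);
        }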
     457             : 
     458       31746 : AOM_VAR_LOOP_AVX2(64, 64, 12, 32);    // 64x32 * ( 64/32)
     459           0 : AOM_VAR_LOOP_AVX2(64, 128, 13, 32);   // 64x32 * (128/32)
     460           0 : AOM_VAR_LOOP_AVX2(128, 64, 13, 16);   // 128x16 * ( 64/16)
     461           0 : AOM_VAR_LOOP_AVX2(128, 128, 14, 16);  // 128x16 * (128/16)

Generated by: LCOV version 1.14