LCOV - code coverage report
Current view: top level - ASM_AVX2 - EbPictureOperators_Inline_AVX2.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 88 101 87.1 %
Date: 2019-11-25 17:38:06 Functions: 4 5 80.0 %

          Line data    Source code
       1             : /*
       2             : * Copyright(c) 2019 Intel Corporation
       3             : * SPDX - License - Identifier: BSD - 2 - Clause - Patent
       4             : */
       5             : 
       6             : #ifndef EbPictureOperators_Inline_AVX2_h
       7             : #define EbPictureOperators_Inline_AVX2_h
       8             : 
       9             : #include <immintrin.h>
      10             : #include "EbDefinitions.h"
      11             : #include "EbMemory_AVX2.h"
      12             : #include "EbPictureOperators_SSE2.h"
      13             : 
      14             : #ifdef __cplusplus
      15             : extern "C" {
      16             : #endif
      17             : 
      18             :     SIMD_INLINE void residual_kernel4_avx2(
      19             :         const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
      20             :         const uint32_t pred_stride, int16_t *residual,
      21             :         const uint32_t residual_stride, const uint32_t area_height)
      22             :     {
      23    20595000 :         const __m256i zero = _mm256_setzero_si256();
      24    20595000 :         uint32_t y = area_height;
      25             : 
      26             :         do {
      27    36208200 :             const __m256i in = load_u8_4x4_avx2(input, input_stride);
      28    36212800 :             const __m256i pr = load_u8_4x4_avx2(pred, pred_stride);
      29    36208700 :             const __m256i in_lo = _mm256_unpacklo_epi8(in, zero);
      30    36208700 :             const __m256i pr_lo = _mm256_unpacklo_epi8(pr, zero);
      31    36208700 :             const __m256i re_lo = _mm256_sub_epi16(in_lo, pr_lo);
      32    36208700 :             const __m128i r0 = _mm256_castsi256_si128(re_lo);
      33    36208700 :             const __m128i r1 = _mm256_extracti128_si256(re_lo, 1);
      34             : 
      35    36208700 :             store_s16_4x2_sse2(r0, residual + 0 * residual_stride, residual_stride);
      36    36210700 :             store_s16_4x2_sse2(r1, residual + 2 * residual_stride, residual_stride);
      37             : 
      38    36203100 :             input += 4 * input_stride;
      39    36203100 :             pred += 4 * pred_stride;
      40    36203100 :             residual += 4 * residual_stride;
      41    36203100 :             y -= 4;
      42    36203100 :         } while (y);
      43    20589800 :     }
      44             : 
      45             :     SIMD_INLINE void residual_kernel8_avx2(
      46             :         const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
      47             :         const uint32_t pred_stride, int16_t *residual,
      48             :         const uint32_t residual_stride, const uint32_t area_height)
      49             :     {
      50    24146600 :         const __m256i zero = _mm256_setzero_si256();
      51    24146600 :         uint32_t y = area_height;
      52             : 
      53             :         do {
      54    68984000 :             const __m256i in = load_u8_8x4_avx2(input, input_stride);
      55    69005500 :             const __m256i pr = load_u8_8x4_avx2(pred, pred_stride);
      56    69008400 :             const __m256i in_lo = _mm256_unpacklo_epi8(in, zero);
      57    69008400 :             const __m256i in_hi = _mm256_unpackhi_epi8(in, zero);
      58    69008400 :             const __m256i pr_lo = _mm256_unpacklo_epi8(pr, zero);
      59    69008400 :             const __m256i pr_hi = _mm256_unpackhi_epi8(pr, zero);
      60    69008400 :             const __m256i r0 = _mm256_sub_epi16(in_lo, pr_lo);
      61    69008400 :             const __m256i r1 = _mm256_sub_epi16(in_hi, pr_hi);
      62             : 
      63    69008400 :             storeu_s16_8x2_avx2(r0, residual + 0 * residual_stride, 2 * residual_stride);
      64    69014000 :             storeu_s16_8x2_avx2(r1, residual + 1 * residual_stride, 2 * residual_stride);
      65             : 
      66    68981100 :             input += 4 * input_stride;
      67    68981100 :             pred += 4 * pred_stride;
      68    68981100 :             residual += 4 * residual_stride;
      69    68981100 :             y -= 4;
      70    68981100 :         } while (y);
      71    24143700 :     }
      72             : 
      73             :     SIMD_INLINE void residual_kernel16_avx2(
      74             :         const uint8_t *input, const uint32_t input_stride, const uint8_t *pred,
      75             :         const uint32_t pred_stride, int16_t *residual,
      76             :         const uint32_t residual_stride, const uint32_t area_height)
      77             :     {
      78    15538700 :         const __m256i zero = _mm256_setzero_si256();
      79    15538700 :         uint32_t y = area_height;
      80             : 
      81             :         do {
      82   117448000 :             const __m256i in0 = loadu_u8_16x2_avx2(input, input_stride);
      83   117490000 :             const __m256i pr0 = loadu_u8_16x2_avx2(pred, pred_stride);
      84   117450000 :             const __m256i in1 = _mm256_permute4x64_epi64(in0, 0xD8);
      85   117450000 :             const __m256i pr1 = _mm256_permute4x64_epi64(pr0, 0xD8);
      86   117450000 :             const __m256i in_lo = _mm256_unpacklo_epi8(in1, zero);
      87   117450000 :             const __m256i in_hi = _mm256_unpackhi_epi8(in1, zero);
      88   117450000 :             const __m256i pr_lo = _mm256_unpacklo_epi8(pr1, zero);
      89   117450000 :             const __m256i pr_hi = _mm256_unpackhi_epi8(pr1, zero);
      90   117450000 :             const __m256i re_lo = _mm256_sub_epi16(in_lo, pr_lo);
      91   117450000 :             const __m256i re_hi = _mm256_sub_epi16(in_hi, pr_hi);
      92             : 
      93             :             _mm256_storeu_si256((__m256i*)(residual + 0 * residual_stride), re_lo);
      94   117450000 :             _mm256_storeu_si256((__m256i*)(residual + 1 * residual_stride), re_hi);
      95   117450000 :             input += 2 * input_stride;
      96   117450000 :             pred += 2 * pred_stride;
      97   117450000 :             residual += 2 * residual_stride;
      98   117450000 :             y -= 2;
      99   117450000 :         } while (y);
     100    15540600 :     }
     101             : 
     102  4390600000 :     static INLINE void Distortion_AVX2_INTRIN(const __m256i input,
     103             :         const __m256i recon, __m256i *const sum) {
     104  8781210000 :         const __m256i in = _mm256_unpacklo_epi8(input, _mm256_setzero_si256());
     105  8781210000 :         const __m256i re = _mm256_unpacklo_epi8(recon, _mm256_setzero_si256());
     106  4390600000 :         const __m256i diff = _mm256_sub_epi16(in, re);
     107  4390600000 :         const __m256i dist = _mm256_madd_epi16(diff, diff);
     108  4390600000 :         *sum = _mm256_add_epi32(*sum, dist);
     109  4390600000 :     }
     110             : 
     111  5675640000 :     static INLINE void SpatialFullDistortionKernel16_AVX2_INTRIN(
     112             :         const uint8_t *const input, const uint8_t *const recon,
     113             :         __m256i *const sum)
     114             :     {
     115  5675640000 :         const __m128i in8 = _mm_loadu_si128((__m128i *)input);
     116  5675640000 :         const __m128i re8 = _mm_loadu_si128((__m128i *)recon);
     117  5675640000 :         const __m256i in16 = _mm256_cvtepu8_epi16(in8);
     118  5675640000 :         const __m256i re16 = _mm256_cvtepu8_epi16(re8);
     119  5675640000 :         const __m256i diff = _mm256_sub_epi16(in16, re16);
     120  5675640000 :         const __m256i dist = _mm256_madd_epi16(diff, diff);
     121 11351300000 :         *sum = _mm256_add_epi32(*sum, dist);
     122  5675640000 :     }
     123             : 
     124           0 :     static INLINE void SpatialFullDistortionKernel32Leftover_AVX2_INTRIN(
     125             :         const uint8_t *const input, const uint8_t *const recon, __m256i *const sum0,
     126             :         __m256i *const sum1)
     127             :     {
     128           0 :         const __m256i in = _mm256_loadu_si256((__m256i *)input);
     129           0 :         const __m256i re = _mm256_loadu_si256((__m256i *)recon);
     130           0 :         const __m256i max = _mm256_max_epu8(in, re);
     131           0 :         const __m256i min = _mm256_min_epu8(in, re);
     132           0 :         const __m256i diff = _mm256_sub_epi8(max, min);
     133           0 :         const __m256i diff_L = _mm256_unpacklo_epi8(diff, _mm256_setzero_si256());
     134           0 :         const __m256i diff_H = _mm256_unpackhi_epi8(diff, _mm256_setzero_si256());
     135           0 :         const __m256i dist_L = _mm256_madd_epi16(diff_L, diff_L);
     136           0 :         const __m256i dist_H = _mm256_madd_epi16(diff_H, diff_H);
     137           0 :         *sum0 = _mm256_add_epi32(*sum0, dist_L);
     138           0 :         *sum1 = _mm256_add_epi32(*sum1, dist_H);
     139           0 :     }
     140             : 
     141  5059830000 :     static INLINE void SpatialFullDistortionKernel32_AVX2_INTRIN(
     142             :         const uint8_t *const input, const uint8_t *const recon, __m256i *const sum)
     143             :     {
     144  5059830000 :         const __m256i in = _mm256_loadu_si256((__m256i *)input);
     145  5059830000 :         const __m256i re = _mm256_loadu_si256((__m256i *)recon);
     146  5059830000 :         const __m256i max = _mm256_max_epu8(in, re);
     147  5059830000 :         const __m256i min = _mm256_min_epu8(in, re);
     148  5059830000 :         const __m256i diff = _mm256_sub_epi8(max, min);
     149 10119700000 :         const __m256i diff_L = _mm256_unpacklo_epi8(diff, _mm256_setzero_si256());
     150 10119700000 :         const __m256i diff_H = _mm256_unpackhi_epi8(diff, _mm256_setzero_si256());
     151  5059830000 :         const __m256i dist_L = _mm256_madd_epi16(diff_L, diff_L);
     152  5059830000 :         const __m256i dist_H = _mm256_madd_epi16(diff_H, diff_H);
     153  5059830000 :         const __m256i dist = _mm256_add_epi32(dist_L, dist_H);
     154  5059830000 :         *sum = _mm256_add_epi32(*sum, dist);
     155  5059830000 :     }
     156             : 
     157  1194270300 :     static INLINE int32_t Hadd32_AVX2_INTRIN(const __m256i src) {
     158  1194270300 :         const __m128i src_L = _mm256_extracti128_si256(src, 0);
     159  1194270300 :         const __m128i src_H = _mm256_extracti128_si256(src, 1);
     160  1194270300 :         const __m128i sum = _mm_add_epi32(src_L, src_H);
     161             : 
     162  1194270300 :         return Hadd32_SSE2_INTRIN(sum);
     163             :     }
     164             : 
     165             : #ifdef __cplusplus
     166             : }
     167             : #endif
     168             : 
     169             : #endif // EbPictureOperators_Inline_AVX2_h

Generated by: LCOV version 1.14